diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
index d43375a692..c48114e65c 100644
--- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
@@ -37,6 +37,7 @@
 #endif
 // Dataset operator headers (in alphabetical order)
 #include "minddata/dataset/engine/datasetops/batch_op.h"
+#include "minddata/dataset/engine/datasetops/bucket_batch_by_length_op.h"
 #include "minddata/dataset/engine/datasetops/build_vocab_op.h"
 #include "minddata/dataset/engine/datasetops/concat_op.h"
 #include "minddata/dataset/engine/datasetops/map_op/map_op.h"
@@ -280,6 +281,25 @@ std::shared_ptr<BatchDataset> Dataset::Batch(int32_t batch_size, bool drop_remai
   return ds;
 }
 
+// Function to create a BucketBatchByLength dataset
+std::shared_ptr<BucketBatchByLengthDataset> Dataset::BucketBatchByLength(
+  const std::vector<std::string> &column_names, const std::vector<int32_t> &bucket_boundaries,
+  const std::vector<int32_t> &bucket_batch_sizes, TensorRow (*element_length_function)(TensorRow),
+  const std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>> &pad_info, bool pad_to_bucket_boundary,
+  bool drop_remainder) {
+  auto ds = std::make_shared<BucketBatchByLengthDataset>(column_names, bucket_boundaries, bucket_batch_sizes,
+                                                         element_length_function, pad_info, pad_to_bucket_boundary,
+                                                         drop_remainder);
+
+  if (!ds->ValidateParams()) {
+    return nullptr;
+  }
+
+  ds->children.push_back(shared_from_this());
+
+  return ds;
+}
+
 #ifndef ENABLE_ANDROID
 // Function to create a Vocab from dataset
 std::shared_ptr<Vocab> Dataset::BuildVocab(const std::vector<std::string> &columns,
@@ -1590,6 +1610,79 @@ bool BatchDataset::ValidateParams() {
   return true;
 }
 
+BucketBatchByLengthDataset::BucketBatchByLengthDataset(
+  const std::vector<std::string> &column_names, const std::vector<int32_t> &bucket_boundaries,
+  const std::vector<int32_t> &bucket_batch_sizes, TensorRow (*element_length_function)(TensorRow),
+  const std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>> &pad_info, bool pad_to_bucket_boundary,
+  bool drop_remainder)
+    : column_names_(column_names),
+      bucket_boundaries_(bucket_boundaries),
+      bucket_batch_sizes_(bucket_batch_sizes),
+      element_length_function_(element_length_function),
+      pad_info_(pad_info),
+      pad_to_bucket_boundary_(pad_to_bucket_boundary),
+      drop_remainder_(drop_remainder) {}
+
+std::vector<std::shared_ptr<DatasetOp>> BucketBatchByLengthDataset::Build() {
+  // A vector containing shared pointer to the Dataset Ops that this object will create
+  std::vector<std::shared_ptr<DatasetOp>> node_ops;
+
+  std::shared_ptr<TensorOp> c_func;
+  if (element_length_function_ != nullptr) {
+    c_func = std::make_shared<CFuncOp>(element_length_function_);
+  } else {
+    c_func = nullptr;
+  }
+  node_ops.push_back(std::make_shared<BucketBatchByLengthOp>(column_names_, bucket_boundaries_, bucket_batch_sizes_,
+                                                             c_func, pad_info_, pad_to_bucket_boundary_,
+                                                             drop_remainder_, connector_que_size_));
+  return node_ops;
+}
+
+bool BucketBatchByLengthDataset::ValidateParams() {
+  if (element_length_function_ == nullptr && column_names_.size() != 1) {
+    MS_LOG(ERROR) << "BucketBatchByLength: If element_length_function is not specified, exactly one column name "
+                     "should be passed.";
+    return false;
+  }
+
+  // Check bucket_boundaries: must be positive and strictly increasing
+  if (bucket_boundaries_.empty()) {
+    MS_LOG(ERROR) << "BucketBatchByLength: bucket_boundaries cannot be empty.";
+    return false;
+  }
+  for (int i = 0; i < bucket_boundaries_.size(); i++) {
+    if (bucket_boundaries_[i] <= 0) {
+      MS_LOG(ERROR)
+        << "BucketBatchByLength: bucket_boundaries must only contain positive numbers. However, the element at index: "
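For orientation, here is a minimal usage sketch of the new C++ API defined above. The `MyLengthFn` helper and the bucket values are illustrative only, not part of this patch; the tensor construction mirrors what the patch itself does in its tests.

```cpp
// Illustrative only. A length function must match TensorRow (*)(TensorRow) and
// return a TensorRow whose first tensor holds a single int32_t.
mindspore::dataset::TensorRow MyLengthFn(mindspore::dataset::TensorRow input) {
  using namespace mindspore::dataset;
  TensorRow output;
  std::shared_ptr<Tensor> length;
  Tensor::CreateEmpty(TensorShape({1}), DataType(DataType::DE_INT32), &length);
  // Report the first dimension of the first column as the element's length.
  length->SetItemAt({0}, static_cast<int32_t>(input[0]->shape()[0]));
  output.push_back(length);
  return output;
}

// Buckets [0, 10), [10, 20), [20, inf) with batch sizes 8, 8, 4; the call
// returns nullptr if ValidateParams() rejects the arguments:
// auto bucketed = ds->BucketBatchByLength({"image"}, {10, 20}, {8, 8, 4}, &MyLengthFn);
```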
diff --git a/mindspore/ccsrc/minddata/dataset/api/python/de_pipeline.cc b/mindspore/ccsrc/minddata/dataset/api/python/de_pipeline.cc
index 4b10aa5057..443c669f4a 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/de_pipeline.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/de_pipeline.cc
@@ -952,7 +952,9 @@ Status DEPipeline::ParseBucketBatchByLengthOp(const py::dict &args, std::shared_
       (void)builder->SetBucketBatchSizes(ToIntVector(value));
     }
     if (key == "element_length_function") {
-      (void)builder->SetElementLengthFunction(value.cast<py::function>());
+      std::shared_ptr<TensorOp> py_func;
+      py_func = std::make_shared<PyFuncOp>(value.cast<py::function>(), DataType::DE_INT32);
+      (void)builder->SetElementLengthFunction(py_func);
     }
     if (key == "pad_info") {
       PadInfo pad_info;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.cc
index 971f14c669..9f4866f92e 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.cc
@@ -85,7 +85,7 @@ Status BucketBatchByLengthOp::Builder::Build(std::shared_ptr<BucketBatchByLengt
 BucketBatchByLengthOp::BucketBatchByLengthOp(std::vector<std::string> length_dependent_columns,
                                              std::vector<int32_t> bucket_boundaries,
                                              std::vector<int32_t> bucket_batch_sizes,
-                                             py::function element_length_function, PadInfo pad_info,
+                                             std::shared_ptr<TensorOp> element_length_function, PadInfo pad_info,
                                              bool pad_to_bucket_boundary, bool drop_remainder, int32_t op_connector_size)
     : PipelineOp(op_connector_size),
@@ -155,34 +155,15 @@ Status BucketBatchByLengthOp::ObtainElementLength(int32_t *out_element_length, T
   // call pyfunc here if given pyfunc, otherwise return 0th dimension of shape of
   // the single column specified in length_dependent_columns_
   if (element_length_function_) {
-    py::gil_scoped_acquire gil_acquire;
-    if (Py_IsInitialized() == 0) {
-      return Status(StatusCode::kPythonInterpreterFailure, "Python Interpreter is finalized");
-    }
-    try {
-      size_t number_of_arguments = length_dependent_columns_.size();
-      py::tuple input_arguments(number_of_arguments);
-      for (size_t i = 0; i < number_of_arguments; i++) {
-        py::array argument_value;
-        int32_t column_index = column_name_id_map_[length_dependent_columns_[i]];
-        RETURN_IF_NOT_OK(element[column_index]->GetDataAsNumpy(&argument_value));
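Condensed from the rewrite above, the contract `ObtainElementLength` now relies on is simply this: any `TensorOp` whose `Compute()` fills `output[0]` with a single `int32_t` can serve as the length function, whether it wraps a `py::function` or a C++ function pointer. A restatement of that contract as a free function, for emphasis only:

```cpp
// Sketch restating the code above; assumes the includes of bucket_batch_by_length_op.cc.
Status LengthFromTensorOp(const std::shared_ptr<TensorOp> &fn, const TensorRow &element,
                          int32_t *out_length) {
  TensorRow output;
  RETURN_IF_NOT_OK(fn->Compute(element, &output));  // PyFuncOp or CFuncOp, uniformly
  return output.at(0)->GetItemAt(out_length, {0});  // single int32_t at index 0
}
```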
-        input_arguments[i] = argument_value;
-      }
-
-      py::object length = element_length_function_(*input_arguments);
-      *out_element_length = length.cast<int32_t>();
-      if (*out_element_length < 0) {
-        return Status(StatusCode::kPyFuncException, "Element length function should return a non negative integer.");
-      }
-    } catch (const py::error_already_set &e) {
-      return Status(StatusCode::kPyFuncException, e.what());
-    } catch (const py::cast_error &e) {
-      return Status(StatusCode::kPyFuncException, "Count not cast output of element length function to int32_t.");
+    TensorRow output;
+    RETURN_IF_NOT_OK(element_length_function_->Compute(element, &output));
+    RETURN_IF_NOT_OK(output.at(0)->GetItemAt(out_element_length, {0}));
+    if (*out_element_length < 0) {
+      RETURN_STATUS_UNEXPECTED("BucketBatchByLength: element_length_function returned negative integer");
     }
   } else {
     *out_element_length = element[0]->shape()[0];
   }
-
   return Status::OK();
 }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.h
index e14a5ff760..fac40a7955 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.h
@@ -27,6 +27,7 @@
 #include "minddata/dataset/engine/dataset_iterator.h"
 #include "minddata/dataset/engine/datasetops/batch_op.h"
 #include "minddata/dataset/engine/datasetops/pipeline_op.h"
+#include "minddata/dataset/kernels/tensor_op.h"
 #include "minddata/dataset/util/status.h"
 
 namespace mindspore {
@@ -57,7 +58,7 @@ class BucketBatchByLengthOp : public PipelineOp {
       return *this;
     }
 
-    Builder &SetElementLengthFunction(py::function element_length_function) {
+    Builder &SetElementLengthFunction(std::shared_ptr<TensorOp> element_length_function) {
       builder_element_length_function_ = element_length_function;
       return *this;
     }
@@ -90,7 +91,7 @@ class BucketBatchByLengthOp : public PipelineOp {
     std::vector<std::string> builder_length_dependent_columns_;
     std::vector<int32_t> builder_bucket_boundaries_;
     std::vector<int32_t> builder_bucket_batch_sizes_;
-    py::function builder_element_length_function_;
+    std::shared_ptr<TensorOp> builder_element_length_function_;
     PadInfo builder_pad_info_;
     bool builder_pad_to_bucket_boundary_;
     bool builder_drop_remainder_;
@@ -98,8 +99,8 @@ class BucketBatchByLengthOp : public PipelineOp {
   };
 
   BucketBatchByLengthOp(std::vector<std::string> length_dependent_columns, std::vector<int32_t> bucket_boundaries,
-                        std::vector<int32_t> bucket_batch_sizes, py::function element_length_function, PadInfo pad_info,
-                        bool pad_to_bucket_boundary, bool drop_remainder, int32_t op_connector_size);
+                        std::vector<int32_t> bucket_batch_sizes, std::shared_ptr<TensorOp> element_length_function,
+                        PadInfo pad_info, bool pad_to_bucket_boundary, bool drop_remainder, int32_t op_connector_size);
 
   // Destructor
   ~BucketBatchByLengthOp() = default;
 
@@ -137,7 +138,7 @@ class BucketBatchByLengthOp : public PipelineOp {
   std::vector<std::string> length_dependent_columns_;
   std::vector<int32_t> bucket_boundaries_;
   std::vector<int32_t> bucket_batch_sizes_;
-  py::function element_length_function_;
+  std::shared_ptr<TensorOp> element_length_function_;
   PadInfo pad_info_;
   bool pad_to_bucket_boundary_;
   bool drop_remainder_;
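With the setter now taking `std::shared_ptr<TensorOp>`, both front ends construct the same kind of argument. A sketch, where `MyLengthFn` is the illustrative C++ function from the first sketch and `py_fn` stands for a `py::function` captured at the Python boundary:

```cpp
// C++ API path: wrap a plain function pointer.
std::shared_ptr<TensorOp> c_len = std::make_shared<CFuncOp>(&MyLengthFn);
// Python path (see de_pipeline.cc above): wrap the py::function, declaring an
// int32 output so PyFuncOp::CastOutput converts the returned Python int.
std::shared_ptr<TensorOp> py_len = std::make_shared<PyFuncOp>(py_fn, DataType::DE_INT32);
```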
"minddata/dataset/include/samplers.h" #include "minddata/dataset/include/type_id.h" +#include "minddata/dataset/kernels/c_func_op.h" +#include "minddata/dataset/kernels/tensor_op.h" #ifndef ENABLE_ANDROID #include "minddata/dataset/text/vocab.h" #endif @@ -72,6 +74,7 @@ class VOCDataset; #endif // Dataset Op classes (in alphabetical order) class BatchDataset; +class BucketBatchByLengthDataset; #ifndef ENABLE_ANDROID class BuildVocabDataset; #endif @@ -370,6 +373,35 @@ class Dataset : public std::enable_shared_from_this { /// \return Shared pointer to the current BatchDataset std::shared_ptr Batch(int32_t batch_size, bool drop_remainder = false); + /// \brief Function to create a BucketBatchByLengthDataset + /// \notes Combines batch_size number of consecutive rows into batches + /// \param[in] column_names Columns passed to element_length_function + /// \param[in] bucket_boundaries A list consisting of the upper boundaries of the buckets. + /// Must be strictly increasing. If there are n boundaries, n+1 buckets are created: One bucket for + /// [0, bucket_boundaries[0]), one bucket for [bucket_boundaries[i], bucket_boundaries[i+1]) for each + /// 0 BucketBatchByLength( + const std::vector &column_names, const std::vector &bucket_boundaries, + const std::vector &bucket_batch_sizes, TensorRow (*element_length_function)(TensorRow) = nullptr, + const std::map>> &pad_info = {}, + bool pad_to_bucket_boundary = false, bool drop_remainder = false); + #ifndef ENABLE_ANDROID /// \brief Function to create a Vocab from source dataset /// \notes Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab @@ -953,6 +985,36 @@ class BatchDataset : public Dataset { std::map>> pad_map_; }; +class BucketBatchByLengthDataset : public Dataset { + public: + /// \brief Constructor + BucketBatchByLengthDataset( + const std::vector &column_names, const std::vector &bucket_boundaries, + const std::vector &bucket_batch_sizes, TensorRow (*element_length_function)(TensorRow) = nullptr, + const std::map>> &pad_info = {}, + bool pad_to_bucket_boundary = false, bool drop_remainder = false); + + /// \brief Destructor + ~BucketBatchByLengthDataset() = default; + + /// \brief a base class override function to create the required runtime dataset op objects for this class + /// \return The list of shared pointers to the newly created DatasetOps + std::vector> Build() override; + + /// \brief Parameters validation + /// \return bool true if all the params are valid + bool ValidateParams() override; + + private: + std::vector column_names_; + std::vector bucket_boundaries_; + std::vector bucket_batch_sizes_; + TensorRow (*element_length_function_)(TensorRow); + std::map>> pad_info_; + bool pad_to_bucket_boundary_; + bool drop_remainder_; +}; + #ifndef ENABLE_ANDROID class BuildVocabDataset : public Dataset { public: diff --git a/mindspore/ccsrc/minddata/dataset/kernels/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/kernels/CMakeLists.txt index 3273822db5..3e5d3459c2 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/CMakeLists.txt +++ b/mindspore/ccsrc/minddata/dataset/kernels/CMakeLists.txt @@ -7,6 +7,7 @@ if (ENABLE_PYTHON) data/compose_op.cc data/random_apply_op.cc data/random_choice_op.cc + c_func_op.cc py_func_op.cc tensor_op.cc) target_include_directories(kernels PRIVATE ${pybind11_INCLUDE_DIRS}) diff --git a/mindspore/ccsrc/minddata/dataset/kernels/c_func_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/c_func_op.cc new file mode 100644 index 0000000000..3ac948691d --- 
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/c_func_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/c_func_op.cc
new file mode 100644
index 0000000000..3ac948691d
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/kernels/c_func_op.cc
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <memory>
+#include <vector>
+
+#include "minddata/dataset/kernels/c_func_op.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+#include "minddata/dataset/util/status.h"
+
+namespace mindspore {
+namespace dataset {
+Status CFuncOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  Status ret = Status(StatusCode::kOK, "CFunc Call Succeed");
+  try {
+    *output = (*c_func_ptr_)(input);
+  } catch (const std::exception &e) {
+    RETURN_STATUS_UNEXPECTED("Unexpected error in CFuncOp");
+  }
+  return Status::OK();
+}
+}  // namespace dataset
+}  // namespace mindspore
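`CFuncOp` can also be exercised stand-alone. A sketch using `BucketBatchTestFunction`, the helper added in the tests at the end of this patch (it ignores its input and returns a `{1}`-shaped int32 tensor holding 2); this assumes `IO_CHECK_VECTOR` rejects an empty input row:

```cpp
// Sketch only: drive CFuncOp::Compute directly.
TensorRow in, out;
std::shared_ptr<Tensor> dummy;
Tensor::CreateEmpty(TensorShape({1}), DataType(DataType::DE_INT32), &dummy);
in.push_back(dummy);  // non-empty input so the IO check passes
CFuncOp op(&BucketBatchTestFunction);
Status rc = op.Compute(in, &out);
int32_t length = 0;
if (rc.IsOk()) {
  rc = out.at(0)->GetItemAt(&length, {0});  // length == 2
}
```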
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/c_func_op.h b/mindspore/ccsrc/minddata/dataset/kernels/c_func_op.h
new file mode 100644
index 0000000000..184115c330
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/kernels/c_func_op.h
@@ -0,0 +1,50 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_C_FUNC_OP_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_C_FUNC_OP_H_
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+
+namespace mindspore {
+namespace dataset {
+class CFuncOp : public TensorOp {
+ public:
+  explicit CFuncOp(TensorRow (*func)(TensorRow)) : c_func_ptr_(func) {}
+
+  ~CFuncOp() override = default;
+
+  uint32_t NumInput() override { return 0; }
+  uint32_t NumOutput() override { return 0; }
+
+  // Calls c_func_ptr and returns the result
+  Status Compute(const TensorRow &input, TensorRow *output) override;
+
+  std::string Name() const override { return kCFuncOp; }
+
+ private:
+  TensorRow (*c_func_ptr_)(TensorRow);
+};
+}  // namespace dataset
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_C_FUNC_OP_H_
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/py_func_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/py_func_op.cc
index dbf2dfe73e..2ea26ab7b7 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/py_func_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/py_func_op.cc
@@ -37,35 +37,44 @@ Status PyFuncOp::Compute(const TensorRow &input, TensorRow *output) {
   try {
     // Transform input tensor vector into numpy array vector
     py::tuple input_args(input.size());
-    for (size_t i = 0; i < input.size(); i++) {
-      py::array new_data;
-      RETURN_IF_NOT_OK(input.at(i)->GetDataAsNumpy(&new_data));
-      // possible memcpy here
-      input_args[i] = new_data;
+    py::object ret_py_obj;
+    if (input.size() > 0) {
+      for (size_t i = 0; i < input.size(); i++) {
+        py::array new_data;
+        RETURN_IF_NOT_OK(input.at(i)->GetDataAsNumpy(&new_data));
+        // possible memcpy here
+        input_args[i] = new_data;
+      }
+      // Invoke python function
+      ret_py_obj = this->py_func_ptr_(*input_args);
+    } else {
+      ret_py_obj = this->py_func_ptr_();
     }
-    // Invoke python function
-    py::object ret_py_obj = this->py_func_ptr_(*input_args);
-    // Process the return value
-    if (py::isinstance<py::array>(ret_py_obj)) {
-      // In case of a n-1 mapping, the return value will be a numpy array
-      std::shared_ptr<Tensor> out;
-      RETURN_IF_NOT_OK(Tensor::CreateFromNpArray(ret_py_obj.cast<py::array>(), &out));
-      output->push_back(out);
-    } else if (py::isinstance<py::tuple>(ret_py_obj)) {
-      // In case of a n-m mapping, the return value will be a tuple of numpy arrays
-      py::tuple ret_py_tuple = ret_py_obj.cast<py::tuple>();
-      // Iterate over two containers simultaneously for memory copy
-      for (size_t i = 0; i < ret_py_tuple.size(); i++) {
-        py::object ret_py_ele = ret_py_tuple[i];
-        if (!py::isinstance<py::array>(ret_py_ele)) {
-          goto ShapeMisMatch;
+    if (output_type_ != DataType::DE_UNKNOWN) {
+      RETURN_IF_NOT_OK(CastOutput(ret_py_obj, output));
+
+    } else {
+      if (py::isinstance<py::tuple>(ret_py_obj)) {
+        // In case of a n-m mapping, the return value will be a tuple of numpy arrays
+        py::tuple ret_py_tuple = ret_py_obj.cast<py::tuple>();
+        // Iterate over two containers simultaneously for memory copy
+        for (size_t i = 0; i < ret_py_tuple.size(); i++) {
+          py::object ret_py_ele = ret_py_tuple[i];
+          if (!py::isinstance<py::array>(ret_py_ele)) {
+            goto ShapeMisMatch;
+          }
+          std::shared_ptr<Tensor> out;
+          RETURN_IF_NOT_OK(Tensor::CreateFromNpArray(ret_py_ele.cast<py::array>(), &out));
+          output->push_back(out);
         }
+      } else if (py::isinstance<py::array>(ret_py_obj)) {
+        // In case of a n-1 mapping, the return value will be a numpy array
         std::shared_ptr<Tensor> out;
-        RETURN_IF_NOT_OK(Tensor::CreateFromNpArray(ret_py_ele.cast<py::array>(), &out));
+        RETURN_IF_NOT_OK(Tensor::CreateFromNpArray(ret_py_obj.cast<py::array>(), &out));
         output->push_back(out);
+      } else {
+        goto ShapeMisMatch;
       }
-    } else {
-      goto ShapeMisMatch;
     }
   } catch (const py::error_already_set &e) {
     ret = Status(StatusCode::kPyFuncException, e.what());
@@ -79,5 +88,24 @@ ShapeMisMatch:
   ret = Status(StatusCode::kShapeMisMatch, "PyFunc should return a numpy array or a numpy array tuple");
   goto ComputeReturn;
 }
+
+Status PyFuncOp::CastOutput(const py::object &ret_py_obj, TensorRow *output) {
+  try {
+    std::shared_ptr<Tensor> out;
+    switch (output_type_) {
+      case DataType::DE_INT32:
+        RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({1}), DataType(DataType::DE_INT32), &out));
+        RETURN_IF_NOT_OK(out->SetItemAt({0}, ret_py_obj.cast<int32_t>()));
+        break;
+      default:
+        RETURN_STATUS_UNEXPECTED("No cast for the specified DataType was found.");
+    }
+    output->push_back(out);
+  } catch (const std::exception &e) {
+    return Status(StatusCode::kUnexpectedError, e.what());
+  }
+  return Status::OK();
+}
+
 }  // namespace dataset
 }  // namespace mindspore
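`CastOutput` currently handles only `DE_INT32`. A hypothetical extension for another primitive type, not part of this patch, would add a switch arm following the same pattern:

```cpp
// Hypothetical additional arm inside PyFuncOp::CastOutput's switch (output_type_):
case DataType::DE_BOOL:
  RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({1}), DataType(DataType::DE_BOOL), &out));
  RETURN_IF_NOT_OK(out->SetItemAt({0}, ret_py_obj.cast<bool>()));
  break;
```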
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/py_func_op.h b/mindspore/ccsrc/minddata/dataset/kernels/py_func_op.h
index d11437b490..c14c32355c 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/py_func_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/py_func_op.h
@@ -27,9 +27,11 @@
 namespace mindspore {
 namespace dataset {
-class __attribute__((visibility("hidden"))) PyFuncOp : public TensorOp {
+class PyFuncOp : public TensorOp {
  public:
-  explicit PyFuncOp(py::function func) : py_func_ptr_(std::move(func)) {}
+  explicit PyFuncOp(py::function func) : py_func_ptr_(std::move(func)) { output_type_ = DataType::DE_UNKNOWN; }
+  explicit PyFuncOp(py::function func, DataType::Type output_type)
+      : py_func_ptr_(std::move(func)), output_type_(output_type) {}
 
   ~PyFuncOp() override = default;
 
@@ -39,10 +41,18 @@ class __attribute__((visibility("hidden"))) PyFuncOp : public TensorOp {
   // Compute function for n-n mapping.
   Status Compute(const TensorRow &input, TensorRow *output) override;
 
+  /// \brief Function to convert a primitive type py::object to a TensorRow
+  /// \notes Changes the py::object to a tensor with corresponding C++ DataType based on output_type_ and adds it to a
+  ///    TensorRow. This function is used inside Compute.
+  /// \param[in] ret_py_obj The python object we want to cast
+  /// \param[out] output The TensorRow output
+  /// \return Status
+  Status CastOutput(const py::object &ret_py_obj, TensorRow *output);
   std::string Name() const override { return kPyFuncOp; }
 
  private:
   py::function py_func_ptr_;
+  DataType::Type output_type_;
 };
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
index 771d1af150..06dba20cd6 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
@@ -127,6 +127,7 @@ constexpr char kToFloat16Op[] = "ToFloat16Op";
 constexpr char kTypeCastOp[] = "TypeCastOp";
 
 // other
+constexpr char kCFuncOp[] = "CFuncOp";
 constexpr char kPyFuncOp[] = "PyFuncOp";
 constexpr char kNoOp[] = "NoOp";
 
diff --git a/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc b/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc
index 818189e6e2..57806837e2 100644
--- a/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc
+++ b/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 #include "common/common.h"
+#include "minddata/dataset/core/tensor_row.h"
 #include "minddata/dataset/include/datasets.h"
 #include "minddata/dataset/include/transforms.h"
 
@@ -24,6 +25,16 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
  protected:
 };
 
+mindspore::dataset::TensorRow BucketBatchTestFunction(mindspore::dataset::TensorRow input) {
+  mindspore::dataset::TensorRow output;
+  std::shared_ptr<Tensor> out;
+  Tensor::CreateEmpty(mindspore::dataset::TensorShape({1}),
+                      mindspore::dataset::DataType(mindspore::dataset::DataType::Type::DE_INT32), &out);
+  out->SetItemAt({0}, 2);
+  output.push_back(out);
+  return output;
+}
+
 TEST_F(MindDataTestPipeline, TestBatchAndRepeat) {
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBatchAndRepeat.";
 
@@ -47,7 +58,7 @@ TEST_F(MindDataTestPipeline, TestBatchAndRepeat) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
 
@@ -65,6 +76,183 @@ TEST_F(MindDataTestPipeline, TestBatchAndRepeat) {
   iter->Stop();
 }
 
+TEST_F(MindDataTestPipeline, TestBucketBatchByLengthSuccess1) {
+  // Calling with default values
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBucketBatchByLengthSuccess1.";
+
+  // Create a Mnist Dataset
+  std::string folder_path = datasets_root_path_ + "/testMnistData/";
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, RandomSampler(false, 10));
+  EXPECT_NE(ds, nullptr);
+
+  // Create a BucketBatchByLength operation on ds
+  ds = ds->BucketBatchByLength({"image"}, {1, 2, 3}, {4, 5, 6, 7});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // iterate over the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    i++;
+    auto image = row["image"];
+    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
+    iter->GetNextRow(&row);
+  }
+  // 2 batches of size 5
+  EXPECT_EQ(i, 2);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestBucketBatchByLengthSuccess2) {
+  // Calling with non-default values
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBucketBatchByLengthSuccess2.";
+
+  // Create a Mnist Dataset
+  std::string folder_path = datasets_root_path_ + "/testMnistData/";
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, RandomSampler(false, 10));
+  EXPECT_NE(ds, nullptr);
+
+  // Create a BucketBatchByLength operation on ds
+  std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>> pad_info;
+  ds = ds->BucketBatchByLength({"image"}, {1, 2}, {1, 2, 3},
+                               &BucketBatchTestFunction, pad_info, true, true);
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate over the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  uint64_t i = 0;
+  while (row.size() != 0) {
+    i++;
+    auto image = row["image"];
+    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
+    iter->GetNextRow(&row);
+  }
+  // 5 batches of size 2
+  EXPECT_EQ(i, 5);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail1) {
+  // Empty bucket_boundaries
+  // Calling with function pointer
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBucketBatchByLengthFail1.";
+
+  // Create a Mnist Dataset
+  std::string folder_path = datasets_root_path_ + "/testMnistData/";
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, RandomSampler(false, 10));
+  EXPECT_NE(ds, nullptr);
+
+  // Create a BucketBatchByLength operation on ds
+  ds = ds->BucketBatchByLength({"image"}, {}, {1});
+  EXPECT_EQ(ds, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail2) {
+  // Empty bucket_batch_sizes
+  // Calling with function pointer
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBucketBatchByLengthFail2.";
+
+  // Create a Mnist Dataset
+  std::string folder_path = datasets_root_path_ + "/testMnistData/";
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, RandomSampler(false, 10));
+  EXPECT_NE(ds, nullptr);
+
+  // Create a BucketBatchByLength operation on ds
+  ds = ds->BucketBatchByLength({"image"}, {1}, {});
+  EXPECT_EQ(ds, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail3) {
+  // Negative boundaries
+  // Calling with function pointer
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBucketBatchByLengthFail3.";
+
+  // Create a Mnist Dataset
+  std::string folder_path = datasets_root_path_ + "/testMnistData/";
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, RandomSampler(false, 10));
+  EXPECT_NE(ds, nullptr);
+
+  // Create a BucketBatchByLength operation on ds
+  ds = ds->BucketBatchByLength({"image"}, {-1, 1}, {1, 2, 3});
+  EXPECT_EQ(ds, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail4) {
+  // Boundaries not strictly increasing
+  // Calling with function pointer
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBucketBatchByLengthFail4.";
+
+  // Create a Mnist Dataset
+  std::string folder_path = datasets_root_path_ + "/testMnistData/";
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, RandomSampler(false, 10));
+  EXPECT_NE(ds, nullptr);
+
+  // Create a BucketBatchByLength operation on ds
+  ds = ds->BucketBatchByLength({"image"}, {2, 2}, {1, 2, 3});
+  EXPECT_EQ(ds, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail5) {
+  // Incorrect size of bucket_batch_size
+  // Calling with function pointer
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBucketBatchByLengthFail5.";
+
+  // Create a Mnist Dataset
+  std::string folder_path = datasets_root_path_ + "/testMnistData/";
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, RandomSampler(false, 10));
+  EXPECT_NE(ds, nullptr);
+
+  // Create a BucketBatchByLength operation on ds
+  ds = ds->BucketBatchByLength({"image"}, {1, 2}, {1, 2});
+  EXPECT_EQ(ds, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail6) {
+  // Negative bucket_batch_size
+  // Calling with function pointer
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBucketBatchByLengthFail6.";
+
+  // Create a Mnist Dataset
+  std::string folder_path = datasets_root_path_ + "/testMnistData/";
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, RandomSampler(false, 10));
+  EXPECT_NE(ds, nullptr);
+  // Create a BucketBatchByLength operation on ds
+  ds = ds->BucketBatchByLength({"image"}, {1, 2}, {1, -2, 3});
+  EXPECT_EQ(ds, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail7) {
+  // This should fail because element_length_function is not specified and column_names has more than 1 element.
+  // Calling with function pointer
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBucketBatchByLengthFail7.";
+
+  // Create a Mnist Dataset
+  std::string folder_path = datasets_root_path_ + "/testMnistData/";
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, RandomSampler(false, 10));
+  EXPECT_NE(ds, nullptr);
+
+  // Create a BucketBatchByLength operation on ds
+  ds = ds->BucketBatchByLength({"image", "label"}, {1, 2}, {1, 2, 3});
+  EXPECT_EQ(ds, nullptr);
+}
+
 TEST_F(MindDataTestPipeline, TestConcatFail1) {
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestConcatFail1.";
 
@@ -148,7 +336,7 @@ TEST_F(MindDataTestPipeline, TestConcatSuccess) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
   uint64_t i = 0;
@@ -200,7 +388,7 @@ TEST_F(MindDataTestPipeline, TestConcatSuccess2) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
   uint64_t i = 0;
@@ -239,7 +427,7 @@ TEST_F(MindDataTestPipeline, TestImageFolderBatchAndRepeat) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
 
@@ -293,7 +481,7 @@ TEST_F(MindDataTestPipeline, TestProjectMap) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
 
@@ -366,7 +554,7 @@ TEST_F(MindDataTestPipeline, TestProjectMapAutoInjection) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
 
@@ -468,7 +656,7 @@ TEST_F(MindDataTestPipeline, TestRenameSuccess) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
 
@@ -514,7 +702,7 @@ TEST_F(MindDataTestPipeline, TestRepeatDefault) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
   uint64_t i = 0;
@@ -557,7 +745,7 @@ TEST_F(MindDataTestPipeline, TestRepeatOne) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
   uint64_t i = 0;
@@ -630,7 +818,7 @@ TEST_F(MindDataTestPipeline, TestShuffleDataset) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
 
@@ -666,7 +854,7 @@ TEST_F(MindDataTestPipeline, TestSkipDataset) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
 
@@ -718,7 +906,7 @@ TEST_F(MindDataTestPipeline, TestTakeDatasetDefault) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
 
@@ -776,7 +964,7 @@ TEST_F(MindDataTestPipeline, TestTakeDatasetNormal) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
 
@@ -830,7 +1018,7 @@ TEST_F(MindDataTestPipeline, TestTensorOpsAndMap) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
 
@@ -938,7 +1126,7 @@ TEST_F(MindDataTestPipeline, TestZipSuccess) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);
 
@@ -992,7 +1180,7 @@ TEST_F(MindDataTestPipeline, TestZipSuccess2) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   EXPECT_NE(iter, nullptr);
 
-  // Iterate the dataset and get each row
+  // iterate over the dataset and get each row
   std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
   iter->GetNextRow(&row);