From 359ad198842f37071efc41a2c1cfda399cfbc733 Mon Sep 17 00:00:00 2001 From: Eric Date: Wed, 16 Sep 2020 10:54:30 -0400 Subject: [PATCH] Added float_array support Added fix, tests to come --- .../engine/datasetops/source/album_op.cc | 115 ++++++++++++++---- .../engine/datasetops/source/album_op.h | 14 +++ tests/ut/cpp/dataset/album_op_test.cc | 6 +- tests/ut/data/dataset/testAlbum/gen_json.py | 2 +- 4 files changed, 108 insertions(+), 29 deletions(-) diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc index 79a7012f2b..df40ea0446 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc @@ -88,11 +88,16 @@ AlbumOp::AlbumOp(int32_t num_wkrs, int32_t rows_per_buffer, std::string file_dir } // Helper function for string comparison +// album sorts the files via numerical values, so this is not a simple string comparison bool StrComp(const std::string &a, const std::string &b) { - // returns 1 if string a is alphabetically + // returns 1 if string a represent a numeric value // less than string b // quite similar to strcmp operation - return a < b; + // the following will always return name, provided there is only one "." character in name + // "." character is guranteed since the extension is checked befor this function call. + int64_t value_a = std::atoi(a.substr(1, a.find(".")).c_str()); + int64_t value_b = std::atoi(b.substr(1, b.find(".")).c_str()); + return value_a < value_b; } // Single thread to go through the folder directory and gets all file names @@ -137,7 +142,7 @@ Status AlbumOp::operator()() { while (sampler_buffer->eoe() == false) { TensorRow sample_row; RETURN_IF_NOT_OK(sampler_buffer->PopRow(&sample_row)); - std::shared_ptr sample_ids = sample_row[0]; + TensorPtr sample_ids = sample_row[0]; for (auto itr = sample_ids->begin(); itr != sample_ids->end(); ++itr) { if ((*itr) >= num_rows_) continue; // index out of bound, skipping keys.push_back(*itr); @@ -253,7 +258,7 @@ Status AlbumOp::CheckImageType(const std::string &file_name, bool *valid) { } Status AlbumOp::LoadImageTensor(const std::string &image_file_path, uint32_t col_num, TensorRow *row) { - std::shared_ptr image; + TensorPtr image; std::ifstream fs; fs.open(image_file_path, std::ios::binary | std::ios::in); if (fs.fail()) { @@ -283,7 +288,7 @@ Status AlbumOp::LoadStringArrayTensor(const nlohmann::json &json_obj, uint32_t c std::vector data = json_obj; MS_LOG(INFO) << "String array label found: " << data << "."; - std::shared_ptr label; + TensorPtr label; RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label)); row->push_back(std::move(label)); return Status::OK(); @@ -294,16 +299,16 @@ Status AlbumOp::LoadStringTensor(const nlohmann::json &json_obj, uint32_t col_nu // now we iterate over the elements in json MS_LOG(INFO) << "String label found: " << data << "."; - std::shared_ptr label; + TensorPtr label; RETURN_IF_NOT_OK(Tensor::CreateScalar(data, &label)); row->push_back(std::move(label)); return Status::OK(); } Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) { - std::shared_ptr label; + TensorPtr label; // consider templating this function to handle all ints - if (data_schema_->column(col_num).type() == DataType(DataType::DE_INT64)) { + if (data_schema_->column(col_num).type() == DataType::DE_INT64) { std::vector data; // Iterate over the integer list and add those values to the output shape tensor @@ -312,7 +317,7 @@ Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, uint32_t col_ (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); }); RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label)); - } else if (data_schema_->column(col_num).type() == DataType(DataType::DE_INT32)) { + } else if (data_schema_->column(col_num).type() == DataType::DE_INT32) { std::vector data; // Iterate over the integer list and add those values to the output shape tensor @@ -320,7 +325,6 @@ Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, uint32_t col_ using it_type = decltype(items.begin()); (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); }); - MS_LOG(INFO) << "Int array found: " << data << "."; RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label)); } else { RETURN_STATUS_UNEXPECTED("Invalid data, column type is neither int32 nor int64, it is " + @@ -330,16 +334,45 @@ Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, uint32_t col_ return Status::OK(); } +Status AlbumOp::LoadFloatArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) { + TensorPtr float_array; + // consider templating this function to handle all ints + if (data_schema_->column(col_num).type() == DataType::DE_FLOAT64) { + std::vector data; + + // Iterate over the integer list and add those values to the output shape tensor + auto items = json_obj.items(); + using it_type = decltype(items.begin()); + (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); }); + + RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &float_array)); + } else if (data_schema_->column(col_num).type() == DataType::DE_FLOAT32) { + std::vector data; + + // Iterate over the integer list and add those values to the output shape tensor + auto items = json_obj.items(); + using it_type = decltype(items.begin()); + (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); }); + + RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &float_array)); + } else { + RETURN_STATUS_UNEXPECTED("Invalid data, column type is neither float32 nor float64, it is " + + data_schema_->column(col_num).type().ToString()); + } + row->push_back(std::move(float_array)); + return Status::OK(); +} + Status AlbumOp::LoadIDTensor(const std::string &file, uint32_t col_num, TensorRow *row) { - if (data_schema_->column(col_num).type() == DataType(DataType::DE_STRING)) { - std::shared_ptr id; + if (data_schema_->column(col_num).type() == DataType::DE_STRING) { + TensorPtr id; RETURN_IF_NOT_OK(Tensor::CreateScalar(file, &id)); row->push_back(std::move(id)); return Status::OK(); } // hack to get the file name without extension, the 1 is to get rid of the backslash character int64_t image_id = std::atoi(file.substr(1, file.find(".")).c_str()); - std::shared_ptr id; + TensorPtr id; RETURN_IF_NOT_OK(Tensor::CreateScalar(image_id, &id)); MS_LOG(INFO) << "File ID " << image_id << "."; row->push_back(std::move(id)); @@ -348,7 +381,7 @@ Status AlbumOp::LoadIDTensor(const std::string &file, uint32_t col_num, TensorRo Status AlbumOp::LoadEmptyTensor(uint32_t col_num, TensorRow *row) { // hack to get the file name without extension, the 1 is to get rid of the backslash character - std::shared_ptr empty_tensor; + TensorPtr empty_tensor; RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({}), data_schema_->column(col_num).type(), &empty_tensor)); row->push_back(std::move(empty_tensor)); return Status::OK(); @@ -359,12 +392,12 @@ Status AlbumOp::LoadEmptyTensor(uint32_t col_num, TensorRow *row) { // Float64 doesn't work with reinterpret cast here. Otherwise we limit the float in the schema to // only be float32, seems like a weird limitation to impose Status AlbumOp::LoadFloatTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) { - std::shared_ptr float_tensor; - if (data_schema_->column(col_num).type() == DataType(DataType::DE_FLOAT64)) { + TensorPtr float_tensor; + if (data_schema_->column(col_num).type() == DataType::DE_FLOAT64) { double data = json_obj; MS_LOG(INFO) << "double found: " << json_obj << "."; RETURN_IF_NOT_OK(Tensor::CreateScalar(data, &float_tensor)); - } else if (data_schema_->column(col_num).type() == DataType(DataType::DE_FLOAT32)) { + } else if (data_schema_->column(col_num).type() == DataType::DE_FLOAT32) { float data = json_obj; RETURN_IF_NOT_OK(Tensor::CreateScalar(data, &float_tensor)); MS_LOG(INFO) << "float found: " << json_obj << "."; @@ -373,9 +406,27 @@ Status AlbumOp::LoadFloatTensor(const nlohmann::json &json_obj, uint32_t col_num return Status::OK(); } -// Load 1 TensorRow (image,label) using 1 ImageColumns. 1 function call produces 1 TensorTow in a DataBuffer +// Loads a tensor with int value, we have to cast the value to type specified in the schema. +Status AlbumOp::LoadIntTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) { + TensorPtr int_tensor; + if (data_schema_->column(col_num).type() == DataType::DE_INT64) { + int64_t data = json_obj; + MS_LOG(INFO) << "int64 found: " << json_obj << "."; + RETURN_IF_NOT_OK(Tensor::CreateScalar(data, &int_tensor)); + } else if (data_schema_->column(col_num).type() == DataType::DE_INT32) { + int32_t data = json_obj; + RETURN_IF_NOT_OK(Tensor::CreateScalar(data, &int_tensor)); + MS_LOG(INFO) << "int32 found: " << json_obj << "."; + } + row->push_back(std::move(int_tensor)); + return Status::OK(); +} + +// Load 1 TensorRow (image,label) using 1 ImageColumns. 1 function call produces 1 TensorRow in a DataBuffer // possible optimization: the helper functions of LoadTensorRow should be optimized // to take a reference to a column descriptor? +// the design of this class is to make the code more readable, forgoing minor perfomance gain like +// getting rid of duplicated checks Status AlbumOp::LoadTensorRow(const std::string &file, TensorRow *row) { // testing here is to just print out file path (*row) = {}; @@ -414,30 +465,42 @@ Status AlbumOp::LoadTensorRow(const std::string &file, TensorRow *row) { MS_LOG(INFO) << "This column is: " << data_schema_->column(i).name() << "."; bool is_array = column_value.is_array(); // load single string - if (column_value.is_string() && data_schema_->column(i).type() == DataType(DataType::DE_STRING)) { + if (column_value.is_string() && data_schema_->column(i).type() == DataType::DE_STRING) { RETURN_IF_NOT_OK(LoadStringTensor(column_value, i, row)); continue; } // load string array - if (is_array && data_schema_->column(i).type() == DataType(DataType::DE_STRING)) { + if (is_array && data_schema_->column(i).type() == DataType::DE_STRING) { RETURN_IF_NOT_OK(LoadStringArrayTensor(column_value, i, row)); continue; } // load image file - if (column_value.is_string() && data_schema_->column(i).type() != DataType(DataType::DE_STRING)) { + if (column_value.is_string() && data_schema_->column(i).type() != DataType::DE_STRING) { std::string image_file_path = column_value; RETURN_IF_NOT_OK(LoadImageTensor(image_file_path, i, row)); continue; } - // load float array - if (!is_array && (data_schema_->column(i).type() == DataType(DataType::DE_FLOAT32) || - data_schema_->column(i).type() == DataType(DataType::DE_FLOAT64))) { + // load float value + if (!is_array && (data_schema_->column(i).type() == DataType::DE_FLOAT32 || + data_schema_->column(i).type() == DataType::DE_FLOAT64)) { RETURN_IF_NOT_OK(LoadFloatTensor(column_value, i, row)); continue; } + // load float array + if (is_array && (data_schema_->column(i).type() == DataType::DE_FLOAT32 || + data_schema_->column(i).type() == DataType::DE_FLOAT64)) { + RETURN_IF_NOT_OK(LoadFloatArrayTensor(column_value, i, row)); + continue; + } + // int value + if (!is_array && (data_schema_->column(i).type() == DataType::DE_INT64 || + data_schema_->column(i).type() == DataType::DE_INT32)) { + RETURN_IF_NOT_OK(LoadIntTensor(column_value, i, row)); + continue; + } // int array - if (is_array && (data_schema_->column(i).type() == DataType(DataType::DE_INT64) || - data_schema_->column(i).type() == DataType(DataType::DE_INT32))) { + if (is_array && (data_schema_->column(i).type() == DataType::DE_INT64 || + data_schema_->column(i).type() == DataType::DE_INT32)) { RETURN_IF_NOT_OK(LoadIntArrayTensor(column_value, i, row)); continue; } else { diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.h index 3e28e1b6db..28020bdd2c 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.h +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.h @@ -220,6 +220,13 @@ class AlbumOp : public ParallelOp, public RandomAccessOp { /// \return Status The error code return Status LoadIntArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row); + /// \brief Load vector of floatss to tensor, append tensor to tensor row + /// \param[in] json_obj Json object containing array data + /// \param[in] col_num Column num in schema + /// \param[inout] row Tensor row to push to + /// \return Status The error code return + Status LoadFloatArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row); + /// \brief Load string array into a tensor, append tensor to tensor row /// \param[in] json_obj Json object containing string tensor /// \param[in] col_num Column num in schema @@ -241,6 +248,13 @@ class AlbumOp : public ParallelOp, public RandomAccessOp { /// \return Status The error code return Status LoadFloatTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row); + /// \brief Load int value to tensor row + /// \param[in] json_obj Json object containing int + /// \param[in] col_num Column num in schema + /// \param[inout] row Tensor row to push to + /// \return Status The error code return + Status LoadIntTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row); + /// \brief Load emtpy tensor to tensor row /// \param[in] col_num Column num in schema /// \param[inout] row Tensor row to push to diff --git a/tests/ut/cpp/dataset/album_op_test.cc b/tests/ut/cpp/dataset/album_op_test.cc index fcd81ed19e..b921dd04ea 100644 --- a/tests/ut/cpp/dataset/album_op_test.cc +++ b/tests/ut/cpp/dataset/album_op_test.cc @@ -192,12 +192,14 @@ TEST_F(MindDataTestAlbum, TestSequentialAlbumWithFullSchema) { uint64_t i = 0; int32_t label = 0; double priority = 0; + int64_t id = 0; while (tensor_map.size() != 0) { tensor_map["label"]->GetItemAt(&label, {}); tensor_map["_priority"]->GetItemAt(&priority, {}); - MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape" + tensor_map["id"]->GetItemAt(&id, {}); + MS_LOG(ERROR) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape" << tensor_map["label"] << "priority: " << priority << " embedding : " << - tensor_map["_embedding"]->shape() << "\n"; + tensor_map["_embedding"]->shape() << " id: " << id << "\n"; i++; di.GetNextAsMap(&tensor_map); } diff --git a/tests/ut/data/dataset/testAlbum/gen_json.py b/tests/ut/data/dataset/testAlbum/gen_json.py index 3b74180588..dd2454a2da 100644 --- a/tests/ut/data/dataset/testAlbum/gen_json.py +++ b/tests/ut/data/dataset/testAlbum/gen_json.py @@ -15,7 +15,7 @@ if __name__ == '__main__': default_dict.update(dataset='') default_dict.update(image=os.path.abspath(os.path.join(DIRECTORY, filename))) default_dict.update(label=['3', '2']) - default_dict.update(_priority=0.8) + default_dict.update(_priority=[0.8, 0.3]) default_dict.update(_embedding=os.path.abspath(os.path.join(PARENT_DIR, 'sample.bin'))) default_dict.update(_processed_image=os.path.abspath(os.path.join(DIRECTORY, filename))) i = i + 1