!7653 [MD] C++ api SaveOp

Merge pull request !7653 from luoyang/c-api-pyfunc
mindspore-ci-bot 4 years ago committed by Gitee
commit 9d35fdc4e5

@ -105,6 +105,52 @@ std::shared_ptr<Iterator> Dataset::CreateIterator(std::vector<std::string> colum
return iter;
// Function to create the saver, which will build and launch the execution tree and save data
// Steps, each of which logs an error and returns false on failure:
//   1. initialise a fresh RuntimeContext,
//   2. construct a SaveToDisk consumer and validate its parameters,
//   3. initialise the consumer with this dataset,
//   4. run the consumer's Save(),
//   5. terminate the runtime context.
// \param dataset_path forwarded unchanged to the SaveToDisk constructor
// \param num_files forwarded unchanged to the SaveToDisk constructor
// \param dataset_type forwarded unchanged to the SaveToDisk constructor
// \return true if every step succeeded, false otherwise
bool Dataset::Save(std::string dataset_path, int32_t num_files, std::string dataset_type) {
Status rc;
// Build and launch tree
// The consumer is initialised with a shared_ptr to this dataset, hence shared_from_this().
auto ds = shared_from_this();
std::unique_ptr<RuntimeContext> runtime_context = std::make_unique<RuntimeContext>();
rc = runtime_context->Init();
if (rc.IsError()) {
MS_LOG(ERROR) << "CreateSaver failed." << rc;
return false;
// Get SaveToDisk consumer
auto consumer = std::make_unique<SaveToDisk>(dataset_path, num_files, dataset_type);
// Reject bad arguments before the consumer is initialised with the dataset.
rc = consumer->ValidateParams();
if (rc.IsError()) {
MS_LOG(ERROR) << "CreateSaver failed." << rc;
return false;
// Raw, non-owning alias of the consumer (the unique_ptr above still owns it).
SaveToDisk *consumer_ = consumer.get();
rc = consumer->Init(ds);
if (rc.IsError()) {
MS_LOG(ERROR) << "CreateSaver failed." << rc;
return false;
// Save data into file
rc = consumer_->Save();
if (rc.IsError()) {
MS_LOG(ERROR) << "Saver: Failed to save data into file. Error status: " << rc;
// NOTE(review): this and the earlier failure returns skip the explicit Terminate() call below;
// presumably RuntimeContext cleans up in its destructor -- confirm.
return false;
// Shut down the data pipeline
rc = runtime_context->Terminate();
if (rc.IsError()) {
MS_LOG(ERROR) << "Saver: Failed to shut down pipeline. Error status: " << rc;
return false;
return true;
// Constructor
Dataset::Dataset() {
// Fetch some default value from config manager

@ -46,8 +46,8 @@ bool Iterator::GetNextRow(TensorVec *row) {
// Shut down the data pipeline.
void Iterator::Stop() { runtime_context->Terminate(); }
//// Function to build and launch the execution tree.
// Function to build and launch the execution tree.
Status Iterator::BuildAndLaunchTree(std::shared_ptr<Dataset> ds) {
runtime_context = std::make_unique<RuntimeContext>();

@ -18,6 +18,7 @@
#include <memory>
#include <string>
#include <map>
#include <unordered_map>
#include <utility>
#include <vector>
@ -77,26 +78,50 @@ class IteratorConsumer : public TreeConsumer {
int32_t num_epochs_;
/// Consumer that iterates over the dataset and writes it to desk
class SaveToDesk : public TreeConsumer {
/// Consumer that iterates over the dataset and writes it to disk
class SaveToDisk : public TreeConsumer {
/// Constructor which will call the base class default constructor.
/// \param dataset_path path to the dataset
/// \param num_files number of files. Default to 1
/// \param dataset_type The format of the dataset. Default to "mindrecord".
explicit SaveToDesk(std::string dataset_path, int32_t num_files = 1, std::string dataset_type = "mindrecord")
explicit SaveToDisk(std::string dataset_path, int32_t num_files = 1, std::string dataset_type = "mindrecord")
: TreeConsumer(), dataset_path_(dataset_path), num_files_(num_files), dataset_type_(dataset_type) {}
/// Save the given dataset to MindRecord format on desk. This is a blocking method (i.e., after returning, all rows
/// would be written to desk)
/// \brief Parameters validation
/// \return Status Status::OK() if all the parameters are valid
Status ValidateParams();
/// Save the given dataset to MindRecord format on disk. This is a blocking method (i.e., after returning, all rows
/// would be written to disk)
/// \return Status error code
Status Save() { return Status(StatusCode::kNotImplementedYet, __LINE__, __FILE__, "Method is not implemented yet."); }
Status Save();
/// Method to return the name of the consumer
/// \return string
std::string Name() override { return "SaveToDisk"; }
template <typename T, typename S>
Status TransfromTensor(const unsigned char *src, const TensorShape &shape, const int64_t num_of_elements,
std::unique_ptr<T> *data, std::unique_ptr<std::vector<uint8_t>> *data_ptr,
std::unique_ptr<S> *s, bool need_convert = false);
Status FetchMetaFromTensorRow(const std::unordered_map<std::string, int32_t> &column_name_id_map,
const TensorRow &row, nlohmann::json *schema, std::vector<std::string> *index_fields);
Status FetchDataFromTensorRow(const TensorRow &row,
const std::unordered_map<std::string, int32_t> &column_name_id_map,
nlohmann::json *row_raw_data,
std::map<std::string, std::unique_ptr<std::vector<uint8_t>>> *row_bin_data);
std::string dataset_path_;
int32_t num_files_;
std::string dataset_type_;
/// Consumer that iterates over the dataset and send it to a device
class ToDevice : public TreeConsumer {

@ -589,6 +589,22 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
/// \return Shared pointer to the Iterator
std::shared_ptr<Iterator> CreateIterator(std::vector<std::string> columns = {});
/// \brief Function to create a Saver to save the dynamic data processed by the dataset pipeline
/// \note Usage restrictions:
/// 1. Supported dataset formats: 'mindrecord' only
/// 2. To save the samples in order, set dataset's shuffle to false and num_files to 1.
/// 3. Before calling the function, do not use batch operator, repeat operator or data augmentation operators
/// with random attribute in map operator.
/// 4. Mindrecord does not support bool, uint64, multi-dimensional uint8(drop dimension) nor
/// multi-dimensional string.
/// \param[in] dataset_path Path to dataset file
/// \param[in] num_files Number of dataset files (default=1)
/// \param[in] dataset_type Dataset format (default="mindrecord")
/// \return Returns true if no error encountered else false
bool Save(std::string dataset_path, int32_t num_files = 1, std::string dataset_type = "mindrecord");
/// \brief Function to create a BatchNode
/// \note Combines batch_size number of consecutive rows into batches
/// \param[in] batch_size Number of consecutive rows to combine into each batch

@ -0,0 +1,148 @@
* Copyright 2020 Huawei Technologies Co., Ltd
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
#include <stdio.h>
#include "common/common.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/transforms.h"
using namespace mindspore::dataset::api;
using mindspore::dataset::Tensor;
class MindDataTestPipeline : public UT::DatasetOpTesting {
// Round-trip test: reads 10 Cifar10 samples, saves them with Dataset::Save() to a single
// mindrecord file, reloads that file with MindData and compares the labels with the originals.
TEST_F(MindDataTestPipeline, TestSaveCifar10AndLoad) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSaveCifar10AndLoad(single mindrecord file).";
// Stage 1: load original dataset
// Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", SequentialSampler(0, 10));
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
std::vector<std::shared_ptr<Tensor>> original_data;
// Save original data for comparison
uint64_t i = 0;
while (row.size() != 0) {
auto label = row["label"];
MS_LOG(INFO) << "Tensor label: " << *label;
// Expect 10 samples
EXPECT_EQ(i, 10);
// Manually terminate the pipeline
// Stage 2: Save data processed by the dataset pipeline
// Save() builds and launches its own execution tree and writes the rows to temp_file.
// temp_file_db names the companion ".db" file that is removed together with temp_file at the end of the test.
std::string temp_file = datasets_root_path_ + "/testCifar10Data/mind.mind";
std::string temp_file_db = datasets_root_path_ + "/testCifar10Data/mind.mind.db";
bool rc = ds->Save(temp_file);
EXPECT_EQ(rc, true);
// Stage 3: Load dataset from file output by stage 2
// Create a MindData Dataset
std::shared_ptr<Dataset> ds_minddata = MindData(temp_file, {}, SequentialSampler(0, 10));
// Create objects for the tensor ops
// uint32 will be cast to int64 implicitly in mindrecord file, so we have to cast it back to uint32
std::shared_ptr<TensorOperation> type_cast = transforms::TypeCast("uint32");
EXPECT_NE(type_cast, nullptr);
// Create a Map operation on ds_minddata that applies the type cast to the "label" column
ds_minddata = ds_minddata->Map({type_cast}, {"label"});
EXPECT_NE(ds_minddata, nullptr);
// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter_minddata = ds_minddata->CreateIterator();
EXPECT_NE(iter_minddata, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row_minddata;
// Check column name for each row
EXPECT_NE(row_minddata.find("image"), row_minddata.end());
EXPECT_NE(row_minddata.find("label"), row_minddata.end());
// Expect the output data is same with original_data
uint64_t j = 0;
while (row_minddata.size() != 0) {
auto label = row_minddata["label"];
EXPECT_EQ(*original_data[j], *label);
MS_LOG(INFO) << "Tensor label: " << *label;
// Expect 10 samples
EXPECT_EQ(j, 10);
// Manually terminate the pipeline
// Delete temp file
EXPECT_EQ(remove(temp_file.c_str()), 0);
EXPECT_EQ(remove(temp_file_db.c_str()), 0);
// Negative test: Dataset::Save() must return false for each kind of invalid argument.
TEST_F(MindDataTestPipeline, TestSaveFail) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSaveFail with incorrect param.";
// Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", SequentialSampler(0, 10));
EXPECT_NE(ds, nullptr);
// fail with invalid dataset_path: empty string
std::string temp_file1 = "";
bool rc1 = ds->Save(temp_file1);
EXPECT_EQ(rc1, false);
// fail with invalid dataset_path: path ends with '/' (names a directory rather than a file)
std::string temp_file2 = datasets_root_path_ + "/testCifar10Data/";
bool rc2 = ds->Save(temp_file2);
EXPECT_EQ(rc2, false);
// fail with invalid num_files: 0
std::string temp_file3 = datasets_root_path_ + "/testCifar10Data/mind.mind";
bool rc3 = ds->Save(temp_file3, 0);
EXPECT_EQ(rc3, false);
// fail with invalid num_files: 1001 (presumably above the supported maximum -- confirm against
// SaveToDisk::ValidateParams)
std::string temp_file4 = datasets_root_path_ + "/testCifar10Data/mind.mind";
bool rc4 = ds->Save(temp_file4, 1001);
EXPECT_EQ(rc4, false);
// fail with invalid dataset_type: "tfrecord" (the Save() documentation lists 'mindrecord' as the
// only supported format)
std::string temp_file5 = datasets_root_path_ + "/testCifar10Data/mind.mind";
bool rc5 = ds->Save(temp_file5, 5, "tfrecord");
EXPECT_EQ(rc5, false);