add randomdataset and schema

pull/4422/head
xiefangqi 5 years ago
parent 2cc6230f81
commit b91e56375e

File diff suppressed because it is too large

@@ -15,6 +15,7 @@
*/
#include "minddata/dataset/include/de_tensor.h"
#include "minddata/dataset/include/type_id.h"
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/core/data_type.h"
#include "mindspore/core/ir/dtype/type_id.h"
@@ -23,68 +24,6 @@
namespace mindspore {
namespace tensor {
dataset::DataType MSTypeToDEType(TypeId data_type) {
switch (data_type) {
case kNumberTypeBool:
return dataset::DataType(dataset::DataType::DE_BOOL);
case kNumberTypeInt8:
return dataset::DataType(dataset::DataType::DE_INT8);
case kNumberTypeUInt8:
return dataset::DataType(dataset::DataType::DE_UINT8);
case kNumberTypeInt16:
return dataset::DataType(dataset::DataType::DE_INT16);
case kNumberTypeUInt16:
return dataset::DataType(dataset::DataType::DE_UINT16);
case kNumberTypeInt32:
return dataset::DataType(dataset::DataType::DE_INT32);
case kNumberTypeUInt32:
return dataset::DataType(dataset::DataType::DE_UINT32);
case kNumberTypeInt64:
return dataset::DataType(dataset::DataType::DE_INT64);
case kNumberTypeUInt64:
return dataset::DataType(dataset::DataType::DE_UINT64);
case kNumberTypeFloat16:
return dataset::DataType(dataset::DataType::DE_FLOAT16);
case kNumberTypeFloat32:
return dataset::DataType(dataset::DataType::DE_FLOAT32);
case kNumberTypeFloat64:
return dataset::DataType(dataset::DataType::DE_FLOAT64);
default:
return dataset::DataType(dataset::DataType::DE_UNKNOWN);
}
}
TypeId DETypeToMSType(dataset::DataType data_type) {
switch (data_type.value()) {
case dataset::DataType::DE_BOOL:
return mindspore::TypeId::kNumberTypeBool;
case dataset::DataType::DE_INT8:
return mindspore::TypeId::kNumberTypeInt8;
case dataset::DataType::DE_UINT8:
return mindspore::TypeId::kNumberTypeUInt8;
case dataset::DataType::DE_INT16:
return mindspore::TypeId::kNumberTypeInt16;
case dataset::DataType::DE_UINT16:
return mindspore::TypeId::kNumberTypeUInt16;
case dataset::DataType::DE_INT32:
return mindspore::TypeId::kNumberTypeInt32;
case dataset::DataType::DE_UINT32:
return mindspore::TypeId::kNumberTypeUInt32;
case dataset::DataType::DE_INT64:
return mindspore::TypeId::kNumberTypeInt64;
case dataset::DataType::DE_UINT64:
return mindspore::TypeId::kNumberTypeUInt64;
case dataset::DataType::DE_FLOAT16:
return mindspore::TypeId::kNumberTypeFloat16;
case dataset::DataType::DE_FLOAT32:
return mindspore::TypeId::kNumberTypeFloat32;
case dataset::DataType::DE_FLOAT64:
return mindspore::TypeId::kNumberTypeFloat64;
default:
return kTypeUnknown;
}
}
MSTensor *DETensor::CreateTensor(TypeId data_type, const std::vector<int> &shape) {
return new DETensor(data_type, shape);
}
@@ -100,7 +39,7 @@ DETensor::DETensor(TypeId data_type, const std::vector<int> &shape) {
t_shape.reserve(shape.size());
std::transform(shape.begin(), shape.end(), std::back_inserter(t_shape),
[](int s) -> dataset::dsize_t { return static_cast<dataset::dsize_t>(s); });
dataset::Tensor::CreateEmpty(dataset::TensorShape(t_shape), MSTypeToDEType(data_type), &this->tensor_impl_);
dataset::Tensor::CreateEmpty(dataset::TensorShape(t_shape), dataset::MSTypeToDEType(data_type), &this->tensor_impl_);
}
DETensor::DETensor(std::shared_ptr<dataset::Tensor> tensor_ptr) { this->tensor_impl_ = std::move(tensor_ptr); }
@@ -120,14 +59,14 @@ std::shared_ptr<dataset::Tensor> DETensor::tensor() const {
TypeId DETensor::data_type() const {
MS_ASSERT(this->tensor_impl_ != nullptr);
return DETypeToMSType(this->tensor_impl_->type());
return dataset::DETypeToMSType(this->tensor_impl_->type());
}
TypeId DETensor::set_data_type(TypeId data_type) {
MS_ASSERT(this->tensor_impl_ != nullptr);
if (data_type != this->data_type()) {
std::shared_ptr<dataset::Tensor> temp;
dataset::Tensor::CreateFromMemory(this->tensor_impl_->shape(), MSTypeToDEType(data_type),
dataset::Tensor::CreateFromMemory(this->tensor_impl_->shape(), dataset::MSTypeToDEType(data_type),
this->tensor_impl_->GetBuffer(), &temp);
this->tensor_impl_ = temp;
}

@@ -50,13 +50,6 @@ Status RandomDataOp::Builder::Build(std::shared_ptr<RandomDataOp> *out_op) {
std::make_shared<RandomDataOp>(builder_num_workers_, builder_op_connector_size_, builder_rows_per_buffer_,
builder_total_rows_, std::move(builder_data_schema_), std::move(builder_sampler_));
// If the user did not provide a schema, then we will ask the op to generate a pseudo-random
// schema.
// See details of generateSchema function to learn what type of schema it will create.
if ((*out_op)->data_schema_ == nullptr) {
RETURN_IF_NOT_OK((*out_op)->GenerateSchema());
}
return Status::OK();
}
@@ -85,6 +78,12 @@ RandomDataOp::RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64
if (total_rows_ == 0) {
total_rows_ = GenRandomInt(1, kMaxTotalRows);
}
// If the user did not provide a schema, then we will ask the op to generate a pseudo-random
// schema.
// See details of the GenerateSchema function to learn what type of schema it will create.
if (data_schema_ == nullptr) {
GenerateSchema();
}
// Everyone is already out from the sync area.
all_out_.Set();
}
@@ -106,11 +105,7 @@ void RandomDataOp::Print(std::ostream &out, bool show_all) const {
}
// Helper function to produce a default/random schema if one didn't exist
Status RandomDataOp::GenerateSchema() {
if (data_schema_ != nullptr) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Generating a schema but one already exists!");
}
void RandomDataOp::GenerateSchema() {
// To randomly create a schema, we need to choose:
// a) how many columns
// b) the type of each column
@@ -144,8 +139,6 @@ Status RandomDataOp::GenerateSchema() {
data_schema_->AddColumn(*newCol);
}
return Status::OK();
}
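A minimal standalone sketch of the pseudo-random schema idea that GenerateSchema implements (choose a column count, then a random type and a random shape per column). `FakeColumn` and `GenerateRandomSchema` are hypothetical illustration-only names, not MindSpore API, and the numeric limits merely mirror the kMaxNumColumns/kMaxRank/kMaxDimValue constants used by the op:

```cpp
// Sketch only: mirrors the approach described in GenerateSchema, not the exact implementation.
#include <cstdint>
#include <random>
#include <string>
#include <vector>

struct FakeColumn {
  std::string name;
  int type_index;               // index into some fixed list of supported DE types
  std::vector<int32_t> shape;   // random rank, random dimension values
};

std::vector<FakeColumn> GenerateRandomSchema(std::mt19937 &rng) {
  std::uniform_int_distribution<int32_t> num_cols_dist(1, 4);  // kMaxNumColumns
  std::uniform_int_distribution<int32_t> rank_dist(1, 4);      // kMaxRank
  std::uniform_int_distribution<int32_t> dim_dist(1, 32);      // kMaxDimValue
  std::uniform_int_distribution<int32_t> type_dist(0, 11);     // 12 numeric DE types

  std::vector<FakeColumn> columns;
  const int32_t num_cols = num_cols_dist(rng);
  for (int32_t i = 0; i < num_cols; ++i) {
    FakeColumn col;
    col.name = "c" + std::to_string(i);
    col.type_index = type_dist(rng);
    const int32_t rank = rank_dist(rng);
    for (int32_t d = 0; d < rank; ++d) col.shape.push_back(dim_dist(rng));
    columns.push_back(col);
  }
  return columns;
}
```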
// Class functor operator () override.

@@ -213,9 +213,8 @@ class RandomDataOp : public ParallelOp {
/**
* Helper function to produce a default/random schema if one didn't exist
@return Status - The error code return
*/
Status GenerateSchema();
*/
void GenerateSchema();
/**
* Performs a synchronization between workers at the end of an epoch

@@ -24,9 +24,11 @@
#include <utility>
#include <string>
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/engine/data_schema.h"
#include "minddata/dataset/include/tensor.h"
#include "minddata/dataset/include/iterator.h"
#include "minddata/dataset/include/samplers.h"
#include "minddata/dataset/include/type_id.h"
namespace mindspore {
namespace dataset {
@@ -40,6 +42,7 @@ class TensorShape;
namespace api {
class TensorOperation;
class SchemaObj;
class SamplerObj;
// Datasets classes (in alphabetical order)
class CelebADataset;
@@ -49,6 +52,7 @@ class CLUEDataset;
class CocoDataset;
class ImageFolderDataset;
class MnistDataset;
class RandomDataset;
class TextFileDataset;
class VOCDataset;
// Dataset Op classes (in alphabetical order)
@@ -63,6 +67,11 @@ class SkipDataset;
class TakeDataset;
class ZipDataset;
/// \brief Function to create a SchemaObj
/// \param[in] schema_file Path of schema file
/// \return Shared pointer to the current schema
std::shared_ptr<SchemaObj> Schema(const std::string &schema_file = "");
/// \brief Function to create a CelebADataset
/// \notes The generated dataset has two columns ['image', 'attr'].
// The type of the image tensor is uint8. The attr tensor is uint32 and one hot type.
@@ -167,6 +176,21 @@ std::shared_ptr<MnistDataset> Mnist(std::string dataset_dir, std::shared_ptr<Sam
std::shared_ptr<ConcatDataset> operator+(const std::shared_ptr<Dataset> &datasets1,
const std::shared_ptr<Dataset> &datasets2);
/// \brief Function to create a RandomDataset
/// \param[in] total_rows Number of rows for the dataset to generate (default=0, number of rows is random)
/// \param[in] schema SchemaObj (or path to a schema file) defining the column names, data types and shapes
/// \param[in] columns_list List of columns to be read (default={}, read all columns)
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, a `RandomSampler`
///     will be used to randomly iterate the entire dataset
/// \return Shared pointer to the current Dataset
template <typename T = std::shared_ptr<SchemaObj>>
std::shared_ptr<RandomDataset> RandomData(const int32_t &total_rows = 0, T schema = nullptr,
std::vector<std::string> columns_list = {},
std::shared_ptr<SamplerObj> sampler = nullptr) {
auto ds = std::make_shared<RandomDataset>(total_rows, schema, std::move(columns_list), std::move(sampler));
return ds->ValidateParams() ? ds : nullptr;
}
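A sketch of the intended call pattern, using only the API declared above; the column names, shapes and row count are illustrative, and error handling is reduced to a null check.

```cpp
// Sketch only (namespace mindspore::dataset::api assumed to be in scope).
std::shared_ptr<SchemaObj> schema = Schema();  // no schema file: start from an empty schema
schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {28, 28, 1});
schema->add_column("label", mindspore::TypeId::kNumberTypeUInt32, {1});

// 10 rows, all columns, default RandomSampler (sampler == nullptr).
std::shared_ptr<RandomDataset> ds = RandomData(10, schema);
if (ds == nullptr) {
  // ValidateParams() rejected the inputs.
}
```

The returned node behaves like any other Dataset, so the usual pipeline calls (e.g. CreateIterator()) apply; they fall outside the lines shown here.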
/// \brief Function to create a TextFileDataset
/// \notes The generated dataset has one column ['text']
/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list
@@ -335,6 +359,66 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
int32_t worker_connector_size_;
};
class SchemaObj {
public:
/// \brief Constructor
explicit SchemaObj(const std::string &schema_file = "");
/// \brief Destructor
~SchemaObj() = default;
/// \brief SchemaObj init function
/// \return bool true if schema initialization succeeds
bool init();
/// \brief Add new column to the schema
/// \param[in] name name of the column.
/// \param[in] de_type data type of the column (TypeId).
/// \param[in] shape shape of the column.
/// \return bool true if the column is added successfully
bool add_column(std::string name, TypeId de_type, std::vector<int32_t> shape);
/// \brief Add new column to the schema
/// \param[in] name name of the column.
/// \param[in] de_type data type of the column (std::string).
/// \param[in] shape shape of the column.
/// \return bool true if the column is added successfully
bool add_column(std::string name, std::string de_type, std::vector<int32_t> shape);
/// \brief Get a JSON string of the schema
/// \return JSON string of the schema
std::string to_json();
/// \brief Get a JSON string of the schema
std::string to_string() { return to_json(); }
/// \brief Set a new value for dataset_type
inline void set_dataset_type(std::string dataset_type) { dataset_type_ = dataset_type; }
/// \brief Set a new value for num_rows
inline void set_num_rows(int32_t num_rows) { num_rows_ = num_rows; }
/// \brief Get the current num_rows
inline int32_t get_num_rows() { return num_rows_; }
private:
/// \brief Parse the columns and add them to columns_
/// \param[in] columns dataset column information decoded from the schema file;
///     supports both nlohmann::json::value_t::array and nlohmann::json::value_t::object.
/// \return bool true if the columns are parsed successfully
bool parse_column(nlohmann::json columns);
/// \brief Populate the schema from a parsed JSON object
/// \param[in] json_obj parsed JSON object of the schema file.
/// \return bool true if the JSON object is parsed successfully
bool from_json(nlohmann::json json_obj);
int32_t num_rows_;
std::string dataset_type_;
std::string schema_file_;
nlohmann::json columns_;
};
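A short sketch of SchemaObj on its own, assuming the std::string overload of add_column accepts dataset type names such as "float32"; the schema file path in the comment is a placeholder.

```cpp
// Sketch: describe columns via both add_column overloads, then serialize.
std::shared_ptr<SchemaObj> schema = Schema();            // or Schema("/path/to/schema.json")
schema->add_column("col_int", mindspore::TypeId::kNumberTypeInt32, {1});
schema->add_column("col_img", "float32", {32, 32, 3});   // type given by name (assumed valid)
schema->set_num_rows(8);
std::string schema_json = schema->to_json();             // JSON view, e.g. for logging
```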
/* ####################################### Derived Dataset classes ################################# */
// DERIVED DATASET CLASSES FOR LEAF-NODE DATASETS
@@ -517,6 +601,53 @@ class MnistDataset : public Dataset {
std::shared_ptr<SamplerObj> sampler_;
};
class RandomDataset : public Dataset {
public:
// Some constants to provide limits to random generation.
static constexpr int32_t kMaxNumColumns = 4;
static constexpr int32_t kMaxRank = 4;
static constexpr int32_t kMaxDimValue = 32;
/// \brief Constructor
RandomDataset(const int32_t &total_rows, std::shared_ptr<SchemaObj> schema, std::vector<std::string> columns_list,
std::shared_ptr<SamplerObj> sampler)
: total_rows_(total_rows),
schema_path_(""),
schema_(std::move(schema)),
columns_list_(columns_list),
sampler_(std::move(sampler)) {}
/// \brief Constructor
RandomDataset(const int32_t &total_rows, std::string schema_path, std::vector<std::string> columns_list,
std::shared_ptr<SamplerObj> sampler)
: total_rows_(total_rows), schema_path_(schema_path), columns_list_(columns_list), sampler_(std::move(sampler)) {}
/// \brief Destructor
~RandomDataset() = default;
/// \brief a base class override function to create the required runtime dataset op objects for this class
/// \return The list of shared pointers to the newly created DatasetOps
std::vector<std::shared_ptr<DatasetOp>> Build() override;
/// \brief Parameters validation
/// \return bool true if all the params are valid
bool ValidateParams() override;
private:
/// \brief A quick inline for producing a random number between (and including) min/max
/// \param[in] min minimum number that can be generated.
/// \param[in] max maximum number that can be generated.
/// \return The generated random number
int32_t GenRandomInt(int32_t min, int32_t max);
int32_t total_rows_;
std::string schema_path_;
std::shared_ptr<SchemaObj> schema_;
std::vector<std::string> columns_list_;
std::shared_ptr<SamplerObj> sampler_;
std::mt19937 rand_gen_;
};
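The body of GenRandomInt is not part of this excerpt; a plausible implementation, assuming rand_gen_ is meant to back a uniform distribution over [min, max], would be:

```cpp
// Plausible sketch only; enclosing namespaces (mindspore::dataset::api) omitted.
int32_t RandomDataset::GenRandomInt(int32_t min, int32_t max) {
  std::uniform_int_distribution<int32_t> uni_dist(min, max);  // inclusive of both bounds
  return uni_dist(rand_gen_);
}
```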
/// \class TextFileDataset
/// \brief A Dataset derived class to represent TextFile dataset
class TextFileDataset : public Dataset {

@@ -0,0 +1,88 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
#include "minddata/dataset/core/data_type.h"
#include "mindspore/core/ir/dtype/type_id.h"
namespace mindspore {
namespace dataset {
inline dataset::DataType MSTypeToDEType(TypeId data_type) {
switch (data_type) {
case kNumberTypeBool:
return dataset::DataType(dataset::DataType::DE_BOOL);
case kNumberTypeInt8:
return dataset::DataType(dataset::DataType::DE_INT8);
case kNumberTypeUInt8:
return dataset::DataType(dataset::DataType::DE_UINT8);
case kNumberTypeInt16:
return dataset::DataType(dataset::DataType::DE_INT16);
case kNumberTypeUInt16:
return dataset::DataType(dataset::DataType::DE_UINT16);
case kNumberTypeInt32:
return dataset::DataType(dataset::DataType::DE_INT32);
case kNumberTypeUInt32:
return dataset::DataType(dataset::DataType::DE_UINT32);
case kNumberTypeInt64:
return dataset::DataType(dataset::DataType::DE_INT64);
case kNumberTypeUInt64:
return dataset::DataType(dataset::DataType::DE_UINT64);
case kNumberTypeFloat16:
return dataset::DataType(dataset::DataType::DE_FLOAT16);
case kNumberTypeFloat32:
return dataset::DataType(dataset::DataType::DE_FLOAT32);
case kNumberTypeFloat64:
return dataset::DataType(dataset::DataType::DE_FLOAT64);
default:
return dataset::DataType(dataset::DataType::DE_UNKNOWN);
}
}
inline TypeId DETypeToMSType(dataset::DataType data_type) {
switch (data_type.value()) {
case dataset::DataType::DE_BOOL:
return mindspore::TypeId::kNumberTypeBool;
case dataset::DataType::DE_INT8:
return mindspore::TypeId::kNumberTypeInt8;
case dataset::DataType::DE_UINT8:
return mindspore::TypeId::kNumberTypeUInt8;
case dataset::DataType::DE_INT16:
return mindspore::TypeId::kNumberTypeInt16;
case dataset::DataType::DE_UINT16:
return mindspore::TypeId::kNumberTypeUInt16;
case dataset::DataType::DE_INT32:
return mindspore::TypeId::kNumberTypeInt32;
case dataset::DataType::DE_UINT32:
return mindspore::TypeId::kNumberTypeUInt32;
case dataset::DataType::DE_INT64:
return mindspore::TypeId::kNumberTypeInt64;
case dataset::DataType::DE_UINT64:
return mindspore::TypeId::kNumberTypeUInt64;
case dataset::DataType::DE_FLOAT16:
return mindspore::TypeId::kNumberTypeFloat16;
case dataset::DataType::DE_FLOAT32:
return mindspore::TypeId::kNumberTypeFloat32;
case dataset::DataType::DE_FLOAT64:
return mindspore::TypeId::kNumberTypeFloat64;
default:
return kTypeUnknown;
}
}
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
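For reference, a minimal round trip through the relocated inline helpers; this snippet is illustrative only and not part of the commit.

```cpp
// Illustrative only: convert a TypeId to a dataset::DataType and back.
#include <cassert>

#include "minddata/dataset/include/type_id.h"

int main() {
  namespace ds = mindspore::dataset;
  // TypeId -> dataset::DataType
  ds::DataType de_type = ds::MSTypeToDEType(mindspore::kNumberTypeFloat32);
  assert(de_type.value() == ds::DataType::DE_FLOAT32);
  // dataset::DataType -> TypeId round-trips to the original value
  assert(ds::DETypeToMSType(de_type) == mindspore::TypeId::kNumberTypeFloat32);
  return 0;
}
```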

@@ -100,6 +100,7 @@ SET(DE_UT_SRCS
c_api_dataset_clue_test.cc
c_api_dataset_coco_test.cc
c_api_dataset_filetext_test.cc
c_api_dataset_randomdata_test.cc
c_api_dataset_voc_test.cc
c_api_datasets_test.cc
c_api_dataset_iterator_test.cc
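The list now builds c_api_dataset_randomdata_test.cc, whose contents are not shown in this excerpt. A hedged sketch of what such a test could look like, following the existing c_api_* test conventions (the MindDataTestPipeline fixture and UT::DatasetOpTesting base are assumptions, and the assertions are illustrative):

```cpp
// Hedged sketch only -- not the committed test file.
#include <memory>
#include <string>
#include <unordered_map>

#include "common/common.h"
#include "minddata/dataset/include/datasets.h"

using namespace mindspore::dataset::api;
using mindspore::dataset::Tensor;

class MindDataTestPipeline : public UT::DatasetOpTesting {};

TEST_F(MindDataTestPipeline, TestRandomDatasetBasic) {
  // Two-column schema, 10 rows of random data, default sampler.
  std::shared_ptr<SchemaObj> schema = Schema();
  schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {2});
  schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1});
  std::shared_ptr<RandomDataset> ds = RandomData(10, schema);
  EXPECT_NE(ds, nullptr);

  // Iterate once through the pipeline and count the rows produced.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);
  uint64_t count = 0;
  while (!row.empty()) {
    count++;
    iter->GetNextRow(&row);
  }
  EXPECT_EQ(count, 10);
  iter->Stop();
}
```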

File diff suppressed because it is too large