add randomdataset and schema

pull/4422/head
xiefangqi 5 years ago
parent 2cc6230f81
commit b91e56375e

File diff suppressed because it is too large

@@ -15,6 +15,7 @@
*/
#include "minddata/dataset/include/de_tensor.h"
#include "minddata/dataset/include/type_id.h"
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/core/data_type.h"
#include "mindspore/core/ir/dtype/type_id.h"
@@ -23,68 +24,6 @@
namespace mindspore {
namespace tensor {
dataset::DataType MSTypeToDEType(TypeId data_type) {
switch (data_type) {
case kNumberTypeBool:
return dataset::DataType(dataset::DataType::DE_BOOL);
case kNumberTypeInt8:
return dataset::DataType(dataset::DataType::DE_INT8);
case kNumberTypeUInt8:
return dataset::DataType(dataset::DataType::DE_UINT8);
case kNumberTypeInt16:
return dataset::DataType(dataset::DataType::DE_INT16);
case kNumberTypeUInt16:
return dataset::DataType(dataset::DataType::DE_UINT16);
case kNumberTypeInt32:
return dataset::DataType(dataset::DataType::DE_INT32);
case kNumberTypeUInt32:
return dataset::DataType(dataset::DataType::DE_UINT32);
case kNumberTypeInt64:
return dataset::DataType(dataset::DataType::DE_INT64);
case kNumberTypeUInt64:
return dataset::DataType(dataset::DataType::DE_UINT64);
case kNumberTypeFloat16:
return dataset::DataType(dataset::DataType::DE_FLOAT16);
case kNumberTypeFloat32:
return dataset::DataType(dataset::DataType::DE_FLOAT32);
case kNumberTypeFloat64:
return dataset::DataType(dataset::DataType::DE_FLOAT64);
default:
return dataset::DataType(dataset::DataType::DE_UNKNOWN);
}
}
TypeId DETypeToMSType(dataset::DataType data_type) {
switch (data_type.value()) {
case dataset::DataType::DE_BOOL:
return mindspore::TypeId::kNumberTypeBool;
case dataset::DataType::DE_INT8:
return mindspore::TypeId::kNumberTypeInt8;
case dataset::DataType::DE_UINT8:
return mindspore::TypeId::kNumberTypeUInt8;
case dataset::DataType::DE_INT16:
return mindspore::TypeId::kNumberTypeInt16;
case dataset::DataType::DE_UINT16:
return mindspore::TypeId::kNumberTypeUInt16;
case dataset::DataType::DE_INT32:
return mindspore::TypeId::kNumberTypeInt32;
case dataset::DataType::DE_UINT32:
return mindspore::TypeId::kNumberTypeUInt32;
case dataset::DataType::DE_INT64:
return mindspore::TypeId::kNumberTypeInt64;
case dataset::DataType::DE_UINT64:
return mindspore::TypeId::kNumberTypeUInt64;
case dataset::DataType::DE_FLOAT16:
return mindspore::TypeId::kNumberTypeFloat16;
case dataset::DataType::DE_FLOAT32:
return mindspore::TypeId::kNumberTypeFloat32;
case dataset::DataType::DE_FLOAT64:
return mindspore::TypeId::kNumberTypeFloat64;
default:
return kTypeUnknown;
}
}
MSTensor *DETensor::CreateTensor(TypeId data_type, const std::vector<int> &shape) {
return new DETensor(data_type, shape);
}
@@ -100,7 +39,7 @@ DETensor::DETensor(TypeId data_type, const std::vector<int> &shape) {
t_shape.reserve(shape.size());
std::transform(shape.begin(), shape.end(), std::back_inserter(t_shape),
[](int s) -> dataset::dsize_t { return static_cast<dataset::dsize_t>(s); });
dataset::Tensor::CreateEmpty(dataset::TensorShape(t_shape), MSTypeToDEType(data_type), &this->tensor_impl_);
dataset::Tensor::CreateEmpty(dataset::TensorShape(t_shape), dataset::MSTypeToDEType(data_type), &this->tensor_impl_);
}
DETensor::DETensor(std::shared_ptr<dataset::Tensor> tensor_ptr) { this->tensor_impl_ = std::move(tensor_ptr); }
@@ -120,14 +59,14 @@ std::shared_ptr<dataset::Tensor> DETensor::tensor() const {
TypeId DETensor::data_type() const {
MS_ASSERT(this->tensor_impl_ != nullptr);
return DETypeToMSType(this->tensor_impl_->type());
return dataset::DETypeToMSType(this->tensor_impl_->type());
}
TypeId DETensor::set_data_type(TypeId data_type) {
MS_ASSERT(this->tensor_impl_ != nullptr);
if (data_type != this->data_type()) {
std::shared_ptr<dataset::Tensor> temp;
dataset::Tensor::CreateFromMemory(this->tensor_impl_->shape(), MSTypeToDEType(data_type),
dataset::Tensor::CreateFromMemory(this->tensor_impl_->shape(), dataset::MSTypeToDEType(data_type),
this->tensor_impl_->GetBuffer(), &temp);
this->tensor_impl_ = temp;
}

@@ -50,13 +50,6 @@ Status RandomDataOp::Builder::Build(std::shared_ptr<RandomDataOp> *out_op) {
std::make_shared<RandomDataOp>(builder_num_workers_, builder_op_connector_size_, builder_rows_per_buffer_,
builder_total_rows_, std::move(builder_data_schema_), std::move(builder_sampler_));
// If the user did not provide a schema, then we will ask the op to generate a pseudo-random
// schema.
// See details of generateSchema function to learn what type of schema it will create.
if ((*out_op)->data_schema_ == nullptr) {
RETURN_IF_NOT_OK((*out_op)->GenerateSchema());
}
return Status::OK();
}
@@ -85,6 +78,12 @@ RandomDataOp::RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64
if (total_rows_ == 0) {
total_rows_ = GenRandomInt(1, kMaxTotalRows);
}
// If the user did not provide a schema, then we will ask the op to generate a pseudo-random
// schema.
// See details of the GenerateSchema function to learn what type of schema it will create.
if (data_schema_ == nullptr) {
GenerateSchema();
}
// Everyone is already out from the sync area.
all_out_.Set();
}
@@ -106,11 +105,7 @@ void RandomDataOp::Print(std::ostream &out, bool show_all) const {
}
// Helper function to produce a default/random schema if one didn't exist
Status RandomDataOp::GenerateSchema() {
if (data_schema_ != nullptr) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Generating a schema but one already exists!");
}
void RandomDataOp::GenerateSchema() {
// To randomly create a schema, we need to choose:
// a) how many columns
// b) the type of each column
@@ -144,8 +139,6 @@ Status RandomDataOp::GenerateSchema() {
data_schema_->AddColumn(*newCol);
}
return Status::OK();
}
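A minimal standalone sketch of the pseudo-random schema idea that GenerateSchema implements (choose a column count, then a random type and a random shape per column). `FakeColumn` and `GenerateRandomSchema` are hypothetical illustration-only names, not MindSpore API, and the numeric limits merely mirror the kMaxNumColumns/kMaxRank/kMaxDimValue constants used by the op:

```cpp
// Sketch only: mirrors the approach described in GenerateSchema, not the exact implementation.
#include <cstdint>
#include <random>
#include <string>
#include <vector>

struct FakeColumn {
  std::string name;
  int type_index;               // index into some fixed list of supported DE types
  std::vector<int32_t> shape;   // random rank, random dimension values
};

std::vector<FakeColumn> GenerateRandomSchema(std::mt19937 &rng) {
  std::uniform_int_distribution<int32_t> num_cols_dist(1, 4);  // kMaxNumColumns
  std::uniform_int_distribution<int32_t> rank_dist(1, 4);      // kMaxRank
  std::uniform_int_distribution<int32_t> dim_dist(1, 32);      // kMaxDimValue
  std::uniform_int_distribution<int32_t> type_dist(0, 11);     // 12 numeric DE types

  std::vector<FakeColumn> columns;
  const int32_t num_cols = num_cols_dist(rng);
  for (int32_t i = 0; i < num_cols; ++i) {
    FakeColumn col;
    col.name = "c" + std::to_string(i);
    col.type_index = type_dist(rng);
    const int32_t rank = rank_dist(rng);
    for (int32_t d = 0; d < rank; ++d) col.shape.push_back(dim_dist(rng));
    columns.push_back(col);
  }
  return columns;
}
```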
// Class functor operator () override.

@@ -213,9 +213,8 @@ class RandomDataOp : public ParallelOp {
/**
* Helper function to produce a default/random schema if one didn't exist
@return Status - The error code return
*/
Status GenerateSchema();
*/
void GenerateSchema();
/**
* Performs a synchronization between workers at the end of an epoch

@@ -24,9 +24,11 @@
#include <utility>
#include <string>
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/engine/data_schema.h"
#include "minddata/dataset/include/tensor.h"
#include "minddata/dataset/include/iterator.h"
#include "minddata/dataset/include/samplers.h"
#include "minddata/dataset/include/type_id.h"
namespace mindspore {
namespace dataset {
@@ -40,6 +42,7 @@ class TensorShape;
namespace api {
class TensorOperation;
class SchemaObj;
class SamplerObj;
// Datasets classes (in alphabetical order)
class CelebADataset;
@@ -49,6 +52,7 @@ class CLUEDataset;
class CocoDataset;
class ImageFolderDataset;
class MnistDataset;
class RandomDataset;
class TextFileDataset;
class VOCDataset;
// Dataset Op classes (in alphabetical order)
@@ -63,6 +67,11 @@ class SkipDataset;
class TakeDataset;
class ZipDataset;
/// \brief Function to create a SchemaObj
/// \param[in] schema_file Path of schema file
/// \return Shared pointer to the current schema
std::shared_ptr<SchemaObj> Schema(const std::string &schema_file = "");
/// \brief Function to create a CelebADataset
/// \notes The generated dataset has two columns ['image', 'attr'].
// The type of the image tensor is uint8. The attr tensor is uint32 and one hot type.
@@ -167,6 +176,21 @@ std::shared_ptr<MnistDataset> Mnist(std::string dataset_dir, std::shared_ptr<Sam
std::shared_ptr<ConcatDataset> operator+(const std::shared_ptr<Dataset> &datasets1,
const std::shared_ptr<Dataset> &datasets2);
/// \brief Function to create a RandomDataset
/// \param[in] total_rows Number of rows for the dataset to generate (default=0, number of rows is random)
/// \param[in] schema SchemaObj (or path to a schema file) defining the column names, data types and shapes
/// \param[in] columns_list List of columns to be read (default={}, read all columns)
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, a `RandomSampler`
///     will be used to randomly iterate the entire dataset
/// \return Shared pointer to the current Dataset
template <typename T = std::shared_ptr<SchemaObj>>
std::shared_ptr<RandomDataset> RandomData(const int32_t &total_rows = 0, T schema = nullptr,
std::vector<std::string> columns_list = {},
std::shared_ptr<SamplerObj> sampler = nullptr) {
auto ds = std::make_shared<RandomDataset>(total_rows, schema, std::move(columns_list), std::move(sampler));
return ds->ValidateParams() ? ds : nullptr;
}
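A sketch of the intended call pattern, using only the API declared above; the column names, shapes and row count are illustrative, and error handling is reduced to a null check.

```cpp
// Sketch only (namespace mindspore::dataset::api assumed to be in scope).
std::shared_ptr<SchemaObj> schema = Schema();  // no schema file: start from an empty schema
schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {28, 28, 1});
schema->add_column("label", mindspore::TypeId::kNumberTypeUInt32, {1});

// 10 rows, all columns, default RandomSampler (sampler == nullptr).
std::shared_ptr<RandomDataset> ds = RandomData(10, schema);
if (ds == nullptr) {
  // ValidateParams() rejected the inputs.
}
```

The returned node behaves like any other Dataset, so the usual pipeline calls (e.g. CreateIterator()) apply; they fall outside the lines shown here.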
/// \brief Function to create a TextFileDataset
/// \notes The generated dataset has one column ['text']
/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list
@@ -335,6 +359,66 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
int32_t worker_connector_size_;
};
class SchemaObj {
public:
/// \brief Constructor
explicit SchemaObj(const std::string &schema_file = "");
/// \brief Destructor
~SchemaObj() = default;
/// \brief SchemaObj init function
/// \return bool true if schema initialization succeeds
bool init();
/// \brief Add new column to the schema
/// \param[in] name name of the column.
/// \param[in] de_type data type of the column (TypeId).
/// \param[in] shape shape of the column.
/// \return bool true if the column is added successfully
bool add_column(std::string name, TypeId de_type, std::vector<int32_t> shape);
/// \brief Add new column to the schema
/// \param[in] name name of the column.
/// \param[in] de_type data type of the column (std::string).
/// \param[in] shape shape of the column.
/// \return bool true if the column is added successfully
bool add_column(std::string name, std::string de_type, std::vector<int32_t> shape);
/// \brief Get a JSON string of the schema
/// \return JSON string of the schema
std::string to_json();
/// \brief Get a JSON string of the schema
std::string to_string() { return to_json(); }
/// \brief Set a new value for dataset_type
inline void set_dataset_type(std::string dataset_type) { dataset_type_ = dataset_type; }
/// \brief Set a new value for num_rows
inline void set_num_rows(int32_t num_rows) { num_rows_ = num_rows; }
/// \brief Get the current num_rows
inline int32_t get_num_rows() { return num_rows_; }
private:
/// \brief Parse the columns and add them to columns_
/// \param[in] columns dataset column information decoded from the schema file;
///     supports both nlohmann::json::value_t::array and nlohmann::json::value_t::object.
/// \return bool true if the columns are parsed successfully
bool parse_column(nlohmann::json columns);
/// \brief Populate the schema from a parsed JSON object
/// \param[in] json_obj parsed JSON object of the schema file.
/// \return bool true if the JSON object is parsed successfully
bool from_json(nlohmann::json json_obj);
int32_t num_rows_;
std::string dataset_type_;
std::string schema_file_;
nlohmann::json columns_;
};
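A short sketch of SchemaObj on its own, assuming the std::string overload of add_column accepts dataset type names such as "float32"; the schema file path in the comment is a placeholder.

```cpp
// Sketch: describe columns via both add_column overloads, then serialize.
std::shared_ptr<SchemaObj> schema = Schema();            // or Schema("/path/to/schema.json")
schema->add_column("col_int", mindspore::TypeId::kNumberTypeInt32, {1});
schema->add_column("col_img", "float32", {32, 32, 3});   // type given by name (assumed valid)
schema->set_num_rows(8);
std::string schema_json = schema->to_json();             // JSON view, e.g. for logging
```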
/* ####################################### Derived Dataset classes ################################# */
// DERIVED DATASET CLASSES FOR LEAF-NODE DATASETS
@@ -517,6 +601,53 @@ class MnistDataset : public Dataset {
std::shared_ptr<SamplerObj> sampler_;
};
class RandomDataset : public Dataset {
public:
// Some constants to provide limits to random generation.
static constexpr int32_t kMaxNumColumns = 4;
static constexpr int32_t kMaxRank = 4;
static constexpr int32_t kMaxDimValue = 32;
/// \brief Constructor
RandomDataset(const int32_t &total_rows, std::shared_ptr<SchemaObj> schema, std::vector<std::string> columns_list,
std::shared_ptr<SamplerObj> sampler)
: total_rows_(total_rows),
schema_path_(""),
schema_(std::move(schema)),
columns_list_(columns_list),
sampler_(std::move(sampler)) {}
/// \brief Constructor
RandomDataset(const int32_t &total_rows, std::string schema_path, std::vector<std::string> columns_list,
std::shared_ptr<SamplerObj> sampler)
: total_rows_(total_rows), schema_path_(schema_path), columns_list_(columns_list), sampler_(std::move(sampler)) {}
/// \brief Destructor
~RandomDataset() = default;
/// \brief a base class override function to create the required runtime dataset op objects for this class
/// \return The list of shared pointers to the newly created DatasetOps
std::vector<std::shared_ptr<DatasetOp>> Build() override;
/// \brief Parameters validation
/// \return bool true if all the params are valid
bool ValidateParams() override;
private:
/// \brief A quick inline for producing a random number between (and including) min/max
/// \param[in] min minimum number that can be generated.
/// \param[in] max maximum number that can be generated.
/// \return The generated random number
int32_t GenRandomInt(int32_t min, int32_t max);
int32_t total_rows_;
std::string schema_path_;
std::shared_ptr<SchemaObj> schema_;
std::vector<std::string> columns_list_;
std::shared_ptr<SamplerObj> sampler_;
std::mt19937 rand_gen_;
};
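The body of GenRandomInt is not part of this excerpt; a plausible implementation, assuming rand_gen_ is meant to back a uniform distribution over [min, max], would be:

```cpp
// Plausible sketch only; enclosing namespaces (mindspore::dataset::api) omitted.
int32_t RandomDataset::GenRandomInt(int32_t min, int32_t max) {
  std::uniform_int_distribution<int32_t> uni_dist(min, max);  // inclusive of both bounds
  return uni_dist(rand_gen_);
}
```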
/// \class TextFileDataset
/// \brief A Dataset derived class to represent TextFile dataset
class TextFileDataset : public Dataset {

@@ -0,0 +1,88 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
#include "minddata/dataset/core/data_type.h"
#include "mindspore/core/ir/dtype/type_id.h"
namespace mindspore {
namespace dataset {
inline dataset::DataType MSTypeToDEType(TypeId data_type) {
switch (data_type) {
case kNumberTypeBool:
return dataset::DataType(dataset::DataType::DE_BOOL);
case kNumberTypeInt8:
return dataset::DataType(dataset::DataType::DE_INT8);
case kNumberTypeUInt8:
return dataset::DataType(dataset::DataType::DE_UINT8);
case kNumberTypeInt16:
return dataset::DataType(dataset::DataType::DE_INT16);
case kNumberTypeUInt16:
return dataset::DataType(dataset::DataType::DE_UINT16);
case kNumberTypeInt32:
return dataset::DataType(dataset::DataType::DE_INT32);
case kNumberTypeUInt32:
return dataset::DataType(dataset::DataType::DE_UINT32);
case kNumberTypeInt64:
return dataset::DataType(dataset::DataType::DE_INT64);
case kNumberTypeUInt64:
return dataset::DataType(dataset::DataType::DE_UINT64);
case kNumberTypeFloat16:
return dataset::DataType(dataset::DataType::DE_FLOAT16);
case kNumberTypeFloat32:
return dataset::DataType(dataset::DataType::DE_FLOAT32);
case kNumberTypeFloat64:
return dataset::DataType(dataset::DataType::DE_FLOAT64);
default:
return dataset::DataType(dataset::DataType::DE_UNKNOWN);
}
}
inline TypeId DETypeToMSType(dataset::DataType data_type) {
switch (data_type.value()) {
case dataset::DataType::DE_BOOL:
return mindspore::TypeId::kNumberTypeBool;
case dataset::DataType::DE_INT8:
return mindspore::TypeId::kNumberTypeInt8;
case dataset::DataType::DE_UINT8:
return mindspore::TypeId::kNumberTypeUInt8;
case dataset::DataType::DE_INT16:
return mindspore::TypeId::kNumberTypeInt16;
case dataset::DataType::DE_UINT16:
return mindspore::TypeId::kNumberTypeUInt16;
case dataset::DataType::DE_INT32:
return mindspore::TypeId::kNumberTypeInt32;
case dataset::DataType::DE_UINT32:
return mindspore::TypeId::kNumberTypeUInt32;
case dataset::DataType::DE_INT64:
return mindspore::TypeId::kNumberTypeInt64;
case dataset::DataType::DE_UINT64:
return mindspore::TypeId::kNumberTypeUInt64;
case dataset::DataType::DE_FLOAT16:
return mindspore::TypeId::kNumberTypeFloat16;
case dataset::DataType::DE_FLOAT32:
return mindspore::TypeId::kNumberTypeFloat32;
case dataset::DataType::DE_FLOAT64:
return mindspore::TypeId::kNumberTypeFloat64;
default:
return kTypeUnknown;
}
}
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
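For reference, a minimal round trip through the relocated inline helpers; this snippet is illustrative only and not part of the commit.

```cpp
// Illustrative only: convert a TypeId to a dataset::DataType and back.
#include <cassert>

#include "minddata/dataset/include/type_id.h"

int main() {
  namespace ds = mindspore::dataset;
  // TypeId -> dataset::DataType
  ds::DataType de_type = ds::MSTypeToDEType(mindspore::kNumberTypeFloat32);
  assert(de_type.value() == ds::DataType::DE_FLOAT32);
  // dataset::DataType -> TypeId round-trips to the original value
  assert(ds::DETypeToMSType(de_type) == mindspore::TypeId::kNumberTypeFloat32);
  return 0;
}
```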

@@ -100,6 +100,7 @@ SET(DE_UT_SRCS
c_api_dataset_clue_test.cc
c_api_dataset_coco_test.cc
c_api_dataset_filetext_test.cc
c_api_dataset_randomdata_test.cc
c_api_dataset_voc_test.cc
c_api_datasets_test.cc
c_api_dataset_iterator_test.cc
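The list now builds c_api_dataset_randomdata_test.cc, whose contents are not shown in this excerpt. A hedged sketch of what such a test could look like, following the existing c_api_* test conventions (the MindDataTestPipeline fixture and UT::DatasetOpTesting base are assumptions, and the assertions are illustrative):

```cpp
// Hedged sketch only -- not the committed test file.
#include <memory>
#include <string>
#include <unordered_map>

#include "common/common.h"
#include "minddata/dataset/include/datasets.h"

using namespace mindspore::dataset::api;
using mindspore::dataset::Tensor;

class MindDataTestPipeline : public UT::DatasetOpTesting {};

TEST_F(MindDataTestPipeline, TestRandomDatasetBasic) {
  // Two-column schema, 10 rows of random data, default sampler.
  std::shared_ptr<SchemaObj> schema = Schema();
  schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {2});
  schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1});
  std::shared_ptr<RandomDataset> ds = RandomData(10, schema);
  EXPECT_NE(ds, nullptr);

  // Iterate once through the pipeline and count the rows produced.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);
  uint64_t count = 0;
  while (!row.empty()) {
    count++;
    iter->GetNextRow(&row);
  }
  EXPECT_EQ(count, 10);
  iter->Stop();
}
```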

File diff suppressed because it is too large