- Add DE_STRING

- Replace switch-case by indexing

- Add test case
- Add constructors
- Add getItem string

- Fix bugs
- Add more tests

- Tensor iterator
- asNumpy
- TextFileDataset

- Tensor(Numpy)

- Super > 2D
- Add more test cases for GeneratorDataset

- Change StartAddr to GetBuffer and GetMutableBuffer

- Raise an error if batch is used with strings

Clean-up work
pull/1192/head
hesham 5 years ago
parent a3110549ea
commit e8ca243364

@ -237,6 +237,11 @@ void bindTensor(py::module *m) {
.def("type", &Tensor::type)
.def("as_array", [](py::object &t) {
auto &tensor = py::cast<Tensor &>(t);
if (tensor.type() == DataType::DE_STRING) {
  // String tensors take a dedicated conversion path instead of the buffer_info path below.
  py::array res;
  // Surface a failed conversion to the caller, the same way the GetBufferInfo call below is
  // wrapped, rather than returning a possibly unfilled array.
  // NOTE(review): assumes GetDataAsNumpyStrings returns Status — confirm against tensor.h.
  THROW_IF_ERROR(tensor.GetDataAsNumpyStrings(&res));
  return res;
}
py::buffer_info info;
THROW_IF_ERROR(Tensor::GetBufferInfo(tensor, &info));
return py::array(pybind11::dtype(info), info.shape, info.strides, info.ptr, t);

@ -24,15 +24,15 @@
namespace mindspore {
namespace dataset {
// Constructs a CVTensor from a shape and element type by delegating to the Tensor(shape, type)
// base constructor, then calls MatInit with the tensor's buffer, shape_ and type_ to fill mat_.
// The Status returned by MatInit is deliberately discarded via the (void) cast.
// NOTE(review): this listing looks like a diff with its +/- markers stripped; the StartAddr() call is
// presumably the removed line and the GetMutableBuffer() call its replacement — confirm.
CVTensor::CVTensor(const TensorShape &shape, const DataType &type) : Tensor(shape, type) {
(void)this->MatInit(StartAddr(), shape_, type_, &mat_);
(void)this->MatInit(GetMutableBuffer(), shape_, type_, &mat_);
}
// Constructs a CVTensor from a shape, element type and caller-supplied data pointer by delegating to
// the Tensor(shape, type, data) base constructor, then calls MatInit to fill mat_.
// The Status returned by MatInit is deliberately discarded via the (void) cast.
// NOTE(review): this listing looks like a diff with its +/- markers stripped; the StartAddr() call is
// presumably the removed line and the GetMutableBuffer() call its replacement — confirm.
CVTensor::CVTensor(const TensorShape &shape, const DataType &type, const uchar *data) : Tensor(shape, type, data) {
(void)this->MatInit(StartAddr(), shape_, type_, &mat_);
(void)this->MatInit(GetMutableBuffer(), shape_, type_, &mat_);
}
// Constructs a CVTensor from an existing Tensor: the base is initialized from std::move(*tensor), so
// the pointed-to Tensor is handed to the base constructor as an rvalue. MatInit is then called to
// fill mat_; its Status is deliberately discarded via the (void) cast.
// NOTE(review): tensor is dereferenced without a null check — callers must pass a non-null pointer.
// NOTE(review): this listing looks like a diff with its +/- markers stripped; the StartAddr() call is
// presumably the removed line and the GetMutableBuffer() call its replacement — confirm.
CVTensor::CVTensor(std::shared_ptr<Tensor> tensor) : Tensor(std::move(*tensor)) {
(void)this->MatInit(StartAddr(), shape_, type_, &mat_);
(void)this->MatInit(GetMutableBuffer(), shape_, type_, &mat_);
}
std::pair<std::array<int, 2>, int> CVTensor::IsValidImage(const TensorShape &shape, const DataType &type) {
@ -83,19 +83,19 @@ Status CVTensor::MatInit(uchar *data, const TensorShape &shape, const DataType &
// Reshapes the tensor through Tensor::Reshape and then re-runs MatInit so that mat_ is rebuilt from
// the updated shape_. Each step is wrapped in RETURN_IF_NOT_OK; Status::OK() is returned at the end.
// NOTE(review): this listing looks like a diff with its +/- markers stripped; the StartAddr() call is
// presumably the removed line and the GetMutableBuffer() call its replacement — confirm.
Status CVTensor::Reshape(const TensorShape &shape) {
RETURN_IF_NOT_OK(Tensor::Reshape(shape));
RETURN_IF_NOT_OK(this->MatInit(StartAddr(), shape_, type_, &mat_));
RETURN_IF_NOT_OK(this->MatInit(GetMutableBuffer(), shape_, type_, &mat_));
return Status::OK();
}
// Expands the tensor at the given axis through Tensor::ExpandDim and then re-runs MatInit so that
// mat_ is rebuilt from the updated shape_. Each step is wrapped in RETURN_IF_NOT_OK; Status::OK() is
// returned at the end.
// NOTE(review): this listing looks like a diff with its +/- markers stripped; the StartAddr() call is
// presumably the removed line and the GetMutableBuffer() call its replacement — confirm.
Status CVTensor::ExpandDim(const dsize_t &axis) {
RETURN_IF_NOT_OK(Tensor::ExpandDim(axis));
RETURN_IF_NOT_OK(this->MatInit(StartAddr(), shape_, type_, &mat_));
RETURN_IF_NOT_OK(this->MatInit(GetMutableBuffer(), shape_, type_, &mat_));
return Status::OK();
}
// Squeezes the tensor through Tensor::Squeeze and then re-runs MatInit so that mat_ is rebuilt from
// the updated shape_. The function returns void, so the Status from MatInit is discarded via (void).
// NOTE(review): this listing looks like a diff with its +/- markers stripped; the StartAddr() call is
// presumably the removed line and the GetMutableBuffer() call its replacement — confirm.
void CVTensor::Squeeze() {
Tensor::Squeeze();
(void)this->MatInit(StartAddr(), shape_, type_, &mat_);
(void)this->MatInit(GetMutableBuffer(), shape_, type_, &mat_);
}
} // namespace dataset
} // namespace mindspore

File diff suppressed because it is too large Load Diff

@ -16,18 +16,25 @@
#ifndef DATASET_CORE_DATA_TYPE_H_
#define DATASET_CORE_DATA_TYPE_H_
#include <opencv2/core/hal/interface.h>
#include <string>
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
#include "dataset/core/constants.h"
#include "dataset/core/pybind_support.h"
namespace py = pybind11;
namespace mindspore {
namespace dataset {
// Class that represents basic data types in DataEngine.
class DataType {
public:
enum Type : uint8_t {
DE_UNKNOWN = 0,
DE_BOOL,
DE_INT8,
DE_UINT8,
@ -40,20 +47,60 @@ class DataType {
DE_FLOAT16,
DE_FLOAT32,
DE_FLOAT64,
DE_UNKNOWN
DE_STRING,
NUM_OF_TYPES
};
// Per-type element sizes in bytes, one named constant per type (there is no float16 entry here).
// NOTE(review): in this diff listing these are presumably the removed declarations that the
// SIZE_IN_BYTES table supersedes — confirm against the commit.
static constexpr uint8_t DE_BOOL_SIZE = 1;
static constexpr uint8_t DE_UINT8_SIZE = 1;
static constexpr uint8_t DE_INT8_SIZE = 1;
static constexpr uint8_t DE_UINT16_SIZE = 2;
static constexpr uint8_t DE_INT16_SIZE = 2;
static constexpr uint8_t DE_UINT32_SIZE = 4;
static constexpr uint8_t DE_INT32_SIZE = 4;
static constexpr uint8_t DE_INT64_SIZE = 8;
static constexpr uint8_t DE_UINT64_SIZE = 8;
static constexpr uint8_t DE_FLOAT32_SIZE = 4;
static constexpr uint8_t DE_FLOAT64_SIZE = 8;
// Element size in bytes for each type, listed in the order named by the trailing comments.
// DE_UNKNOWN and DE_STRING have no fixed element size and are recorded as 0.
// NOTE(review): presumably indexed by the Type enumerator value, so the order must track the enum — confirm.
inline static constexpr uint8_t SIZE_IN_BYTES[] = {0, // DE_UNKNOWN
1, // DE_BOOL
1, // DE_INT8
1, // DE_UINT8
2, // DE_INT16
2, // DE_UINT16
4, // DE_INT32
4, // DE_UINT32
8, // DE_INT64
8, // DE_UINT64
2, // DE_FLOAT16
4, // DE_FLOAT32
8, // DE_FLOAT64
0}; // DE_STRING
// Human-readable name for each type, from "unknown" through "string" (14 entries).
// NOTE(review): presumably indexed by the Type enumerator value, so the order must track the enum — confirm.
inline static const char *TO_STRINGS[] = {"unknown", "bool", "int8", "uint8", "int16", "uint16", "int32",
"uint32", "int64", "uint64", "float16", "float32", "float64", "string"};
// Python-side type name for each type (14 entries). Differs from TO_STRINGS in three places:
// the unknown type is "object", float64 is "double", and the string type is "bytes".
// NOTE(review): presumably indexed by the Type enumerator value, so the order must track the enum — confirm.
inline static const char *PYBIND_TYPES[] = {"object", "bool", "int8", "uint8", "int16", "uint16", "int32",
"uint32", "int64", "uint64", "float16", "float32", "double", "bytes"};
// pybind11 format-descriptor string for each type, listed in the order named by the trailing comments.
// Most entries come from py::format_descriptor<T>::format(); float16 and string are spelled out as the
// literals "e" and "S", and the unknown type has an empty descriptor.
// NOTE(review): presumably indexed by the Type enumerator value, so the order must track the enum — confirm.
inline static const std::string PYBIND_FORMAT_DESCRIPTOR[] = {"", // DE_UNKNOWN
py::format_descriptor<bool>::format(), // DE_BOOL
py::format_descriptor<int8_t>::format(), // DE_INT8
py::format_descriptor<uint8_t>::format(), // DE_UINT8
py::format_descriptor<int16_t>::format(), // DE_INT16
py::format_descriptor<uint16_t>::format(), // DE_UINT16
py::format_descriptor<int32_t>::format(), // DE_INT32
py::format_descriptor<uint32_t>::format(), // DE_UINT32
py::format_descriptor<int64_t>::format(), // DE_INT64
py::format_descriptor<uint64_t>::format(), // DE_UINT64
"e", // DE_FLOAT16
py::format_descriptor<float>::format(), // DE_FLOAT32
py::format_descriptor<double>::format(), // DE_FLOAT64
"S"}; // DE_STRING
// OpenCV depth code for each type, listed in the order named by the trailing comments.
// Types mapped to kCVInvalidType (unknown, uint32, int64, uint64, string) have no OpenCV entry here;
// bool shares CV_8U with uint8.
// NOTE(review): presumably indexed by the Type enumerator value, so the order must track the enum — confirm.
inline static constexpr uint8_t CV_TYPES[] = {kCVInvalidType, // DE_UNKNOWN
CV_8U, // DE_BOOL
CV_8S, // DE_INT8
CV_8U, // DE_UINT8
CV_16S, // DE_INT16
CV_16U, // DE_UINT16
CV_32S, // DE_INT32
kCVInvalidType, // DE_UINT32
kCVInvalidType, // DE_INT64
kCVInvalidType, // DE_UINT64
CV_16F, // DE_FLOAT16
CV_32F, // DE_FLOAT32
CV_64F, // DE_FLOAT64
kCVInvalidType}; // DE_STRING
// No-arg constructor: creates a DataType of unknown type (DE_UNKNOWN)
DataType() : type_(DE_UNKNOWN) {}
@ -160,6 +207,8 @@ class DataType {
// True only when the stored type is DE_BOOL.
bool IsBool() const { return type_ == DataType::DE_BOOL; }
// True for every stored type other than DE_STRING. Note that this makes DE_UNKNOWN and DE_BOOL
// count as "numeric" as well; the test is purely "not a string".
bool IsNumeric() const { return type_ != DataType::DE_STRING; }
// Returns the stored Type enumerator.
Type value() const { return type_; }
private:
@ -226,6 +275,11 @@ inline bool DataType::IsCompatible<uint8_t>() const {
return type_ == DataType::DE_UINT8;
}
// Specialization for std::string_view: IsCompatible<std::string_view>() holds exactly when the
// stored type is DE_STRING.
template <>
inline bool DataType::IsCompatible<std::string_view>() const {
return type_ == DataType::DE_STRING;
}
template <>
inline bool DataType::IsLooselyCompatible<bool>() const {
return type_ == DataType::DE_BOOL;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -215,5 +215,17 @@ TensorShape TensorShape::Squeeze() const {
}
return TensorShape(new_shape);
}
std::vector<dsize_t> TensorShape::Strides() {
std::vector<dsize_t> strides(Rank());
dsize_t count = NumOfElements();
for (dsize_t i = 0; i < Rank(); i++) {
if (raw_shape_[i] != 0)
count /= raw_shape_[i];
else
count = 0;
strides[i] = count;
}
return strides;
}
} // namespace dataset
} // namespace mindspore

@ -156,6 +156,8 @@ class TensorShape {
TensorShape Squeeze() const;
std::vector<dsize_t> Strides();
private:
// True if known and valid shape, false otherwise
bool known_;

@ -74,6 +74,10 @@ Status BatchOp::operator()() {
std::unique_ptr<TensorQTable> table = std::make_unique<TensorQTable>();
child_iterator_ = std::make_unique<ChildIterator>(this, 0, 0);
RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
for (const auto &t : new_row) {
CHECK_FAIL_RETURN_UNEXPECTED(t->type().IsNumeric(),
"[Batch ERROR] Batch does not support Tensor of type string yet.");
}
RETURN_IF_NOT_OK(DatasetOp::AssignColMapFromChild()); // must come after the first fetch above
int32_t cur_batch_size = 0;
RETURN_IF_NOT_OK(GetBatchSize(&cur_batch_size, CBatchInfo(0, 0, 0)));
@ -445,8 +449,8 @@ Status BatchOp::PadHelper(std::shared_ptr<Tensor> src, std::shared_ptr<Tensor> d
src_flat_ind += src_s[i] * cur_ind[i];
dst_flat_ind += dst_s[i] * cur_ind[i];
}
unsigned char *src_addr = src->StartAddr() + src_flat_ind * type_size;
unsigned char *dst_addr = dst->StartAddr() + dst_flat_ind * type_size;
unsigned char *src_addr = src->GetMutableBuffer() + src_flat_ind * type_size;
unsigned char *dst_addr = dst->GetMutableBuffer() + dst_flat_ind * type_size;
CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s(dst_addr, len, src_addr, len) == 0, "memcpy error");
} else { // not the last dimension, keep doing recursion
dsize_t min_ind = std::min(dst->shape()[cur_dim], src->shape()[cur_dim]);

@ -85,6 +85,13 @@ Status DeviceQueueOp::operator()() {
// Checks whether the buffer meets the conditions to be sent to TDT.
// Only the first row is inspected: every tensor in it must report a numeric (non-string) type.
// Returns Status::OK() for a buffer with no rows or when every tensor passes; otherwise returns the
// error from fetching the row or from the failed type check.
Status DeviceQueueOp::CheckExceptions(const std::unique_ptr<DataBuffer> &buffer) const {
  if (buffer->NumRows() != 0) {
    TensorRow row;
    // Propagate a failed fetch instead of discarding it: if GetRow fails, `row` stays empty and the
    // loop below would accept the buffer without having checked anything.
    RETURN_IF_NOT_OK(buffer->GetRow(0, &row));
    for (const auto &item : row) {
      CHECK_FAIL_RETURN_UNEXPECTED(item->type().IsNumeric(), "Cannot send tensor of string type to device.");
    }
  }
  return Status::OK();
}
@ -207,7 +214,7 @@ Status DeviceQueueOp::MallocForGPUData(std::vector<device::DataItemGpu> *items,
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "memory malloc failed.");
}
(void)memset_s(sub_item.data_ptr_, sub_item.data_len_, 0, sub_item.data_len_);
unsigned char *column_data = curr_row[i]->StartAddr();
unsigned char *column_data = curr_row[i]->GetMutableBuffer();
if (memcpy_s(sub_item.data_ptr_, sub_item.data_len_, column_data,
static_cast<uint32_t>(curr_row[i++]->SizeInBytes())) != 0) {
MS_LOG(ERROR) << "memcpy_s failed!";

@ -407,7 +407,7 @@ Status CelebAOp::LoadTensorRow(const std::pair<std::string, std::vector<int32_t>
RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, data_schema_->column(0).tensorImpl(),
TensorShape(std::vector<dsize_t>(1, num_elements)),
data_schema_->column(0).type()));
(void)handle.read(reinterpret_cast<char *>(image->StartAddr()), num_elements);
(void)handle.read(reinterpret_cast<char *>(image->GetMutableBuffer()), num_elements);
if (decode_ == true) {
Status rc = Decode(image, &image);
if (rc.IsError()) {

@ -197,7 +197,7 @@ Status CifarOp::LoadTensorRow(uint64_t index, TensorRow *trow) {
std::shared_ptr<Tensor> fine_label;
std::shared_ptr<Tensor> ori_image = cifar_image_label_pairs_[index].first;
std::shared_ptr<Tensor> copy_image =
std::make_shared<Tensor>(ori_image->shape(), ori_image->type(), ori_image->StartAddr());
std::make_shared<Tensor>(ori_image->shape(), ori_image->type(), ori_image->GetMutableBuffer());
RETURN_IF_NOT_OK(Tensor::CreateTensor(&label, data_schema_->column(1).tensorImpl(), data_schema_->column(1).shape(),
data_schema_->column(1).type(),
reinterpret_cast<unsigned char *>(&cifar_image_label_pairs_[index].second[0])));
@ -394,7 +394,7 @@ Status CifarOp::ParseCifarData() {
data_schema_->column(0).type()));
for (int ch = 0; ch < kCifarImageChannel; ++ch) {
for (int pix = 0; pix < kCifarImageHeight * kCifarImageWidth; ++pix) {
(image_tensor->StartAddr())[pix * kCifarImageChannel + ch] = block[cur_block_index++];
(image_tensor->GetMutableBuffer())[pix * kCifarImageChannel + ch] = block[cur_block_index++];
}
}
cifar_image_label_pairs_.emplace_back(std::make_pair(image_tensor, labels));

@ -216,7 +216,7 @@ Status ImageFolderOp::LoadTensorRow(ImageLabelPair pairPtr, TensorRow *trow) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, data_schema_->column(0).tensorImpl(),
TensorShape(std::vector<dsize_t>(1, num_elements)),
data_schema_->column(0).type(), nullptr));
(void)fs.read(reinterpret_cast<char *>(image->StartAddr()), num_elements);
(void)fs.read(reinterpret_cast<char *>(image->GetMutableBuffer()), num_elements);
fs.close();
if (decode_ == true) {
Status rc = Decode(image, &image);

@ -210,7 +210,7 @@ Status ManifestOp::LoadTensorRow(const std::pair<std::string, std::vector<std::s
RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, data_schema_->column(0).tensorImpl(),
TensorShape(std::vector<dsize_t>(1, num_elements)),
data_schema_->column(0).type(), nullptr));
(void)fs.read(reinterpret_cast<char *>(image->StartAddr()), num_elements);
(void)fs.read(reinterpret_cast<char *>(image->GetMutableBuffer()), num_elements);
if (fs.fail()) {
fs.close();
RETURN_STATUS_UNEXPECTED("Fail to read file: " + data.first);

@ -170,7 +170,7 @@ Status MnistOp::LoadTensorRow(const MnistLabelPair &mnist_pair, TensorRow *trow)
int32_t l = mnist_pair.second;
// make a copy of cached tensor
RETURN_IF_NOT_OK(Tensor::CreateTensor(&image, data_schema_->column(0).tensorImpl(), mnist_pair.first->shape(),
mnist_pair.first->type(), mnist_pair.first->StartAddr()));
mnist_pair.first->type(), mnist_pair.first->GetMutableBuffer()));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&label, data_schema_->column(1).tensorImpl(), data_schema_->column(1).shape(),
data_schema_->column(1).type(), reinterpret_cast<unsigned char *>(&l)));
(*trow) = {std::move(image), std::move(label)};

@ -127,7 +127,7 @@ Status RandomDataOp::GenerateSchema() {
// For each column:
// - choose a datatype
// - generate a shape that randomly chooses the number of dimensions and the dimension values.
DataType::Type newType = static_cast<DataType::Type>(GenRandomInt(0, kMaxDataType));
// Pick a random concrete type in [DE_BOOL, DE_FLOAT64]. DE_UNKNOWN is enumerator 0, so the lower bound
// must start at DE_BOOL; a lower bound of 0 could generate columns of unknown type. The upper bound
// NUM_OF_TYPES - 2 stops just before DE_STRING.
DataType::Type newType = static_cast<DataType::Type>(
  GenRandomInt(static_cast<int32_t>(DataType::DE_BOOL), DataType::NUM_OF_TYPES - 2));
int32_t rank = GenRandomInt(1, kMaxRank);
std::vector<dsize_t> dims;
for (int32_t d = 0; d < rank; d++) {

@ -43,7 +43,6 @@ class RandomDataOp : public ParallelOp {
static constexpr int32_t kMaxNumColumns = 4;
static constexpr int32_t kMaxRank = 4;
static constexpr int32_t kMaxDimValue = 2048;
static constexpr int32_t kMaxDataType = (DataType::DE_UNKNOWN - 1);
static constexpr int32_t kMaxTotalRows = 1024;
// A nested builder class to aid in the construction of a RandomDataOp

@ -58,7 +58,7 @@ Status DistributedSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffer
(*out_buffer) = std::make_unique<DataBuffer>(cnt_, DataBuffer::kDeBFlagNone);
std::shared_ptr<Tensor> sample_ids;
RETURN_IF_NOT_OK(CreateSamplerTensor(&sample_ids, samples_per_buffer_));
int64_t *id_ptr = reinterpret_cast<int64_t *>(sample_ids->StartAddr());
int64_t *id_ptr = reinterpret_cast<int64_t *>(sample_ids->GetMutableBuffer());
while (cnt_ < samples_per_buffer_) {
int64_t next_id = (num_devices_ * (cnt_++) + device_id_) % num_rows_;
*(id_ptr++) = shuffle_ ? shuffle_vec_[static_cast<size_t>(next_id)] : next_id;

@ -58,7 +58,7 @@ Status PKSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffer) {
int64_t last_id =
(samples_per_buffer_ + next_id_ > num_pk_samples_) ? num_pk_samples_ : samples_per_buffer_ + next_id_;
RETURN_IF_NOT_OK(CreateSamplerTensor(&sample_ids, last_id - next_id_));
int64_t *id_ptr = reinterpret_cast<int64_t *>(sample_ids->StartAddr());
int64_t *id_ptr = reinterpret_cast<int64_t *>(sample_ids->GetMutableBuffer());
while (next_id_ < last_id) {
int64_t cls_id = next_id_++ / samples_per_class_;
const std::vector<int64_t> &samples = label_to_ids_[labels_[cls_id]];

@ -38,7 +38,7 @@ Status RandomSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffer) {
std::shared_ptr<Tensor> sampleIds;
int64_t last_id = samples_per_buffer_ + next_id_ > num_samples_ ? num_samples_ : samples_per_buffer_ + next_id_;
RETURN_IF_NOT_OK(CreateSamplerTensor(&sampleIds, last_id - next_id_));
int64_t *id_ptr = reinterpret_cast<int64_t *>(sampleIds->StartAddr());
int64_t *id_ptr = reinterpret_cast<int64_t *>(sampleIds->GetMutableBuffer());
for (int64_t i = 0; i < (last_id - next_id_); i++) {
*(id_ptr + i) = replacement_ ? (*dist)(rnd_) : shuffled_ids_[static_cast<size_t>(i + next_id_)];
}

@ -40,7 +40,7 @@ Status Sampler::CreateSamplerTensor(std::shared_ptr<Tensor> *sample_ids, int64_t
}
TensorShape shape(std::vector<dsize_t>(1, num_elements));
RETURN_IF_NOT_OK(Tensor::CreateTensor(sample_ids, col_desc_->tensorImpl(), shape, col_desc_->type()));
(void)(*sample_ids)->StartAddr(); // allocate memory in case user forgets!
(void)(*sample_ids)->GetMutableBuffer(); // allocate memory in case user forgets!
return Status::OK();
}

@ -31,7 +31,7 @@ Status SequentialSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffer)
std::shared_ptr<Tensor> sampleIds;
int64_t lastId = (samples_per_buffer_ + next_id_ > num_samples_) ? num_samples_ : samples_per_buffer_ + next_id_;
RETURN_IF_NOT_OK(CreateSamplerTensor(&sampleIds, lastId - next_id_));
int64_t *idPtr = reinterpret_cast<int64_t *>(sampleIds->StartAddr());
int64_t *idPtr = reinterpret_cast<int64_t *>(sampleIds->GetMutableBuffer());
while (next_id_ < lastId) {
*(idPtr++) = next_id_++;
}

@ -78,7 +78,7 @@ Status SubsetRandomSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffe
RETURN_IF_NOT_OK(CreateSamplerTensor(&outputIds, last_id - sample_id_));
// Initialize tensor
int64_t *id_ptr = reinterpret_cast<int64_t *>(outputIds->StartAddr());
int64_t *id_ptr = reinterpret_cast<int64_t *>(outputIds->GetMutableBuffer());
while (sample_id_ < last_id) {
if (indices_[sample_id_] >= num_rows_) {
std::string err_msg =

@ -111,7 +111,7 @@ Status WeightedRandomSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buf
RETURN_IF_NOT_OK(CreateSamplerTensor(&outputIds, last_id - sample_id_));
// Initialize tensor.
int64_t *id_ptr = reinterpret_cast<int64_t *>(outputIds->StartAddr());
int64_t *id_ptr = reinterpret_cast<int64_t *>(outputIds->GetMutableBuffer());
// Assign the data to tensor element.
while (sample_id_ < last_id) {
int64_t genId;

@ -146,10 +146,7 @@ Status TextFileOp::LoadTensor(const std::string &line, std::unique_ptr<TensorQTa
(*tensor_table)->push_back(std::move(tRow));
std::shared_ptr<Tensor> tensor;
RETURN_IF_NOT_OK(
Tensor::CreateTensor(&tensor, data_schema_->column(0).tensorImpl(),
TensorShape(std::vector<dsize_t>(1, line.size())), data_schema_->column(0).type(),
const_cast<unsigned char *>(reinterpret_cast<const unsigned char *>(common::SafeCStr(line)))));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&tensor, {line}, TensorShape::CreateScalar()));
(**tensor_table)[row][0] = std::move(tensor);
return Status::OK();
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save