!10799 make user-facing headers standalone for minddata

From: @mhmotallebi
Reviewed-by: 
Signed-off-by:
pull/10799/MERGE
Authored by mindspore-ci-bot 4 years ago; committed by Gitee
commit a477a97278

@@ -17,6 +17,7 @@
#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/core/global_context.h"
#include "minddata/dataset/include/config.h"
#include "minddata/dataset/util/log_adapter.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {

@@ -19,16 +19,33 @@
#include <fstream>
#include <unordered_set>
#include <utility>
#include "minddata/dataset/engine/runtime_context.h"
#include "minddata/dataset/include/samplers.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/core/client.h"
#include "minddata/dataset/engine/consumers/tree_consumer.h"
#include "minddata/dataset/kernels/c_func_op.h"
#include "minddata/dataset/kernels/tensor_op.h"
#ifndef ENABLE_ANDROID
#include "minddata/dataset/engine/ir/cache/dataset_cache_impl.h"
#endif
#ifndef ENABLE_ANDROID
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vocab.h"
#endif
// Sampler headers (in alphabetical order)
#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
// IR non-leaf nodes
#include "minddata/dataset/engine/ir/datasetops/batch_node.h"
#ifndef ENABLE_ANDROID
@@ -57,7 +74,6 @@
#endif
#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/random.h"
#include "minddata/dataset/util/services.h"
@@ -939,6 +955,7 @@ TFRecordDataset::TFRecordDataset(const std::vector<std::string> &dataset_files,
shard_id, shard_equal_rows, cache);
ir_node_ = std::static_pointer_cast<DatasetNode>(ds);
}
#endif
} // namespace dataset
} // namespace mindspore

@@ -16,11 +16,15 @@
#include "minddata/dataset/include/iterator.h"
#include "minddata/dataset/core/client.h"
#include "minddata/dataset/engine/consumers/tree_consumer.h"
#include "minddata/dataset/engine/runtime_context.h"
#include "minddata/dataset/include/datasets.h"
namespace mindspore {
namespace dataset {
Iterator::Iterator() : consumer_(nullptr) {}
Iterator::~Iterator() { Stop(); }
// Get the next row from the data pipeline.
bool Iterator::GetNextRow(TensorMap *row) {
Status rc = consumer_->GetNextAsMap(row);
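The iterator changes above move the constructor and destructor out of iterator.h while the runtime behavior stays the same. As a usage sketch (editor's illustration, not part of this commit; it assumes datasets.h and iterator.h are included, `ds` is an already-built std::shared_ptr<Dataset>, and Dataset::CreateIterator() is the existing factory for launching the pipeline):

void ConsumeAll(const std::shared_ptr<Dataset> &ds) {
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  TensorMap row;
  iter->GetNextRow(&row);
  while (!row.empty()) {  // an empty row signals the end of the pipeline
    // ... use the named tensors in `row` ...
    iter->GetNextRow(&row);
  }
  iter->Stop();  // also invoked by ~Iterator(), as defined above
}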

@@ -23,6 +23,8 @@
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/core/global_context.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
// IR non-leaf nodes
#include "minddata/dataset/engine/ir/datasetops/batch_node.h"
#include "minddata/dataset/engine/ir/datasetops/concat_node.h"

@@ -21,6 +21,7 @@
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/include/constants.h"
namespace mindspore {
namespace dataset {

@@ -39,6 +39,7 @@
#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
#endif
#include "minddata/dataset/core/data_type.h"
#include "minddata/dataset/util/path.h"
namespace mindspore {
@@ -87,7 +88,7 @@ std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_p
}
std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
const DataType &data_type) {
const std::string &data_type) {
auto op = std::make_shared<LookupOperation>(vocab, unknown_token, data_type);
return op->ValidateParams() ? op : nullptr;
@@ -142,7 +143,7 @@ std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const
return op->ValidateParams() ? op : nullptr;
}
std::shared_ptr<ToNumberOperation> ToNumber(const DataType data_type) {
std::shared_ptr<ToNumberOperation> ToNumber(const std::string &data_type) {
auto op = std::make_shared<ToNumberOperation>(data_type);
return op->ValidateParams() ? op : nullptr;
@@ -200,6 +201,19 @@ Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::s
return Status::OK();
}
// Helper functions to validate the data type passed by the user
bool IsTypeNumeric(const std::string &data_type) {
if (data_type == "int8" || data_type == "uint8" || data_type == "int16" || data_type == "uint16" ||
data_type == "int32" || data_type == "uint32" || data_type == "int64" || data_type == "uint64" ||
data_type == "float16" || data_type == "float32" || data_type == "float64")
return true;
return false;
}
bool IsTypeBoolean(const std::string &data_type) { return data_type == "bool"; }
bool IsTypeString(const std::string &data_type) { return data_type == "string"; }
/* ####################################### Derived TensorOperation classes ################################# */
// (In alphabetical order)
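These helpers replace the DataType::IsNumeric()/IsBool() checks so the string-based data_type parameters can be validated without including core/data_type.h. A quick sanity sketch of what they accept (editor's illustration, assuming the functions above are visible in the same translation unit):

#include <cassert>

void CheckTypeHelpers() {
  assert(IsTypeNumeric("int32") && IsTypeNumeric("float16"));
  assert(!IsTypeNumeric("bool") && IsTypeBoolean("bool"));
  assert(IsTypeString("string") && !IsTypeNumeric("string"));
}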
@@ -239,6 +253,8 @@ BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr<Vocab> &voc
preserve_unused_token_(preserve_unused_token),
with_offsets_(with_offsets) {}
BertTokenizerOperation::~BertTokenizerOperation() = default;
Status BertTokenizerOperation::ValidateParams() {
if (vocab_ == nullptr) {
std::string err_msg = "BertTokenizer: vocab object type is incorrect or null.";
@@ -303,9 +319,11 @@ std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() {
// LookupOperation
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
const DataType &data_type)
const std::string &data_type)
: vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}
LookupOperation::~LookupOperation() = default;
Status LookupOperation::ValidateParams() {
if (vocab_ == nullptr) {
std::string err_msg = "Lookup: vocab object type is incorrect or null.";
@@ -320,7 +338,7 @@ Status LookupOperation::ValidateParams() {
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
if (!data_type_.IsNumeric()) {
if (!IsTypeNumeric(data_type_)) {
std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
@@ -330,7 +348,7 @@ Status LookupOperation::ValidateParams() {
}
std::shared_ptr<TensorOp> LookupOperation::Build() {
std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, data_type_);
std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, DataType(data_type_));
return tensor_op;
}
@@ -419,6 +437,8 @@ std::shared_ptr<TensorOp> RegexTokenizerOperation::Build() {
#endif
// SentencePieceTokenizerOperation
SentencePieceTokenizerOperation::~SentencePieceTokenizerOperation() = default;
SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
SPieceTokenizerOutType out_type)
: vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {}
@@ -482,11 +502,11 @@ std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
}
// ToNumberOperation
ToNumberOperation::ToNumberOperation(DataType data_type) : data_type_(data_type) {}
ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {}
Status ToNumberOperation::ValidateParams() {
if (!data_type_.IsNumeric() || data_type_.IsBool()) {
std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_.ToString();
if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) {
std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_;
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
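Taken together, the text.cc changes above switch Lookup and ToNumber from DataType parameters to plain strings and defer the DataType conversion to Build(). A minimal calling sketch (editor's illustration, not code from this commit; it assumes a Vocab has already been built elsewhere and relies on the factory functions returning nullptr on invalid parameters, as shown above):

#include <memory>
#include "minddata/dataset/include/text.h"

namespace ds = mindspore::dataset;

std::shared_ptr<ds::TensorOperation> MakeLookup(const std::shared_ptr<ds::Vocab> &vocab) {
  // data_type is now a plain string such as "int32"; it is checked with IsTypeNumeric()
  // and converted to a DataType only inside LookupOperation::Build().
  return ds::text::Lookup(vocab, "<unk>", "int32");
}

std::shared_ptr<ds::TensorOperation> MakeToNumber() {
  // "bool" or any non-numeric string makes ValidateParams() fail, so nullptr is returned.
  return ds::text::ToNumber("float32");
}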

@@ -59,6 +59,9 @@ enum class SPieceTokenizerOutType { kString = 0, kInt = 1 };
// Possible values for SPieceTokenizerLoadType
enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 };
// Possible values for SentencePieceModel
enum class SentencePieceModel { kUnigram = 0, kBpe = 1, kChar = 2, kWord = 3 };
// Possible values for NormalizeForm
enum class NormalizeForm {
kNone = 0,

@@ -19,6 +19,7 @@
#include <memory>
#include <mutex>
#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/util/allocator.h"
#include "minddata/dataset/util/status.h"
@@ -27,7 +28,6 @@ namespace mindspore {
namespace dataset {
// forward declare
class MemoryPool;
class ConfigManager;
class Tensor;
class CVTensor;

@@ -22,8 +22,10 @@
#include <utility>
#include <vector>
#include "minddata/dataset/engine/consumers/tree_consumer.h"
#include "minddata/dataset/engine/tree_adapter.h"
#include "minddata/dataset/engine/datasetops/device_queue_op.h"
#include "minddata/dataset/engine/opt/pre/getter_pass.h"
#include "minddata/dataset/engine/tree_adapter.h"
#include "minddata/mindrecord/include/shard_index_generator.h"
#ifndef ENABLE_ANDROID
#include "minddata/mindrecord/include/shard_header.h"

@@ -23,6 +23,7 @@
#include <unordered_map>
#include <vector>
#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
#include "minddata/dataset/include/datasets.h"
namespace mindspore {

@@ -24,13 +24,26 @@
#include <utility>
#include <vector>
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/engine/consumers/tree_consumer.h"
#include "minddata/dataset/engine/data_schema.h"
#include "minddata/dataset/engine/datasetops/filter_op.h"
#include "minddata/dataset/engine/datasetops/map_op/map_op.h"
#include "minddata/dataset/engine/datasetops/project_op.h"
#include "minddata/dataset/engine/datasetops/repeat_op.h"
#include "minddata/dataset/engine/datasetops/shuffle_op.h"
#include "minddata/dataset/engine/datasetops/skip_op.h"
#include "minddata/dataset/engine/datasetops/take_op.h"
#include "minddata/dataset/engine/ir/cache/dataset_cache.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace dataset {
class Dataset;
class DatasetCache;
class SamplerObj;
class IRNodePass;
class DatasetSizeGetter;

@@ -22,6 +22,7 @@
#include <string>
#include <vector>
#include "minddata/dataset/engine/datasetops/source/mindrecord_op.h"
#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
namespace mindspore {

@@ -24,6 +24,7 @@
#include "minddata/dataset/engine/datasetops/source/random_data_op.h"
#include "minddata/dataset/util/random.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace dataset {

@@ -22,7 +22,9 @@
#include <utility>
#include <vector>
#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
#include "minddata/dataset/include/samplers.h"
namespace mindspore {
namespace dataset {

@@ -20,8 +20,6 @@
#include <cstdint>
#include <string>
#include "minddata/dataset/util/log_adapter.h"
namespace mindspore {
namespace dataset {

@@ -59,6 +59,9 @@ enum class SPieceTokenizerOutType { kString = 0, kInt = 1 };
// Possible values for SPieceTokenizerLoadType
enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 };
// Possible values for SentencePieceModel
enum class SentencePieceModel { kUnigram = 0, kBpe = 1, kChar = 2, kWord = 3 };
// Possible values for NormalizeForm
enum class NormalizeForm {
kNone = 0,

@@ -17,6 +17,7 @@
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASETS_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASETS_H_
#include <sys/stat.h>
#include <unistd.h>
#include <map>
#include <memory>
@@ -26,27 +27,18 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "minddata/dataset/engine/ir/cache/dataset_cache.h"
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/engine/consumers/tree_consumer.h"
#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
#include "minddata/dataset/include/iterator.h"
#include "minddata/dataset/include/samplers.h"
#include "minddata/dataset/include/tensor.h"
#include "minddata/dataset/include/text.h"
#include "minddata/dataset/include/type_id.h"
#include "minddata/dataset/kernels/c_func_op.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/util/path.h"
#ifndef ENABLE_ANDROID
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vocab.h"
#endif
namespace mindspore {
namespace dataset {
class Tensor;
class TensorRow;
class TensorShape;
class TreeAdapter;
class TreeGetters;
@@ -54,6 +46,7 @@ class TreeGetters;
class Vocab;
#endif
class DatasetCache;
class DatasetNode;
class Iterator;
@@ -77,12 +70,20 @@ class ConcatDataset;
class RenameDataset;
#endif
#ifndef ENABLE_ANDROID
class SentencePieceVocab;
enum class SentencePieceModel;
#endif
class DSCallback;
class RepeatDataset;
#ifndef ENABLE_ANDROID
class SkipDataset;
class TakeDataset;
class ZipDataset;
#endif
/// \class Dataset datasets.h
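The declarations above show the general approach of this PR: user-facing headers forward-declare implementation types and include the full definitions only in the corresponding .cc files, so the header compiles on its own. A generic sketch of the pattern (editor's illustration with hypothetical names such as ExampleDataset, not code from this commit):

#include <memory>

namespace mindspore {
namespace dataset {

class DatasetNode;  // forward declaration is enough: the header only stores a pointer

class ExampleDataset {
 public:
  std::shared_ptr<DatasetNode> IRNode() const { return ir_node_; }

 private:
  std::shared_ptr<DatasetNode> ir_node_;  // full definition is needed only in the .cc file
};

}  // namespace dataset
}  // namespace mindspore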
@@ -969,8 +970,12 @@ std::shared_ptr<TFRecordDataset> TFRecord(const std::vector<std::string> &datase
} else {
std::string schema_path = schema;
if (!schema_path.empty()) {
Path schema_file(schema_path);
if (!schema_file.Exists()) {
struct stat sb;
int rc = stat(common::SafeCStr(schema_path), &sb);
if (rc == -1 && errno != ENOENT) {
MS_LOG(WARNING) << "Unable to query the status of [" << schema_path << "]. Errno = " << errno << ".";
}
if (rc != 0) {
MS_LOG(ERROR) << "TFRecordDataset: schema path [" << schema_path << "] is invalid or does not exist.";
return nullptr;
}
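The inline TFRecord() helper above now checks the schema file with stat() directly instead of Path::Exists(), so datasets.h no longer needs util/path.h. In isolation the check looks like this (editor's sketch assuming a POSIX-style stat(); FileExists is a hypothetical name):

#include <sys/stat.h>
#include <cerrno>
#include <string>

bool FileExists(const std::string &path) {
  struct stat sb;
  int rc = stat(path.c_str(), &sb);
  // rc == 0            : the path exists
  // rc == -1, ENOENT   : the path does not exist
  // rc == -1, other    : the status could not be queried (e.g. permissions); logged above
  return rc == 0;
}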

@@ -14,14 +14,14 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_API_DETENSOR_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_API_DETENSOR_H_
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DETENSOR_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DETENSOR_H_
#include <string>
#include <vector>
#include <memory>
#include "include/ms_tensor.h"
#include "minddata/dataset/include/status.h"
#include "minddata/dataset/include/tensor.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace tensor {
class DETensor : public mindspore::tensor::MSTensor {
@@ -79,4 +79,4 @@ class DETensor : public mindspore::tensor::MSTensor {
};
} // namespace tensor
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_API_DETENSOR_H_
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DETENSOR_H_

@@ -14,12 +14,13 @@
* limitations under the License.
*/
#ifndef DATASET_API_EXECUTE_H_
#define DATASET_API_EXECUTE_H_
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_EXECUTE_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_EXECUTE_H_
#include <vector>
#include <memory>
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/include/constants.h"
#ifdef ENABLE_ANDROID
#include "minddata/dataset/include/de_tensor.h"
#endif
@@ -55,4 +56,4 @@ class Execute {
} // namespace dataset
} // namespace mindspore
#endif // DATASET_API_EXECUTE_H_
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_EXECUTE_H_

@@ -21,7 +21,6 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "minddata/dataset/engine/runtime_context.h"
#include "minddata/dataset/include/status.h"
namespace mindspore {
@@ -45,10 +44,10 @@ using TensorVec = std::vector<std::shared_ptr<Tensor>>;
class Iterator {
public:
/// \brief Constructor
Iterator() : consumer_(nullptr) {}
Iterator();
/// \brief Destructor
~Iterator() { Stop(); }
~Iterator();
/// \brief Method for building and launching the pipeline.
/// \param[in] ops - a vector of DatasetOp in the data pipeline.

@@ -21,10 +21,11 @@
#include <string>
#include <vector>
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/include/status.h"
#ifndef ENABLE_ANDROID
#include "minddata/mindrecord/include/shard_column.h"
#include "minddata/mindrecord/include/shard_error.h"
#include "minddata/mindrecord/include/shard_operator.h"
#include "minddata/mindrecord/include/shard_reader.h"
#endif

@@ -51,6 +51,13 @@ namespace dataset {
} \
} while (false)
#define CHECK_FAIL_RETURN_SYNTAX_ERROR(_condition, _e) \
do { \
if (!(_condition)) { \
return Status(StatusCode::kSyntaxError, __LINE__, __FILE__, _e); \
} \
} while (false)
#define RETURN_UNEXPECTED_IF_NULL(_ptr) \
do { \
if ((_ptr) == nullptr) { \
@@ -71,6 +78,15 @@ namespace dataset {
return Status(StatusCode::kSyntaxError, __LINE__, __FILE__, _e); \
} while (false)
#define RETURN_SECOND_IF_ERROR(_s, _r) \
do { \
Status __rc = (_s); \
if (__rc.IsError()) { \
MS_LOG(ERROR) << __rc; \
return _r; \
} \
} while (false)
enum class StatusCode : char {
kOK = 0,
kOutOfMemory = 1,
@@ -151,6 +167,12 @@ class Status {
StatusCode code_;
std::string err_msg_;
};
#if !defined(_WIN32) && !defined(_WIN64)
const float MAX_MEMORY_USAGE_THRESHOLD = 0.95;
float GetMemoryUsage();
#endif
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_UTIL_STATUS_H_
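The new CHECK_FAIL_RETURN_SYNTAX_ERROR and RETURN_SECOND_IF_ERROR macros cover the two patterns the validation code needs: failing a Status-returning function with kSyntaxError, and bailing out of a pointer-returning factory while still logging the error. A usage sketch (editor's illustration; ValidateShardParams and MakeBuffer are hypothetical names, not functions from this commit):

#include <cstdint>
#include <memory>
#include "minddata/dataset/util/status.h"

namespace mindspore {
namespace dataset {

Status ValidateShardParams(int32_t num_shards, int32_t shard_id) {
  // Expands to an early `return Status(StatusCode::kSyntaxError, ...)` when the condition fails.
  CHECK_FAIL_RETURN_SYNTAX_ERROR(shard_id >= 0 && shard_id < num_shards,
                                 "shard_id must be in the range [0, num_shards).");
  return Status::OK();
}

std::shared_ptr<int> MakeBuffer(const Status &prepare_rc) {
  // Logs the error and returns the second argument (nullptr) instead of a Status.
  RETURN_SECOND_IF_ERROR(prepare_rc, nullptr);
  return std::make_shared<int>(0);
}

}  // namespace dataset
}  // namespace mindspore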

@@ -22,18 +22,16 @@
#include <utility>
#include <vector>
#include "mindspore/ccsrc/minddata/dataset/core/data_type.h"
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/include/constants.h"
#include "minddata/dataset/include/status.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vocab.h"
namespace mindspore {
namespace dataset {
class Vocab;
class SentencePieceVocab;
// Transform operations for text
namespace text {
@@ -146,10 +144,11 @@ std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_p
/// \param[in] vocab a Vocab object.
/// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov).
/// If unknown_token is oov, a runtime error will be thrown.
/// \param[in] DataType type of the tensor after lookup, typically int32.
/// \param[in] data_type type of the tensor after lookup, typically int32.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
const mindspore::dataset::DataType &data_type = DataType("int32"));
const std::string &data_type = "int32");
/// \brief TensorOp to generate n-gram from a 1-D string Tensor.
/// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result
@@ -226,9 +225,9 @@ std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const
/// https://en.cppreference.com/w/cpp/string/basic_string/stof,
/// https://en.cppreference.com/w/cpp/string/basic_string/stoul,
/// except that any strings which represent negative numbers cannot be cast to an unsigned integer type.
/// \param[in] data_type DataType of the tensor to be casted to. Must be a numeric type.
/// \param[in] data_type Type of the tensor to be cast to. Must be a numeric type.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<ToNumberOperation> ToNumber(const DataType data_type);
std::shared_ptr<ToNumberOperation> ToNumber(const std::string &data_type);
/// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length.
/// \param[in] max_length Maximum length required.
@@ -285,7 +284,7 @@ class BertTokenizerOperation : public TensorOperation {
bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
bool with_offsets);
~BertTokenizerOperation() = default;
~BertTokenizerOperation();
std::shared_ptr<TensorOp> Build() override;
@@ -342,9 +341,9 @@ class JiebaTokenizerOperation : public TensorOperation {
class LookupOperation : public TensorOperation {
public:
explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
const DataType &data_type);
const std::string &data_type);
~LookupOperation() = default;
~LookupOperation();
std::shared_ptr<TensorOp> Build() override;
@@ -356,7 +355,7 @@ class LookupOperation : public TensorOperation {
std::shared_ptr<Vocab> vocab_;
std::string unknown_token_;
int32_t default_id_;
DataType data_type_;
std::string data_type_;
};
class NgramOperation : public TensorOperation {
@@ -439,7 +438,7 @@ class SentencePieceTokenizerOperation : public TensorOperation {
SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type);
~SentencePieceTokenizerOperation() = default;
~SentencePieceTokenizerOperation();
std::shared_ptr<TensorOp> Build() override;
@@ -473,7 +472,7 @@ class SlidingWindowOperation : public TensorOperation {
class ToNumberOperation : public TensorOperation {
public:
explicit ToNumberOperation(DataType data_type);
explicit ToNumberOperation(std::string data_type);
~ToNumberOperation() = default;
@@ -484,7 +483,7 @@ class ToNumberOperation : public TensorOperation {
std::string Name() const override { return kToNumberOperation; }
private:
DataType data_type_;
std::string data_type_;
};
class TruncateSequencePairOperation : public TensorOperation {

@@ -16,7 +16,6 @@
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
#include "minddata/dataset/core/data_type.h"
#include "mindspore/core/ir/dtype/type_id.h"
namespace mindspore {

@@ -22,10 +22,11 @@
#include <vector>
#include <unordered_map>
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/include/constants.h"
namespace mindspore {
namespace dataset {
enum class SentencePieceModel { kUnigram = 0, kBpe = 1, kChar = 2, kWord = 3 };
class SentencePieceVocab {
public:
static Status BuildFromFile(const std::vector<std::string> &path_list, const int vocab_size,

Some files were not shown because too many files have changed in this diff.