!11248 dataset: Use int32_t for text's vocab_size

From: @cathwong
Reviewed-by: @mikef,@robingrosman
Signed-off-by:
pull/11248/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 8ef663fa68

@ -569,7 +569,7 @@ std::shared_ptr<Dataset> Dataset::SetNumWorkers(int32_t num_workers) {
#ifndef ENABLE_ANDROID
std::shared_ptr<SentencePieceVocab> Dataset::BuildSentencePieceVocab(
const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
const std::vector<std::string> &col_names, int32_t vocab_size, float character_coverage,
SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params) {
auto vocab = std::make_shared<SentencePieceVocab>();
auto ds = std::make_shared<BuildSentenceVocabNode>(IRNode(), vocab, col_names, vocab_size, character_coverage,

@ -388,7 +388,7 @@ PYBIND_REGISTER(BuildSentenceVocabNode, 2, ([](const py::module *m) {
(void)py::class_<BuildSentenceVocabNode, DatasetNode, std::shared_ptr<BuildSentenceVocabNode>>(
*m, "BuildSentenceVocabNode", "to create a BuildSentenceVocabNode")
.def(py::init([](std::shared_ptr<DatasetNode> self, std::shared_ptr<SentencePieceVocab> vocab,
const std::vector<std::string> &col_names, uint32_t vocab_size,
const std::vector<std::string> &col_names, int32_t vocab_size,
float character_coverage, SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params) {
auto build_sentence_vocab = std::make_shared<BuildSentenceVocabNode>(

@ -54,7 +54,7 @@ PYBIND_REGISTER(SentencePieceVocab, 0, ([](const py::module *m) {
(void)py::class_<SentencePieceVocab, std::shared_ptr<SentencePieceVocab>>(*m, "SentencePieceVocab")
.def(py::init<>())
.def_static("from_file",
[](const py::list &paths, const int vocab_size, const float character_coverage,
[](const py::list &paths, const int32_t vocab_size, const float character_coverage,
const SentencePieceModel model_type, const py::dict &params) {
std::shared_ptr<SentencePieceVocab> v;
std::vector<std::string> path_list;

@ -23,7 +23,7 @@
namespace mindspore {
namespace dataset {
BuildSentencePieceVocabOp::BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab,
std::vector<std::string> col_names, uint32_t vocab_size,
std::vector<std::string> col_names, int32_t vocab_size,
float character_coverage, SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params,
int32_t op_conn_size)

@ -134,7 +134,7 @@ class BuildSentencePieceVocabOp : public PipelineOp {
};
BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names,
uint32_t vocab_size, float character_coverage, SentencePieceModel model_type,
int32_t vocab_size, float character_coverage, SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params, int32_t op_conn_size);
~BuildSentencePieceVocabOp() = default;
@ -174,7 +174,7 @@ class BuildSentencePieceVocabOp : public PipelineOp {
private:
bool read_done_;
Status ret_status_;
uint32_t vocab_size_;
int32_t vocab_size_;
float character_coverage_;
SentencePieceModel model_type_;
std::unordered_map<std::string, std::string> params_;

@ -30,7 +30,7 @@ namespace dataset {
BuildSentenceVocabNode::BuildSentenceVocabNode(std::shared_ptr<DatasetNode> child,
std::shared_ptr<SentencePieceVocab> vocab,
const std::vector<std::string> &col_names, uint32_t vocab_size,
const std::vector<std::string> &col_names, int32_t vocab_size,
float character_coverage, SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params)
: vocab_(vocab),

@ -33,7 +33,7 @@ class BuildSentenceVocabNode : public DatasetNode {
public:
/// \brief Constructor
BuildSentenceVocabNode(std::shared_ptr<DatasetNode> child, std::shared_ptr<SentencePieceVocab> vocab,
const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
const std::vector<std::string> &col_names, int32_t vocab_size, float character_coverage,
SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params);
/// \brief Destructor
@ -75,7 +75,7 @@ class BuildSentenceVocabNode : public DatasetNode {
private:
std::shared_ptr<SentencePieceVocab> vocab_;
std::vector<std::string> col_names_;
uint32_t vocab_size_;
int32_t vocab_size_;
float character_coverage_;
SentencePieceModel model_type_;
std::unordered_map<std::string, std::string> params_;

@ -225,7 +225,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
/// \brief Function to create a SentencePieceVocab from source dataset
/// \notes Build a SentencePieceVocab from a dataset.
/// \param[in] col_names Column names to get words from. It can be a vector of column names
/// \param[in] vocab_size Vocabulary size. The type is uint32
/// \param[in] vocab_size Vocabulary size.
/// \param[in] character_coverage Percentage of characters covered by the model, must be between
/// 0.98 and 1.0. Good defaults are: 0.9995 for languages with rich character sets like
/// Japanese or Chinese character sets, and 1.0 for other languages with small character sets.
@ -233,7 +233,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
/// The input sentence must be pretokenized when using word type.
/// \param[in] params A map containing additional option parameters for the sentencepiece library
std::shared_ptr<SentencePieceVocab> BuildSentencePieceVocab(
const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
const std::vector<std::string> &col_names, int32_t vocab_size, float character_coverage,
SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params);
/// \brief Function to create a Vocab from source dataset

@ -28,7 +28,7 @@ namespace dataset {
SentencePieceVocab::SentencePieceVocab() : model_proto_("") {}
Status SentencePieceVocab::BuildFromFile(const std::vector<std::string> &path_list, const int vocab_size,
Status SentencePieceVocab::BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size,
const float character_coverage, const SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params,
std::shared_ptr<SentencePieceVocab> *vocab) {

@ -29,7 +29,7 @@ namespace dataset {
class SentencePieceVocab {
public:
static Status BuildFromFile(const std::vector<std::string> &path_list, const int vocab_size,
static Status BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size,
const float character_coverage, const SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params,
std::shared_ptr<SentencePieceVocab> *vocab);

@ -225,7 +225,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail2) {
std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
// Expected failure: vocab_size shoule be either -1 or positive integer
// Expected failure: vocab_size should be either -1 or positive integer
Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab);
EXPECT_NE(s, Status::OK());
}

Loading…
Cancel
Save