|
|
|
@ -225,7 +225,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
|
|
|
|
|
/// \brief Function to create a SentencePieceVocab from source dataset
|
|
|
|
|
/// \notes Build a SentencePieceVocab from a dataset.
|
|
|
|
|
/// \param[in] col_names Column names to get words from. It can be a vector of column names
|
|
|
|
|
/// \param[in] vocab_size Vocabulary size. The type is uint32
|
|
|
|
|
/// \param[in] vocab_size Vocabulary size.
|
|
|
|
|
/// \param[in] character_coverage Percentage of characters covered by the model, must be between
|
|
|
|
|
/// 0.98 and 1.0. Good defaults are: 0.9995 for languages with rich character sets like
|
|
|
|
|
/// Japanese or Chinese character sets, and 1.0 for other languages with small character sets.
|
|
|
|
@ -233,7 +233,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
|
|
|
|
|
/// The input sentence must be pretokenized when using word type.
|
|
|
|
|
/// \param[in] params A map containing additional option parameters for the sentencepiece library
|
|
|
|
|
std::shared_ptr<SentencePieceVocab> BuildSentencePieceVocab(
|
|
|
|
|
const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
|
|
|
|
|
const std::vector<std::string> &col_names, int32_t vocab_size, float character_coverage,
|
|
|
|
|
SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params);
|
|
|
|
|
|
|
|
|
|
/// \brief Function to create a Vocab from source dataset
|
|
|
|
|