@@ -46,73 +46,50 @@ namespace dataset {
 #ifdef ENABLE_ICU4C
 
 PYBIND_REGISTER(BasicTokenizerOp, 1, ([](const py::module *m) {
-                  (void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(
-                    *m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.")
-                    .def(py::init<const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>(),
-                         py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
-                         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
-                         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
-                         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken,
-                         py::arg("with_offsets") = BasicTokenizerOp::kDefWithOffsets);
+                  (void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(*m,
+                                                                                                  "BasicTokenizerOp")
+                    .def(py::init<const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>());
                 }));
 
 PYBIND_REGISTER(WhitespaceTokenizerOp, 1, ([](const py::module *m) {
                   (void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
-                    *m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces.")
-                    .def(py::init<const bool &>(), py::arg("with_offsets") = WhitespaceTokenizerOp::kDefWithOffsets);
+                    *m, "WhitespaceTokenizerOp")
+                    .def(py::init<const bool &>());
                 }));
 
 PYBIND_REGISTER(UnicodeScriptTokenizerOp, 1, ([](const py::module *m) {
                   (void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
-                    *m, "UnicodeScriptTokenizerOp",
-                    "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.")
+                    *m, "UnicodeScriptTokenizerOp")
                     .def(py::init<>())
-                    .def(py::init<const bool &, const bool &>(),
-                         py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace,
-                         py::arg("with_offsets") = UnicodeScriptTokenizerOp::kDefWithOffsets);
+                    .def(py::init<const bool &, const bool &>());
                 }));
 
-PYBIND_REGISTER(CaseFoldOp, 1, ([](const py::module *m) {
-                  (void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(
-                    *m, "CaseFoldOp", "Apply case fold operation on utf-8 string tensor")
-                    .def(py::init<>());
-                }));
+PYBIND_REGISTER(
+  CaseFoldOp, 1, ([](const py::module *m) {
+    (void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(*m, "CaseFoldOp").def(py::init<>());
+  }));
 
 PYBIND_REGISTER(NormalizeUTF8Op, 1, ([](const py::module *m) {
-                  (void)py::class_<NormalizeUTF8Op, TensorOp, std::shared_ptr<NormalizeUTF8Op>>(
-                    *m, "NormalizeUTF8Op", "Apply normalize operation on utf-8 string tensor.")
+                  (void)py::class_<NormalizeUTF8Op, TensorOp, std::shared_ptr<NormalizeUTF8Op>>(*m, "NormalizeUTF8Op")
                     .def(py::init<>())
-                    .def(py::init<NormalizeForm>(), py::arg("normalize_form") = NormalizeUTF8Op::kDefNormalizeForm);
+                    .def(py::init<NormalizeForm>());
                 }));
 
 PYBIND_REGISTER(RegexReplaceOp, 1, ([](const py::module *m) {
-                  (void)py::class_<RegexReplaceOp, TensorOp, std::shared_ptr<RegexReplaceOp>>(
-                    *m, "RegexReplaceOp",
-                    "Replace utf-8 string tensor with 'replace' according to regular expression 'pattern'.")
-                    .def(py::init<const std::string &, const std::string &, bool>(), py::arg("pattern"),
-                         py::arg("replace"), py::arg("replace_all"));
+                  (void)py::class_<RegexReplaceOp, TensorOp, std::shared_ptr<RegexReplaceOp>>(*m, "RegexReplaceOp")
+                    .def(py::init<const std::string &, const std::string &, bool>());
                 }));
 
 PYBIND_REGISTER(RegexTokenizerOp, 1, ([](const py::module *m) {
-                  (void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(
-                    *m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by regex expression pattern.")
-                    .def(py::init<const std::string &, const std::string &, const bool &>(), py::arg("delim_pattern"),
-                         py::arg("keep_delim_pattern"), py::arg("with_offsets") = RegexTokenizerOp::kDefWithOffsets);
+                  (void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(*m,
+                                                                                                  "RegexTokenizerOp")
+                    .def(py::init<const std::string &, const std::string &, const bool &>());
                 }));
 
 PYBIND_REGISTER(BertTokenizerOp, 1, ([](const py::module *m) {
-                  (void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(
-                    *m, "BertTokenizerOp", "Tokenizer used for Bert text process.")
+                  (void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(*m, "BertTokenizerOp")
                     .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &,
-                                  const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>(),
-                         py::arg("vocab"),
-                         py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
-                         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
-                         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
-                         py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
-                         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
-                         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
-                         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken,
-                         py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets);
+                                  const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>());
                 }));
 
 PYBIND_REGISTER(NormalizeForm, 0, ([](const py::module *m) {
@@ -128,11 +105,9 @@ PYBIND_REGISTER(NormalizeForm, 0, ([](const py::module *m) {
 #endif
 
 PYBIND_REGISTER(JiebaTokenizerOp, 1, ([](const py::module *m) {
-                  (void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(
-                    *m, "JiebaTokenizerOp", "")
-                    .def(py::init<const std::string &, const std::string &, const JiebaMode &, const bool &>(),
-                         py::arg("hmm_path"), py::arg("mp_path"), py::arg("mode") = JiebaMode::kMix,
-                         py::arg("with_offsets") = JiebaTokenizerOp::kDefWithOffsets)
+                  (void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m,
+                                                                                                  "JiebaTokenizerOp")
+                    .def(py::init<const std::string &, const std::string &, const JiebaMode &, const bool &>())
                     .def("add_word", [](JiebaTokenizerOp &self, const std::string word, int freq) {
                       THROW_IF_ERROR(self.AddWord(word, freq));
                     });
@@ -140,13 +115,12 @@ PYBIND_REGISTER(JiebaTokenizerOp, 1, ([](const py::module *m) {
 
 PYBIND_REGISTER(UnicodeCharTokenizerOp, 1, ([](const py::module *m) {
                   (void)py::class_<UnicodeCharTokenizerOp, TensorOp, std::shared_ptr<UnicodeCharTokenizerOp>>(
-                    *m, "UnicodeCharTokenizerOp", "Tokenize a scalar tensor of UTF-8 string to Unicode characters.")
-                    .def(py::init<const bool &>(), py::arg("with_offsets") = UnicodeCharTokenizerOp::kDefWithOffsets);
+                    *m, "UnicodeCharTokenizerOp")
+                    .def(py::init<const bool &>());
                 }));
 
 PYBIND_REGISTER(LookupOp, 1, ([](const py::module *m) {
-                  (void)py::class_<LookupOp, TensorOp, std::shared_ptr<LookupOp>>(
-                    *m, "LookupOp", "Tensor operation to LookUp each word.")
+                  (void)py::class_<LookupOp, TensorOp, std::shared_ptr<LookupOp>>(*m, "LookupOp")
                     .def(py::init([](std::shared_ptr<Vocab> vocab, const py::object &py_word) {
                       if (vocab == nullptr) {
                         THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, "vocab object type is incorrect or null."));
@@ -165,56 +139,42 @@ PYBIND_REGISTER(LookupOp, 1, ([](const py::module *m) {
                 }));
 
 PYBIND_REGISTER(NgramOp, 1, ([](const py::module *m) {
-                  (void)py::class_<NgramOp, TensorOp, std::shared_ptr<NgramOp>>(*m, "NgramOp",
-                                                                                "TensorOp performs ngram mapping.")
+                  (void)py::class_<NgramOp, TensorOp, std::shared_ptr<NgramOp>>(*m, "NgramOp")
                     .def(py::init<const std::vector<int32_t> &, int32_t, int32_t, const std::string &,
-                                  const std::string &, const std::string &>(),
-                         py::arg("ngrams"), py::arg("l_pad_len"), py::arg("r_pad_len"), py::arg("l_pad_token"),
-                         py::arg("r_pad_token"), py::arg("separator"));
+                                  const std::string &, const std::string &>());
                 }));
 
-PYBIND_REGISTER(
-  WordpieceTokenizerOp, 1, ([](const py::module *m) {
-    (void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
-      *m, "WordpieceTokenizerOp", "Tokenize scalar token or 1-D tokens to subword tokens.")
-      .def(
-        py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, const bool &>(),
-        py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
-        py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
-        py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
-        py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets);
-  }));
+PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) {
+                  (void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
+                    *m, "WordpieceTokenizerOp")
+                    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &,
+                                  const bool &>());
+                }));
 
 PYBIND_REGISTER(SlidingWindowOp, 1, ([](const py::module *m) {
-                  (void)py::class_<SlidingWindowOp, TensorOp, std::shared_ptr<SlidingWindowOp>>(
-                    *m, "SlidingWindowOp", "TensorOp to apply sliding window to a 1-D Tensor.")
-                    .def(py::init<uint32_t, int32_t>(), py::arg("width"), py::arg("axis"));
+                  (void)py::class_<SlidingWindowOp, TensorOp, std::shared_ptr<SlidingWindowOp>>(*m, "SlidingWindowOp")
+                    .def(py::init<uint32_t, int32_t>());
                 }));
 
 PYBIND_REGISTER(
   SentencePieceTokenizerOp, 1, ([](const py::module *m) {
     (void)py::class_<SentencePieceTokenizerOp, TensorOp, std::shared_ptr<SentencePieceTokenizerOp>>(
-      *m, "SentencePieceTokenizerOp", "Tokenize scalar token or 1-D tokens to tokens by sentence piece.")
+      *m, "SentencePieceTokenizerOp")
       .def(
-        py::init<std::shared_ptr<SentencePieceVocab> &, const SPieceTokenizerLoadType, const SPieceTokenizerOutType>(),
-        py::arg("vocab"), py::arg("load_type") = SPieceTokenizerLoadType::kModel,
-        py::arg("out_type") = SPieceTokenizerOutType::kString)
+        py::init<std::shared_ptr<SentencePieceVocab> &, const SPieceTokenizerLoadType, const SPieceTokenizerOutType>())
       .def(py::init<const std::string &, const std::string &, const SPieceTokenizerLoadType,
-                    const SPieceTokenizerOutType>(),
-           py::arg("model_path"), py::arg("model_filename"), py::arg("load_type") = SPieceTokenizerLoadType::kFile,
-           py::arg("out_type") = SPieceTokenizerOutType::kString);
+                    const SPieceTokenizerOutType>());
   }));
 
 PYBIND_REGISTER(ToNumberOp, 1, ([](const py::module *m) {
-                  (void)py::class_<ToNumberOp, TensorOp, std::shared_ptr<ToNumberOp>>(
-                    *m, "ToNumberOp", "TensorOp to convert strings to numbers.")
-                    .def(py::init<DataType>(), py::arg("data_type"))
-                    .def(py::init<std::string>(), py::arg("data_type"));
+                  (void)py::class_<ToNumberOp, TensorOp, std::shared_ptr<ToNumberOp>>(*m, "ToNumberOp")
+                    .def(py::init<DataType>())
+                    .def(py::init<std::string>());
                 }));
 
 PYBIND_REGISTER(TruncateSequencePairOp, 1, ([](const py::module *m) {
                   (void)py::class_<TruncateSequencePairOp, TensorOp, std::shared_ptr<TruncateSequencePairOp>>(
-                    *m, "TruncateSequencePairOp", "Tensor operation to truncate two tensors to a max_length")
+                    *m, "TruncateSequencePairOp")
                     .def(py::init<int64_t>());
                 }));
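
Summary of the pattern: in every hunk above, the diff drops the binding-level docstring and the py::arg(...) = kDef... default clauses, leaving a bare py::init<...>() registration, presumably so that argument names, defaults, and validation are handled in the Python wrapper layer instead of in C++. Below is a minimal, self-contained sketch of the before/after shape; ExampleTokenizerOp and the module name example_bindings are hypothetical stand-ins, not ops from this file, and the sketch assumes only stock pybind11.

// Hypothetical stand-in op; not part of the file above.
#include <memory>
#include <pybind11/pybind11.h>

namespace py = pybind11;

class ExampleTokenizerOp {
 public:
  explicit ExampleTokenizerOp(const bool &with_offsets) : with_offsets_(with_offsets) {}
  bool with_offsets() const { return with_offsets_; }

 private:
  bool with_offsets_;
};

PYBIND11_MODULE(example_bindings, m) {
  // Old style: names and defaults lived in the binding itself:
  //   .def(py::init<const bool &>(), py::arg("with_offsets") = false);
  // New style (as in the diff): expose only the bare constructor and let the
  // Python wrapper supply argument names and defaults before calling in.
  (void)py::class_<ExampleTokenizerOp, std::shared_ptr<ExampleTokenizerOp>>(m, "ExampleTokenizerOp")
    .def(py::init<const bool &>())
    .def("with_offsets", &ExampleTokenizerOp::with_offsets);
}

From Python, a new-style binding is then constructed positionally, e.g. example_bindings.ExampleTokenizerOp(True), with any defaults filled in by the wrapper before the call crosses into C++.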