parent
8e4c0a9d93
commit
18b519ae0f
@ -0,0 +1,25 @@
|
||||
# Third-party package: Google SentencePiece v0.1.92 (tokenizer + trainer libs).
# Windows builds use the protobuf bundled with sentencepiece; other platforms
# link against the project's own protobuf and apply a local patch.
if(WIN32)
    set(sentencepiece_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 -Wno-unused-result -Wno-stringop-overflow -Wno-format-extra-args -Wno-format")
    set(sentencepiece_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
    mindspore_add_pkg(sentencepiece
            VER 0.1.92
            LIBS sentencepiece sentencepiece_train
            URL https://github.com/google/sentencepiece/archive/v0.1.92.tar.gz
            CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DSPM_USE_BUILTIN_PROTOBUF=ON
            MD5 5dfd2241914b5598a68b2a8542ed8e91
            )
else()
    set(sentencepiece_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 -Wno-unused-result -Wno-sign-compare")
    set(sentencepiece_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
    # SPM_ENABLE_SHARED=OFF: static libs; PROTOBUF_INC comes from the protobuf
    # package configured earlier in the build.
    mindspore_add_pkg(sentencepiece
            VER 0.1.92
            LIBS sentencepiece sentencepiece_train
            URL https://github.com/google/sentencepiece/archive/v0.1.92.tar.gz
            CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DSPM_USE_BUILTIN_PROTOBUF=OFF -DSPM_ENABLE_SHARED=OFF -DPROTOBUF_INC=${protobuf_INC}
            MD5 5dfd2241914b5598a68b2a8542ed8e91
            PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/sentencepiece/sentencepiece.patch001
            )
endif()
include_directories(${sentencepiece_INC})
# Namespaced aliases so consumers link mindspore::sentencepiece[_train].
add_library(mindspore::sentencepiece ALIAS sentencepiece::sentencepiece)
add_library(mindspore::sentencepiece_train ALIAS sentencepiece::sentencepiece_train)
|
@ -0,0 +1,193 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
|
||||
|
||||
#include <iomanip>
|
||||
#include "minddata/dataset/core/config_manager.h"
|
||||
#include "minddata/dataset/engine/opt/pass.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
// Constructor: records the training configuration and creates the internal
// queue that feeds sentences from the pipeline thread to the trainer thread.
// NOTE: the init list now follows the member declaration order in the header
// (read_done_, ret_status_, vocab_size_, character_coverage_, model_type_,
// params_, vocab_, col_names_, col_id_) so initialization order matches the
// textual order and -Wreorder stays quiet; read_done_/ret_status_ are
// initialized in the list instead of assigned in the body.
BuildSentencePieceVocabOp::BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab,
                                                     std::vector<std::string> col_names, uint32_t vocab_size,
                                                     float character_coverage, SentencePieceModel model_type,
                                                     const std::unordered_map<std::string, std::string> &params,
                                                     int32_t op_conn_size)
    : PipelineOp(op_conn_size),
      read_done_(false),
      ret_status_(Status::OK()),
      vocab_size_(vocab_size),
      character_coverage_(character_coverage),
      model_type_(model_type),
      params_(params),
      vocab_(std::move(vocab)),          // by-value param: move instead of copying the shared_ptr
      col_names_(std::move(col_names)),  // by-value param: move instead of copying the vector
      col_id_(0) {
  sentence_queue_ = std::make_unique<Queue<TensorRow>>(op_conn_size);
}
|
||||
|
||||
// Master entry point: spawns the trainer thread, then pumps every TensorRow
// from the child into sentence_queue_. An empty row is pushed at the end as
// the quit sentinel for the consumer (see Next()).
Status BuildSentencePieceVocabOp::operator()() {
  RETURN_UNEXPECTED_IF_NULL(tree_);
  RETURN_IF_NOT_OK(sentence_queue_->Register(tree_->AllTasks()));
  RETURN_IF_NOT_OK(
    tree_->AllTasks()->CreateAsyncTask("sentenceTask", std::bind(&BuildSentencePieceVocabOp::SentenceThread, this)));
  TaskManager::FindMe()->Post();
  child_iterator_ = std::make_unique<ChildIterator>(this, 0, 0);
  TensorRow new_row;
  RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));

  bool eoe_warning = false;  // give out warning if receive more than 1 eoe
  while (child_iterator_->eof_handled() == false) {
    while (new_row.empty() == false) {
      RETURN_IF_NOT_OK(sentence_queue_->EmplaceBack(new_row));
      RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
    }
    // A second EOE means a repeat op sits above us, which is unsupported.
    CHECK_FAIL_RETURN_UNEXPECTED(!eoe_warning, "no op should be after from_dataset (repeat detected)");
    eoe_warning = true;
  }
  // Push an empty TensorRow as the quit sentinel.
  // Fix: the status of this EmplaceBack was previously discarded; propagate it
  // like every other queue operation so a failed push is not silently lost.
  TensorRow empty_row = {};
  RETURN_IF_NOT_OK(sentence_queue_->EmplaceBack(empty_row));
  return Status::OK();
}
|
||||
|
||||
// Trainer thread: resolves the text column, then runs SentencePiece training
// over the sentences streamed in by operator()(), storing the resulting model
// proto into vocab_ and signalling EOE/EOF downstream.
Status BuildSentencePieceVocabOp::SentenceThread() {
  TaskManager::FindMe()->Post();
  if (col_names_.empty() == true) {
    // No column given: default to the conventional "text" column.
    auto itr = column_name_id_map_.find("text");
    CHECK_FAIL_RETURN_UNEXPECTED(itr != column_name_id_map_.end(),
                                 "'text' column doesn't exist when column name is empty");
    col_id_ = itr->second;
  } else {
    auto itr = column_name_id_map_.find(col_names_[0]);
    // Fix: a space was missing between the column name and the message text.
    CHECK_FAIL_RETURN_UNEXPECTED(itr != column_name_id_map_.end(), col_names_[0] + " column doesn't exist");
    col_id_ = itr->second;
  }
  // Fix: validate the output vocab before running the (expensive) training
  // pass instead of discovering the nullptr only after training succeeds.
  if (vocab_ == nullptr) {
    return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "sentencepiece vocab ptr must not be nullptr");
  }
  std::unique_ptr<DatasetSentenceIterator> sentence_iter = std::make_unique<DatasetSentenceIterator>(this);
  std::string model_proto;
  sentencepiece::util::Status s_status =
    sentencepiece::SentencePieceTrainer::Train(BuildParams(), sentence_iter.get(), &model_proto);
  if (!s_status.ok()) {
    return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, s_status.message());
  }
  vocab_->set_model_proto(model_proto);
  RETURN_IF_NOT_OK(out_connector_->Add(0, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE)));
  RETURN_IF_NOT_OK(out_connector_->Add(0, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)));
  return Status::OK();
}
|
||||
|
||||
std::unordered_map<std::string, std::string> BuildSentencePieceVocabOp::BuildParams() {
|
||||
std::unordered_map<std::string, std::string> ret_params;
|
||||
ret_params["vocab_size"] = std::to_string(vocab_size_);
|
||||
ret_params["character_coverage"] = std::to_string(character_coverage_);
|
||||
if (model_type_ == SentencePieceModel::kBpe) {
|
||||
ret_params["model_type"] = "BPE";
|
||||
} else if (model_type_ == SentencePieceModel::kChar) {
|
||||
ret_params["model_type"] = "CHAR";
|
||||
} else if (model_type_ == SentencePieceModel::kWord) {
|
||||
ret_params["model_type"] = "WORD";
|
||||
} else {
|
||||
ret_params["model_type"] = "UNIGRAM";
|
||||
}
|
||||
// filter some params that set by function param
|
||||
// filter model_prefix that must be empty
|
||||
for (auto param : params_) {
|
||||
std::string key = param.first;
|
||||
if (key == "input" || key == "vocab_size" || key == "model_prefix" || key == "character_coverage" ||
|
||||
key == "model_type") {
|
||||
continue;
|
||||
}
|
||||
ret_params[key] = param.second;
|
||||
}
|
||||
|
||||
ret_params["model_prefix"] = "";
|
||||
ret_params["minloglevel"] = "1";
|
||||
return ret_params;
|
||||
}
|
||||
|
||||
bool BuildSentencePieceVocabOp::Done() { return read_done_; }
|
||||
|
||||
void BuildSentencePieceVocabOp::Next(std::string *sentence) {
|
||||
TensorRow new_row;
|
||||
Status s = sentence_queue_->PopFront(&new_row);
|
||||
|
||||
if (s.IsError()) {
|
||||
read_done_ = true;
|
||||
ret_status_ = s;
|
||||
return;
|
||||
}
|
||||
if (new_row.empty() == true) {
|
||||
read_done_ = true;
|
||||
ret_status_ = Status::OK();
|
||||
return;
|
||||
}
|
||||
|
||||
if (new_row[col_id_]->type().IsNumeric() || new_row[col_id_]->Rank() > 1) {
|
||||
ret_status_ = Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
|
||||
"for dataset only words on string columns or must bu scalar");
|
||||
read_done_ = true;
|
||||
return;
|
||||
}
|
||||
|
||||
std::string_view sentence_v;
|
||||
new_row[col_id_]->GetItemAt(&sentence_v, {});
|
||||
|
||||
std::string st{sentence_v};
|
||||
*sentence = st;
|
||||
ret_status_ = Status::OK();
|
||||
}
|
||||
|
||||
// Pre-Visitor accept method for NodePass
|
||||
// Pre-Visitor accept method for NodePass
// @param p - the NodePass visiting this node
// @param modified - out: set by the pass if the tree was changed
Status BuildSentencePieceVocabOp::PreAccept(NodePass *p, bool *modified) {
  // Downcast shared pointer then call the pre-visitation
  return p->PreRunOnNode(shared_from_base<BuildSentencePieceVocabOp>(), modified);
}
|
||||
|
||||
// Materializes the op from the accumulated builder settings.
// @param op - out: the constructed BuildSentencePieceVocabOp
// @return Status - always OK; no validation is performed here
Status BuildSentencePieceVocabOp::Builder::Build(std::shared_ptr<BuildSentencePieceVocabOp> *op) {
  (*op) = std::make_shared<BuildSentencePieceVocabOp>(builder_vocab_, builder_col_names_, builder_vocab_size_,
                                                      builder_character_coverage_, builder_model_type_, builder_params_,
                                                      builder_connector_size_);
  return Status::OK();
}
|
||||
|
||||
// Builder constructor: seeds the connector size from the global config.
// NOTE(review): the other builder_ members (vocab size, coverage, model type,
// vocab pointer) are left uninitialized here and must be set via the setters
// before Build() is called — confirm callers always do so.
BuildSentencePieceVocabOp::Builder::Builder() {
  std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
  builder_connector_size_ = cfg->op_connector_size();
}
|
||||
|
||||
// Adapter constructor: keeps a non-owning pointer back to the op so done()/
// Next() can pull sentences from its queue.
BuildSentencePieceVocabOp::DatasetSentenceIterator::DatasetSentenceIterator(BuildSentencePieceVocabOp *s_p_vocab_ptr)
    : s_p_vocab_ptr_(s_p_vocab_ptr) {}
|
||||
|
||||
bool BuildSentencePieceVocabOp::DatasetSentenceIterator::done() const {
|
||||
if (s_p_vocab_ptr_ == nullptr) {
|
||||
return true;
|
||||
}
|
||||
return s_p_vocab_ptr_->Done();
|
||||
}
|
||||
|
||||
// Advances the iterator by asking the op for the next sentence; a missing op
// makes this a no-op (done() will then report true).
void BuildSentencePieceVocabOp::DatasetSentenceIterator::Next() {
  if (s_p_vocab_ptr_ != nullptr) {
    s_p_vocab_ptr_->Next(&value_);
  }
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
@ -0,0 +1,186 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef DATASET_ENGINE_DATASETOPS_BUILD_SENTENCE_VOCAB_OP_H_
|
||||
#define DATASET_ENGINE_DATASETOPS_BUILD_SENTENCE_VOCAB_OP_H_
|
||||
|
||||
#include <sentencepiece_trainer.h>
|
||||
#include <sentencepiece_processor.h>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "minddata/dataset/core/tensor.h"
|
||||
#include "minddata/dataset/engine/dataset_iterator.h"
|
||||
#include "minddata/dataset/engine/datasetops/pipeline_op.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
#include "minddata/dataset/util/queue.h"
|
||||
#include "minddata/dataset/text/sentence_piece_vocab.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
namespace py = pybind11;
|
||||
|
||||
// Pipeline op that trains a SentencePiece vocabulary from the text column of
// its child dataset. A master thread streams TensorRows into an internal
// queue while a worker thread feeds them to the SentencePiece trainer through
// the DatasetSentenceIterator adapter.
class BuildSentencePieceVocabOp : public PipelineOp {
 public:
  // Fluent builder for BuildSentencePieceVocabOp.
  class Builder {
   public:
    Builder();

    // Destructor.
    ~Builder() = default;

    // Setter method
    // @param uint32_t size - output connector queue capacity
    // @return Builder setter method returns reference to the builder.
    Builder &SetOpConnectorSize(uint32_t size) {
      builder_connector_size_ = size;
      return *this;
    }

    // Setter method
    // @param uint32_t size - target vocabulary size for training
    // @return Builder & reference to builder class object
    Builder &SetVocabSize(uint32_t size) {
      builder_vocab_size_ = size;
      return *this;
    }

    // Setter method
    // @param float character_coverage - fraction of characters the model must cover
    // @return Builder & reference to builder class object
    Builder &SetCharacterCoverage(float character_coverage) {
      builder_character_coverage_ = character_coverage;
      return *this;
    }

    // Setter method
    // @param SentencePieceModel model_type - training algorithm (UNIGRAM/BPE/CHAR/WORD)
    // @return Builder & reference to builder class object
    Builder &SetModelType(SentencePieceModel model_type) {
      builder_model_type_ = model_type;
      return *this;
    }

    // Setter method
    // @param std::unordered_map<std::string, std::string> params - extra trainer options
    // @return Builder & reference to builder class object
    Builder &SetParams(std::unordered_map<std::string, std::string> params) {
      builder_params_ = params;
      return *this;
    }

    // Setter method
    // @param std::shared_ptr<SentencePieceVocab> vocab - destination vocab object
    // @return Builder & reference to builder class object
    Builder &SetVocab(std::shared_ptr<SentencePieceVocab> vocab) {
      builder_vocab_ = vocab;
      return *this;
    }

    // set columns names
    // @param const std::vector<std::string> & col_names - name of columns to get words
    // @return Builder & reference to builder class object
    Builder &SetColumnNames(const std::vector<std::string> &col_names) {
      builder_col_names_ = col_names;
      return *this;
    }

    // The builder "build" method creates the final object.
    // @param std::shared_ptr<BuildSentencePieceVocabOp> *op - out: constructed op
    // @return - The error code return
    Status Build(std::shared_ptr<BuildSentencePieceVocabOp> *op);

   private:
    uint32_t builder_connector_size_;
    uint32_t builder_vocab_size_;
    float builder_character_coverage_;
    SentencePieceModel builder_model_type_;
    std::unordered_map<std::string, std::string> builder_params_;
    std::vector<std::string> builder_col_names_;
    std::shared_ptr<SentencePieceVocab> builder_vocab_;
  };

 public:
  // Adapter exposing the op's sentence stream through sentencepiece's
  // SentenceIterator interface; holds a non-owning pointer to the op.
  class DatasetSentenceIterator : public sentencepiece::SentenceIterator {
   public:
    explicit DatasetSentenceIterator(BuildSentencePieceVocabOp *s_p_vocab_ptr);
    ~DatasetSentenceIterator() {}

    bool done() const override;
    void Next() override;
    const std::string &value() const override { return value_; }
    // Always OK: errors are surfaced through the op's ret_status_ instead.
    sentencepiece::util::Status status() const override { return sentencepiece::util::Status(); }

   private:
    std::string value_;
    BuildSentencePieceVocabOp *s_p_vocab_ptr_;  // non-owning back-pointer
  };

  BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names,
                            uint32_t vocab_size, float character_coverage, SentencePieceModel model_type,
                            const std::unordered_map<std::string, std::string> &params, int32_t op_conn_size);

  ~BuildSentencePieceVocabOp() = default;

  // the thread for sentence train
  Status SentenceThread();

  Status EofReceived(int32_t) override { return Status::OK(); }

  Status EoeReceived(int32_t) override { return Status::OK(); }

  Status operator()() override;

  // Getter
  // @return the number of workers
  int32_t num_producers() const override { return 1; }

  // Getter
  // @return the number of threads consuming from the previous Connector
  int32_t num_consumers() const override { return 1; }

  Status Reset() override { RETURN_STATUS_UNEXPECTED("Reset shouldn't be called in BuildSentencePieceVocabOp"); }

  // build the input params for sentence api
  std::unordered_map<std::string, std::string> BuildParams();

  // True once the sentence stream is exhausted or errored (see Next()).
  bool Done();
  // Pops the next sentence for the trainer; sets read_done_/ret_status_ on
  // sentinel, error, or unsupported column type.
  void Next(std::string *sentence);

  /// \param[in] p The node to visit
  /// \param[out] modified Indicator if the node was modified
  /// \return Status of the node visit
  Status PreAccept(NodePass *p, bool *modified) override;

 private:
  bool read_done_;     // set when the quit sentinel or an error is seen
  Status ret_status_;  // status of the most recent Next() call
  uint32_t vocab_size_;
  float character_coverage_;
  SentencePieceModel model_type_;
  std::unordered_map<std::string, std::string> params_;
  std::shared_ptr<SentencePieceVocab> vocab_;
  std::vector<std::string> col_names_;
  uint32_t col_id_;
  std::unique_ptr<ChildIterator> child_iterator_;     // child iterator for fetching TensorRows 1 by 1
  std::unique_ptr<Queue<TensorRow>> sentence_queue_;  // master thread assigns each worker TensorRow via this
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_ENGINE_DATASETOPS_BUILD_SENTENCE_VOCAB_OP_H_
|
@ -0,0 +1,99 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "common/utils.h"
|
||||
#include "minddata/dataset/util/path.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
// Constructor for the in-memory case: the serialized model proto is taken
// from the supplied vocab on each Compute() call.
// Fix: init list reordered to match the member declaration order in the
// header (out_type_, vocab_, load_type_) so initialization order matches the
// textual order and -Wreorder stays quiet.
SentencePieceTokenizerOp::SentencePieceTokenizerOp(const std::shared_ptr<SentencePieceVocab> vocab,
                                                   const SPieceTokenizerLoadType load_type,
                                                   const SPieceTokenizerOutType out_type)
    : out_type_(out_type), vocab_(vocab), load_type_(load_type) {}
|
||||
|
||||
// Constructor for the file-backed case: resolves model_path/model_filename to
// an absolute path that Compute() loads the model from.
// Fix: init list reordered to match member declaration order (-Wreorder).
// NOTE(review): the Status from GetModelRealPath is deliberately discarded
// because constructors cannot return one; on failure file_path_ stays empty
// and Compute()'s Load() will report the error instead.
SentencePieceTokenizerOp::SentencePieceTokenizerOp(const std::string &model_path, const std::string &model_filename,
                                                   const SPieceTokenizerLoadType load_type,
                                                   const SPieceTokenizerOutType out_type)
    : out_type_(out_type), load_type_(load_type) {
  (void)GetModelRealPath(model_path, model_filename);
}
|
||||
|
||||
// Tokenizes one scalar string tensor with SentencePiece, emitting either a
// 1-D string tensor of pieces or a 1-D int tensor of ids depending on
// out_type_.
// NOTE(review): the model is (re)loaded from file / from the vocab's proto on
// every Compute() call — loading once up front would likely be cheaper;
// confirm against the op's threading model before changing.
Status SentencePieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  // Only rank-0 (scalar) string tensors are accepted.
  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
    RETURN_STATUS_UNEXPECTED("the input tensor should be scalar string tensor");
  }

  std::string_view sentence_v;
  RETURN_IF_NOT_OK(input->GetItemAt(&sentence_v, {}));
  std::string sentence{sentence_v};
  if (load_type_ == SPieceTokenizerLoadType::kFile) {
    // Model comes from the path resolved in the file-based constructor.
    auto status = processor_.Load(file_path_);
    if (!status.ok()) {
      RETURN_STATUS_UNEXPECTED("load sentence piece model failed.");
    }
  } else {
    // Model comes from the serialized proto held by the vocab object.
    RETURN_UNEXPECTED_IF_NULL(vocab_);
    auto status = processor_.LoadFromSerializedProto(vocab_.get()->model_proto());
    if (!status.ok()) {
      RETURN_STATUS_UNEXPECTED("sentence piece load model failed.");
    }
  }

  if (out_type_ == SPieceTokenizerOutType::kString) {
    // Encode to subword piece strings.
    std::vector<std::string> pieces;
    auto status = processor_.Encode(sentence, &pieces);
    if (!status.ok()) {
      RETURN_STATUS_UNEXPECTED("sentence piece tokenizer error");
    }
    // NOTE(review): the unique_ptr converts to the shared_ptr output here;
    // the int branch below uses Tensor::CreateTensor instead — presumably
    // equivalent, but the inconsistency is worth unifying.
    *output = std::make_unique<Tensor>(pieces, TensorShape({(dsize_t)pieces.size()}));
  } else {
    // Encode to integer vocabulary ids.
    std::vector<int> ids;
    auto status = processor_.Encode(sentence, &ids);
    if (!status.ok()) {
      RETURN_STATUS_UNEXPECTED("sentence piece tokenizer error");
    }
    RETURN_IF_NOT_OK(Tensor::CreateTensor(output, ids, TensorShape({(dsize_t)ids.size()})));
  }
  return Status::OK();
}
|
||||
|
||||
// Resolves model_path to an absolute path and stores directory/filename into
// file_path_ for later loading.
// Fix: the length guard previously checked file_path_.size() — but file_path_
// is still empty when this is called from the constructor, so the check was a
// no-op. It now validates the actual model_path argument.
Status SentencePieceTokenizerOp::GetModelRealPath(const std::string &model_path, const std::string &filename) {
  char real_path[PATH_MAX] = {0};
  if (model_path.size() >= PATH_MAX) {
    RETURN_STATUS_UNEXPECTED("sentence piece model path is invalid.");
  }
#if defined(_WIN32) || defined(_WIN64)
  if (_fullpath(real_path, common::SafeCStr(model_path), PATH_MAX) == nullptr) {
    RETURN_STATUS_UNEXPECTED("sentence piece model path is invalid.");
  }
#else
  // realpath also verifies the path exists and canonicalizes symlinks.
  if (realpath(common::SafeCStr(model_path), real_path) == nullptr) {
    RETURN_STATUS_UNEXPECTED("sentence piece model path is invalid.");
  }
#endif
  std::string abs_path = real_path;
  file_path_ = (Path(abs_path) / Path(filename)).toString();
  return Status::OK();
}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
@ -0,0 +1,65 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef DATASET_SENTENCE_PIECE_TOKENIZER_OP_H
|
||||
#define DATASET_SENTENCE_PIECE_TOKENIZER_OP_H
|
||||
|
||||
#include <sentencepiece_processor.h>
|
||||
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
#include "minddata/dataset/kernels/tensor_op.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
#include "minddata/dataset/text/sentence_piece_vocab.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
enum class SPieceTokenizerOutType { kString = 0, kInt = 1 };
|
||||
enum class SPieceTokenizerLoadType { kFile = 0, kModel = 1 };
|
||||
|
||||
class SentencePieceTokenizerOp : public TensorOp {
|
||||
public:
|
||||
SentencePieceTokenizerOp(const std::shared_ptr<SentencePieceVocab> vocab, SPieceTokenizerLoadType load_type,
|
||||
const SPieceTokenizerOutType out_type);
|
||||
|
||||
SentencePieceTokenizerOp(const std::string &model_path, const std::string &model_filename,
|
||||
const SPieceTokenizerLoadType load_type, const SPieceTokenizerOutType out_type);
|
||||
|
||||
~SentencePieceTokenizerOp() override = default;
|
||||
|
||||
Status GetModelRealPath(const std::string &model_path, const std::string &filename);
|
||||
|
||||
void Print(std::ostream &out) const override {
|
||||
out << "SentencePieceTokenizerOp out_type = " << out_type_ << " load_type = " << load_type_;
|
||||
}
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
std::string Name() const override { return kSentencepieceTokenizerOp; }
|
||||
|
||||
protected:
|
||||
SPieceTokenizerOutType out_type_;
|
||||
std::shared_ptr<SentencePieceVocab> vocab_;
|
||||
std::string file_path_;
|
||||
SPieceTokenizerLoadType load_type_;
|
||||
sentencepiece::SentencePieceProcessor processor_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // DATASET_SENTENCE_PIECE_TOKENIZER_OP_H
|
@ -0,0 +1,112 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "minddata/dataset/text/sentence_piece_vocab.h"
|
||||
|
||||
#include <sentencepiece_trainer.h>
|
||||
#include <sentencepiece_processor.h>
|
||||
#include <fstream>
|
||||
|
||||
#include "common/utils.h"
|
||||
#include "minddata/dataset/util/path.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
SentencePieceVocab::SentencePieceVocab() : model_proto_("") {}
|
||||
|
||||
// Trains a SentencePiece model from the given text files and wraps the
// resulting serialized model proto in a new SentencePieceVocab.
// @param path_list - input text files (must be non-empty)
// @param vocab_size / character_coverage / model_type - core trainer options
// @param params - extra trainer options; keys controlled by the arguments
//                 above are ignored
// @param vocab - out: the freshly built vocab
Status SentencePieceVocab::BuildFromFile(const std::vector<std::string> &path_list, const int vocab_size,
                                         const float character_coverage, const SentencePieceModel model_type,
                                         const std::unordered_map<std::string, std::string> &params,
                                         std::shared_ptr<SentencePieceVocab> *vocab) {
  // Fix: an empty path_list previously reached input_str.pop_back() on an
  // empty string, which is undefined behavior. Reject it up front.
  if (path_list.empty()) {
    RETURN_STATUS_UNEXPECTED("sentence piece BuildFromFile needs at least one input file.");
  }
  std::unordered_map<std::string, std::string> unorder_map;

  // The trainer's "input" option is a comma-separated file list.
  std::string input_str;
  for (const auto &path : path_list) {
    if (!input_str.empty()) {
      input_str += ",";
    }
    input_str += path;
  }
  unorder_map["input"] = input_str;
  unorder_map["vocab_size"] = std::to_string(vocab_size);
  unorder_map["model_prefix"] = "";  // must be empty: no model file on disk
  unorder_map["character_coverage"] = std::to_string(character_coverage);
  if (model_type == SentencePieceModel::kWord) {
    unorder_map["model_type"] = "WORD";
  } else if (model_type == SentencePieceModel::kBpe) {
    unorder_map["model_type"] = "BPE";
  } else if (model_type == SentencePieceModel::kChar) {
    unorder_map["model_type"] = "CHAR";
  } else {
    unorder_map["model_type"] = "UNIGRAM";
  }

  // Copy through user params, filtering the keys fixed by function arguments
  // (and model_prefix, which must stay empty).
  // Fix: iterate by const reference instead of copying each pair.
  for (const auto &param : params) {
    const std::string &key = param.first;
    if (key == "input" || key == "vocab_size" || key == "model_prefix" || key == "character_coverage" ||
        key == "model_type") {
      continue;
    }
    unorder_map[key] = param.second;
  }

  // Quiet the sentencepiece library's logging (overrides any user value).
  // Fix: this was previously assigned twice; once is enough.
  unorder_map["minloglevel"] = "1";
  *vocab = std::make_shared<SentencePieceVocab>();
  std::string model_proto;
  sentencepiece::util::Status s_status = sentencepiece::SentencePieceTrainer::Train(unorder_map, nullptr, &model_proto);
  if (!s_status.ok()) {
    return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, s_status.message());
  }
  vocab->get()->set_model_proto(model_proto);

  return Status::OK();
}
|
||||
|
||||
// Writes the vocab's serialized model proto to <path>/<filename>.
// @param vocab - vocab whose proto is written (must be non-null)
// @param path - existing directory to write into
// @param filename - target file name
Status SentencePieceVocab::SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, std::string path,
                                     std::string filename) {
  // Fix: guard against a null vocab pointer before dereferencing it below.
  if (vocab == nullptr || *vocab == nullptr) {
    RETURN_STATUS_UNEXPECTED("sentence piece vocab ptr must not be nullptr.");
  }
  char real_path[PATH_MAX] = {0};

  if (path.size() >= PATH_MAX) {
    RETURN_STATUS_UNEXPECTED("sentence model path is invalid.");
  }
#if defined(_WIN32) || defined(_WIN64)
  if (_fullpath(real_path, common::SafeCStr(path), PATH_MAX) == nullptr) {
    RETURN_STATUS_UNEXPECTED("sentence model path is invalid.");
  }
#else
  if (realpath(common::SafeCStr(path), real_path) == nullptr) {
    RETURN_STATUS_UNEXPECTED("sentence model path is invalid.");
  }
#endif

  std::string abs_real_path = (Path(real_path) / Path(filename)).toString();
  // Fix: the model proto is binary data — open in binary mode so Windows does
  // not mangle it with newline translation, and surface open/write failures
  // instead of silently returning OK.
  std::ofstream os_file(abs_real_path, std::ios::out | std::ios::binary);
  if (!os_file.is_open()) {
    RETURN_STATUS_UNEXPECTED("fail to open sentence model file: " + abs_real_path);
  }
  (void)os_file.write(vocab->get()->model_proto().data(), vocab->get()->model_proto().size());
  os_file.close();
  if (os_file.fail()) {
    RETURN_STATUS_UNEXPECTED("fail to write sentence model file: " + abs_real_path);
  }
  return Status::OK();
}
|
||||
|
||||
const std::string &SentencePieceVocab::model_proto() { return model_proto_; }
|
||||
|
||||
void SentencePieceVocab::set_model_proto(const std::string model_proto) { model_proto_ = model_proto; }
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
@ -0,0 +1,49 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_
|
||||
#define DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_
|
||||
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include "minddata/dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
// Training algorithm selector for SentencePiece (maps to the trainer's
// "model_type" option: UNIGRAM / BPE / CHAR / WORD).
enum class SentencePieceModel { kUnigram = 0, kBpe = 1, kChar = 2, kWord = 3 };
// Thin holder for a serialized SentencePiece model proto, with static helpers
// to train one from text files and to save it to disk.
class SentencePieceVocab {
 public:
  // Trains a model over path_list and returns it wrapped in *vocab.
  static Status BuildFromFile(const std::vector<std::string> &path_list, const int vocab_size,
                              const float character_coverage, const SentencePieceModel model_type,
                              const std::unordered_map<std::string, std::string> &params,
                              std::shared_ptr<SentencePieceVocab> *vocab);
  // Writes the vocab's model proto to <path>/<filename>.
  static Status SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, std::string path, std::string filename);
  SentencePieceVocab();

  ~SentencePieceVocab() = default;

  // Serialized model proto; empty until set_model_proto is called.
  const std::string &model_proto();

  void set_model_proto(const std::string model_proto);

 private:
  std::string model_proto_;  // serialized sentencepiece ModelProto bytes
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue