Clean up work for text python package

pull/1365/head
hesham 5 years ago
parent df361d1d26
commit 6c21e556c4

@@ -52,7 +52,7 @@ add_subdirectory(core)
 add_subdirectory(kernels)
 add_subdirectory(engine)
 add_subdirectory(api)
-add_subdirectory(nlp)
+add_subdirectory(text)
 ######################################################################
 ################### Create _c_dataengine Library ######################
@@ -62,15 +62,14 @@ set(submodules
     $<TARGET_OBJECTS:kernels>
     $<TARGET_OBJECTS:kernels-image>
     $<TARGET_OBJECTS:kernels-data>
-    $<TARGET_OBJECTS:kernels-text>
     $<TARGET_OBJECTS:APItoPython>
     $<TARGET_OBJECTS:engine-datasetops-source>
     $<TARGET_OBJECTS:engine-datasetops-source-sampler>
     $<TARGET_OBJECTS:engine-datasetops>
     $<TARGET_OBJECTS:engine-opt>
     $<TARGET_OBJECTS:engine>
-    $<TARGET_OBJECTS:nlp>
-    $<TARGET_OBJECTS:nlp-kernels>
+    $<TARGET_OBJECTS:text>
+    $<TARGET_OBJECTS:text-kernels>
 )
 if (ENABLE_TDTQUE)

@@ -38,10 +38,6 @@
 #include "dataset/kernels/image/resize_op.h"
 #include "dataset/kernels/image/uniform_aug_op.h"
 #include "dataset/kernels/data/type_cast_op.h"
-#include "dataset/kernels/text/jieba_tokenizer_op.h"
-#include "dataset/kernels/text/unicode_char_tokenizer_op.h"
-#include "dataset/nlp/vocab.h"
-#include "dataset/nlp/kernels/lookup_op.h"
 #include "dataset/engine/datasetops/source/cifar_op.h"
 #include "dataset/engine/datasetops/source/image_folder_op.h"
 #include "dataset/engine/datasetops/source/io_block.h"
@@ -62,6 +58,10 @@
 #include "dataset/engine/datasetops/source/text_file_op.h"
 #include "dataset/engine/datasetops/source/voc_op.h"
 #include "dataset/kernels/data/to_float16_op.h"
+#include "dataset/text/kernels/jieba_tokenizer_op.h"
+#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
+#include "dataset/text/vocab.h"
+#include "dataset/text/kernels/lookup_op.h"
 #include "dataset/util/random.h"
 #include "mindrecord/include/shard_operator.h"
 #include "mindrecord/include/shard_pk_sample.h"
@@ -549,9 +549,9 @@ PYBIND11_MODULE(_c_dataengine, m) {
       .value("TEXTFILE", OpName::kTextFile);
   (void)py::enum_<JiebaMode>(m, "JiebaMode", py::arithmetic())
-      .value("DE_INTER_JIEBA_MIX", JiebaMode::kMix)
-      .value("DE_INTER_JIEBA_MP", JiebaMode::kMp)
-      .value("DE_INTER_JIEBA_HMM", JiebaMode::kHmm)
+      .value("DE_JIEBA_MIX", JiebaMode::kMix)
+      .value("DE_JIEBA_MP", JiebaMode::kMp)
+      .value("DE_JIEBA_HMM", JiebaMode::kHmm)
       .export_values();
   (void)py::enum_<InterpolationMode>(m, "InterpolationMode", py::arithmetic())

@@ -2,7 +2,6 @@ add_subdirectory(image)
 add_subdirectory(data)
 file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
-add_subdirectory(text)
 add_library(kernels OBJECT
     py_func_op.cc
     tensor_op.cc)

@@ -1,7 +0,0 @@
-add_subdirectory(kernels)
-
-add_library(nlp OBJECT
-    vocab.cc
-)
-
-add_dependencies(nlp nlp-kernels)

@@ -1,3 +0,0 @@
-add_library(nlp-kernels OBJECT
-    lookup_op.cc
-)

@@ -0,0 +1,7 @@
+add_subdirectory(kernels)
+
+add_library(text OBJECT
+    vocab.cc
+)
+
+add_dependencies(text text-kernels)

@@ -1,6 +1,7 @@
 file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
-add_library(kernels-text OBJECT
+add_library(text-kernels OBJECT
+    lookup_op.cc
     jieba_tokenizer_op.cc
     unicode_char_tokenizer_op.cc
 )

@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "dataset/kernels/text/jieba_tokenizer_op.h"
+#include "dataset/text/kernels/jieba_tokenizer_op.h"
 #include <vector>
 #include <memory>

@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "dataset/nlp/kernels/lookup_op.h"
+#include "dataset/text/kernels/lookup_op.h"
 #include <string>

@@ -24,7 +24,7 @@
 #include "dataset/core/tensor.h"
 #include "dataset/kernels/tensor_op.h"
 #include "dataset/util/status.h"
-#include "dataset/nlp/vocab.h"
+#include "dataset/text/vocab.h"
 namespace mindspore {
 namespace dataset {

@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "dataset/kernels/text/unicode_char_tokenizer_op.h"
+#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
 #include <memory>
 #include <string>
 #include <string_view>

@@ -17,7 +17,7 @@
 #include <map>
 #include <utility>
-#include "dataset/nlp/vocab.h"
+#include "dataset/text/vocab.h"
 namespace mindspore {
 namespace dataset {

@@ -284,10 +284,10 @@ class Dataset:
         Examples:
             >>> import mindspore.dataset as ds
-            >>> import mindspore.dataset.transforms.text.utils as text
+            >>> import mindspore.dataset.text as text
             >>> # declare a function which returns a Dataset object
             >>> def flat_map_func(x):
-            >>>     data_dir = text.as_text(x[0])
+            >>>     data_dir = text.to_str(x[0])
             >>>     d = ds.ImageFolderDatasetV2(data_dir)
             >>>     return d
             >>> # data is a Dataset object

@@ -15,5 +15,5 @@
 """
 mindspore.dataset.text
 """
-from .c_transforms import *
+from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer
+from .utils import to_str, to_bytes, JiebaMode, Vocab
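For orientation, the lines above are the whole public surface of the reworked package. A minimal smoke test of that surface, assuming a build with this commit applied; nothing here is beyond the names the new `__init__.py` exports:

    import mindspore.dataset.text as text

    mode = text.JiebaMode.MIX                              # IntEnum, now in dataset.text.utils
    ops = (text.Lookup, text.JiebaTokenizer, text.UnicodeCharTokenizer)
    helpers = (text.to_str, text.to_bytes, text.Vocab)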

@@ -11,20 +11,40 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ==============================================================================
 """
-This module c_transforms provides common nlp operations.
+c transforms for all text related operators
 """
 import os
 import re
 import mindspore._c_dataengine as cde
 from .utils import JiebaMode
-from .validators import check_jieba_add_dict, check_jieba_add_word, check_jieba_init
+from .validators import check_lookup, check_jieba_add_dict, \
+    check_jieba_add_word, check_jieba_init
+
+
+class Lookup(cde.LookupOp):
+    """
+    Lookup operator that looks up a word to an id
+    Args:
+        vocab(Vocab): a Vocab object
+        unknown(None,int): default id to lookup a word that is out of vocab
+    """
+
+    @check_lookup
+    def __init__(self, vocab, unknown=None):
+        if unknown is None:
+            super().__init__(vocab)
+        else:
+            super().__init__(vocab, unknown)
+
 
 DE_C_INTER_JIEBA_MODE = {
-    JiebaMode.MIX: cde.JiebaMode.DE_INTER_JIEBA_MIX,
-    JiebaMode.MP: cde.JiebaMode.DE_INTER_JIEBA_MP,
-    JiebaMode.HMM: cde.JiebaMode.DE_INTER_JIEBA_HMM
+    JiebaMode.MIX: cde.JiebaMode.DE_JIEBA_MIX,
+    JiebaMode.MP: cde.JiebaMode.DE_JIEBA_MP,
+    JiebaMode.HMM: cde.JiebaMode.DE_JIEBA_HMM
 }
@@ -41,6 +61,7 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
         "HMM" mode will tokenize with Hidden Markov Model Segment algorithm,
         "MIX" mode will tokenize with a mix of MPSegment and HMMSegment algorithm.
     """
+
     @check_jieba_init
     def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX):
         self.mode = mode

@@ -12,11 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-c transforms for all text related operators
+Some basic function for nlp
 """
+from enum import IntEnum
 import mindspore._c_dataengine as cde
-from .validators import check_lookup, check_from_list, check_from_dict
+import numpy as np
+from .validators import check_from_file, check_from_list, check_from_dict
 
 
 class Vocab(cde.Vocab):
@@ -61,17 +64,43 @@ class Vocab(cde.Vocab):
         return super().from_dict(word_dict)
 
-class Lookup(cde.LookupOp):
-    """
-    Lookup operator that looks up a word to an id
-    Args:
-        vocab(Vocab): a Vocab object
-        unknown(None,int): default id to lookup a word that is out of vocab
-    """
-    @check_lookup
-    def __init__(self, vocab, unknown=None):
-        if unknown is None:
-            super().__init__(vocab)
-        else:
-            super().__init__(vocab, unknown)
+
+def to_str(array, encoding='utf8'):
+    """
+    Convert numpy array of `bytes` to array of `str` by decoding each element based on charset `encoding`.
+
+    Args:
+        array (numpy array): Array of type `bytes` representing strings.
+        encoding (string): Indicating the charset for decoding.
+
+    Returns:
+        Numpy array of `str`.
+    """
+    if not isinstance(array, np.ndarray):
+        raise ValueError('input should be a numpy array')
+    return np.char.decode(array, encoding)
+
+
+def to_bytes(array, encoding='utf8'):
+    """
+    Convert numpy array of `str` to array of `bytes` by encoding each element based on charset `encoding`.
+
+    Args:
+        array (numpy array): Array of type `str` representing strings.
+        encoding (string): Indicating the charset for encoding.
+
+    Returns:
+        Numpy array of `bytes`.
+    """
+    if not isinstance(array, np.ndarray):
+        raise ValueError('input should be a numpy array')
+    return np.char.encode(array, encoding)
+
+
+class JiebaMode(IntEnum):
+    MIX = 0
+    MP = 1
+    HMM = 2
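The new `to_str`/`to_bytes` helpers are thin wrappers over numpy's vectorized codecs (`np.char.decode`/`np.char.encode`), so they round-trip cleanly. A quick check:

    import numpy as np
    from mindspore.dataset.text import to_str, to_bytes

    raw = np.array([b'hello', b'world'])
    s = to_str(raw)                    # array(['hello', 'world'], dtype='<U5')
    assert (to_bytes(s) == raw).all()  # encoding the decode recovers the bytes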

@@ -17,8 +17,11 @@ validators for text ops
 """
 from functools import wraps
 import mindspore._c_dataengine as cde
+
+from ..transforms.validators import check_uint32
+
 
 def check_lookup(method):
     """A wrapper that wraps a parameter checker to the original function."""
@@ -106,3 +109,67 @@ def check_from_dict(method):
         return method(self, **kwargs)
 
     return new_method
+
+
+def check_jieba_init(method):
+    """Wrapper method to check the parameters of jieba init."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        hmm_path, mp_path, model = (list(args) + 3 * [None])[:3]
+        if "hmm_path" in kwargs:
+            hmm_path = kwargs.get("hmm_path")
+        if "mp_path" in kwargs:
+            mp_path = kwargs.get("mp_path")
+        if hmm_path is None:
+            raise ValueError(
+                "the dict of HMMSegment in cppjieba is not provided")
+        kwargs["hmm_path"] = hmm_path
+        if mp_path is None:
+            raise ValueError(
+                "the dict of MPSegment in cppjieba is not provided")
+        kwargs["mp_path"] = mp_path
+        if model is not None:
+            kwargs["model"] = model
+        return method(self, **kwargs)
+
+    return new_method
+
+
+def check_jieba_add_word(method):
+    """Wrapper method to check the parameters of jieba add word."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        word, freq = (list(args) + 2 * [None])[:2]
+        if "word" in kwargs:
+            word = kwargs.get("word")
+        if "freq" in kwargs:
+            freq = kwargs.get("freq")
+        if word is None:
+            raise ValueError("word is not provided")
+        kwargs["word"] = word
+        if freq is not None:
+            check_uint32(freq)
+            kwargs["freq"] = freq
+        return method(self, **kwargs)
+
+    return new_method
+
+
+def check_jieba_add_dict(method):
+    """Wrapper method to check the parameters of jieba add dict."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        user_dict = (list(args) + [None])[0]
+        if "user_dict" in kwargs:
+            user_dict = kwargs.get("user_dict")
+        if user_dict is None:
+            raise ValueError("user_dict is not provided")
+        kwargs["user_dict"] = user_dict
+        return method(self, **kwargs)
+
+    return new_method
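All three validators lean on the same padding idiom to normalize positional arguments into keywords before re-dispatching. In isolation it behaves like this (the path string is hypothetical, purely for illustration):

    # only hmm_path passed positionally; the idiom pads the rest with None
    args = ("hmm_dict.utf8",)
    hmm_path, mp_path, model = (list(args) + 3 * [None])[:3]
    assert (hmm_path, mp_path, model) == ("hmm_dict.utf8", None, None)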

@@ -1,21 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This module is to support nlp augmentations. It includes two parts:
-c_transforms and py_transforms. C_transforms is a high performance
-image augmentation module which is developed with c++ opencv. Py_transforms
-provide more kinds of image augmentations which is developed with python PIL.
-"""
-from .utils import as_text, JiebaMode
-from . import c_transforms

@@ -1,43 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Some basic function for nlp
-"""
-from enum import IntEnum
-import numpy as np
-
-
-def as_text(array, encoding='utf8'):
-    """
-    Convert data of array to unicode.
-
-    Args:
-        array (numpy array): Data of array should be ASCII values of each character after converted.
-        encoding (string): Indicating the charset for decoding.
-
-    Returns:
-        A 'str' object.
-    """
-    if not isinstance(array, np.ndarray):
-        raise ValueError('input should be a numpy array')
-
-    decode = np.vectorize(lambda x: x.decode(encoding))
-    return decode(array)
-
-
-class JiebaMode(IntEnum):
-    MIX = 0
-    MP = 1
-    HMM = 2

@@ -1,79 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Validators for TensorOps.
-"""
-from functools import wraps
-
-from ...transforms.validators import check_uint32
-
-
-def check_jieba_init(method):
-    """Wrapper method to check the parameters of jieba add word."""
-
-    @wraps(method)
-    def new_method(self, *args, **kwargs):
-        hmm_path, mp_path, model = (list(args) + 3 * [None])[:3]
-        if "hmm_path" in kwargs:
-            hmm_path = kwargs.get("hmm_path")
-        if "mp_path" in kwargs:
-            mp_path = kwargs.get("mp_path")
-        if hmm_path is None:
-            raise ValueError(
-                "the dict of HMMSegment in cppjieba is not provided")
-        kwargs["hmm_path"] = hmm_path
-        if mp_path is None:
-            raise ValueError(
-                "the dict of MPSegment in cppjieba is not provided")
-        kwargs["mp_path"] = mp_path
-        if model is not None:
-            kwargs["model"] = model
-        return method(self, **kwargs)
-
-    return new_method
-
-
-def check_jieba_add_word(method):
-    """Wrapper method to check the parameters of jieba add word."""
-
-    @wraps(method)
-    def new_method(self, *args, **kwargs):
-        word, freq = (list(args) + 2 * [None])[:2]
-        if "word" in kwargs:
-            word = kwargs.get("word")
-        if "freq" in kwargs:
-            freq = kwargs.get("freq")
-        if word is None:
-            raise ValueError("word is not provided")
-        kwargs["word"] = word
-        if freq is not None:
-            check_uint32(freq)
-            kwargs["freq"] = freq
-        return method(self, **kwargs)
-
-    return new_method
-
-
-def check_jieba_add_dict(method):
-    """Wrapper method to check the parameters of add dict"""
-
-    @wraps(method)
-    def new_method(self, *args, **kwargs):
-        user_dict = (list(args) + [None])[0]
-        if "user_dict" in kwargs:
-            user_dict = kwargs.get("user_dict")
-        if user_dict is None:
-            raise ValueError("user_dict is not provided")
-        kwargs["user_dict"] = user_dict
-        return method(self, **kwargs)
-
-    return new_method

@@ -18,7 +18,7 @@
 #include <string_view>
 #include "common/common.h"
-#include "dataset/kernels/text/jieba_tokenizer_op.h"
+#include "dataset/text/kernels/jieba_tokenizer_op.h"
 #include "gtest/gtest.h"
 #include "utils/log_adapter.h"

@@ -18,7 +18,7 @@
 #include <string_view>
 #include "common/common.h"
-#include "dataset/kernels/text/unicode_char_tokenizer_op.h"
+#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
 #include "gtest/gtest.h"
 #include "utils/log_adapter.h"

Some files were not shown because too many files have changed in this diff.
