Clean up work for text python package

pull/1365/head
hesham 5 years ago
parent df361d1d26
commit 6c21e556c4

@@ -52,7 +52,7 @@ add_subdirectory(core)
 add_subdirectory(kernels)
 add_subdirectory(engine)
 add_subdirectory(api)
-add_subdirectory(nlp)
+add_subdirectory(text)
 ######################################################################
 ################### Create _c_dataengine Library ######################
@@ -62,15 +62,14 @@ set(submodules
     $<TARGET_OBJECTS:kernels>
     $<TARGET_OBJECTS:kernels-image>
     $<TARGET_OBJECTS:kernels-data>
-    $<TARGET_OBJECTS:kernels-text>
     $<TARGET_OBJECTS:APItoPython>
     $<TARGET_OBJECTS:engine-datasetops-source>
     $<TARGET_OBJECTS:engine-datasetops-source-sampler>
     $<TARGET_OBJECTS:engine-datasetops>
     $<TARGET_OBJECTS:engine-opt>
     $<TARGET_OBJECTS:engine>
-    $<TARGET_OBJECTS:nlp>
-    $<TARGET_OBJECTS:nlp-kernels>
+    $<TARGET_OBJECTS:text>
+    $<TARGET_OBJECTS:text-kernels>
 )
 if (ENABLE_TDTQUE)

@@ -38,10 +38,6 @@
 #include "dataset/kernels/image/resize_op.h"
 #include "dataset/kernels/image/uniform_aug_op.h"
 #include "dataset/kernels/data/type_cast_op.h"
-#include "dataset/kernels/text/jieba_tokenizer_op.h"
-#include "dataset/kernels/text/unicode_char_tokenizer_op.h"
-#include "dataset/nlp/vocab.h"
-#include "dataset/nlp/kernels/lookup_op.h"
 #include "dataset/engine/datasetops/source/cifar_op.h"
 #include "dataset/engine/datasetops/source/image_folder_op.h"
 #include "dataset/engine/datasetops/source/io_block.h"
@@ -62,6 +58,10 @@
 #include "dataset/engine/datasetops/source/text_file_op.h"
 #include "dataset/engine/datasetops/source/voc_op.h"
 #include "dataset/kernels/data/to_float16_op.h"
+#include "dataset/text/kernels/jieba_tokenizer_op.h"
+#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
+#include "dataset/text/vocab.h"
+#include "dataset/text/kernels/lookup_op.h"
 #include "dataset/util/random.h"
 #include "mindrecord/include/shard_operator.h"
 #include "mindrecord/include/shard_pk_sample.h"
@@ -549,9 +549,9 @@ PYBIND11_MODULE(_c_dataengine, m) {
       .value("TEXTFILE", OpName::kTextFile);
   (void)py::enum_<JiebaMode>(m, "JiebaMode", py::arithmetic())
-      .value("DE_INTER_JIEBA_MIX", JiebaMode::kMix)
-      .value("DE_INTER_JIEBA_MP", JiebaMode::kMp)
-      .value("DE_INTER_JIEBA_HMM", JiebaMode::kHmm)
+      .value("DE_JIEBA_MIX", JiebaMode::kMix)
+      .value("DE_JIEBA_MP", JiebaMode::kMp)
+      .value("DE_JIEBA_HMM", JiebaMode::kHmm)
       .export_values();
   (void)py::enum_<InterpolationMode>(m, "InterpolationMode", py::arithmetic())

@@ -2,7 +2,6 @@ add_subdirectory(image)
 add_subdirectory(data)
 file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
-add_subdirectory(text)
 add_library(kernels OBJECT
     py_func_op.cc
     tensor_op.cc)

@@ -1,7 +0,0 @@
-add_subdirectory(kernels)
-
-add_library(nlp OBJECT
-    vocab.cc
-)
-
-add_dependencies(nlp nlp-kernels)

@@ -1,3 +0,0 @@
-add_library(nlp-kernels OBJECT
-    lookup_op.cc
-)

@@ -0,0 +1,7 @@
+add_subdirectory(kernels)
+
+add_library(text OBJECT
+    vocab.cc
+)
+
+add_dependencies(text text-kernels)

@@ -1,6 +1,7 @@
 file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
-add_library(kernels-text OBJECT
+add_library(text-kernels OBJECT
+    lookup_op.cc
     jieba_tokenizer_op.cc
     unicode_char_tokenizer_op.cc
 )

@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "dataset/kernels/text/jieba_tokenizer_op.h"
+#include "dataset/text/kernels/jieba_tokenizer_op.h"
 #include <vector>
 #include <memory>

@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "dataset/nlp/kernels/lookup_op.h"
+#include "dataset/text/kernels/lookup_op.h"
 #include <string>

@@ -24,7 +24,7 @@
 #include "dataset/core/tensor.h"
 #include "dataset/kernels/tensor_op.h"
 #include "dataset/util/status.h"
-#include "dataset/nlp/vocab.h"
+#include "dataset/text/vocab.h"
 namespace mindspore {
 namespace dataset {

@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "dataset/kernels/text/unicode_char_tokenizer_op.h"
+#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
 #include <memory>
 #include <string>
 #include <string_view>

@@ -17,7 +17,7 @@
 #include <map>
 #include <utility>
-#include "dataset/nlp/vocab.h"
+#include "dataset/text/vocab.h"
 namespace mindspore {
 namespace dataset {

@@ -284,10 +284,10 @@ class Dataset:
         Examples:
             >>> import mindspore.dataset as ds
-            >>> import mindspore.dataset.transforms.text.utils as text
+            >>> import mindspore.dataset.text as text
             >>> # declare a function which returns a Dataset object
             >>> def flat_map_func(x):
-            >>>     data_dir = text.as_text(x[0])
+            >>>     data_dir = text.to_str(x[0])
             >>>     d = ds.ImageFolderDatasetV2(data_dir)
             >>>     return d
             >>> # data is a Dataset object

@@ -15,5 +15,5 @@
 """
 mindspore.dataset.text
 """
-from .c_transforms import *
+from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer
+from .utils import to_str, to_bytes, JiebaMode, Vocab
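For orientation, the lines above are the whole public surface of the reworked package. A minimal smoke test of that surface, assuming a build with this commit applied; nothing here is beyond the names the new `__init__.py` exports:

    import mindspore.dataset.text as text

    mode = text.JiebaMode.MIX                              # IntEnum, now in dataset.text.utils
    ops = (text.Lookup, text.JiebaTokenizer, text.UnicodeCharTokenizer)
    helpers = (text.to_str, text.to_bytes, text.Vocab)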

@@ -11,20 +11,40 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ==============================================================================
 """
-This module c_transforms provides common nlp operations.
+c transforms for all text related operators
 """
 import os
 import re
 import mindspore._c_dataengine as cde
 from .utils import JiebaMode
-from .validators import check_jieba_add_dict, check_jieba_add_word, check_jieba_init
+from .validators import check_lookup, check_jieba_add_dict, \
+    check_jieba_add_word, check_jieba_init
+
+
+class Lookup(cde.LookupOp):
+    """
+    Lookup operator that looks up a word to an id
+    Args:
+        vocab(Vocab): a Vocab object
+        unknown(None,int): default id to lookup a word that is out of vocab
+    """
+
+    @check_lookup
+    def __init__(self, vocab, unknown=None):
+        if unknown is None:
+            super().__init__(vocab)
+        else:
+            super().__init__(vocab, unknown)
+
 
 DE_C_INTER_JIEBA_MODE = {
-    JiebaMode.MIX: cde.JiebaMode.DE_INTER_JIEBA_MIX,
-    JiebaMode.MP: cde.JiebaMode.DE_INTER_JIEBA_MP,
-    JiebaMode.HMM: cde.JiebaMode.DE_INTER_JIEBA_HMM
+    JiebaMode.MIX: cde.JiebaMode.DE_JIEBA_MIX,
+    JiebaMode.MP: cde.JiebaMode.DE_JIEBA_MP,
+    JiebaMode.HMM: cde.JiebaMode.DE_JIEBA_HMM
 }
@@ -41,6 +61,7 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
         "HMM" mode will tokenize with Hidden Markov Model Segment algorithm,
         "MIX" mode will tokenize with a mix of MPSegment and HMMSegment algorithm.
     """
+
     @check_jieba_init
     def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX):
         self.mode = mode

@@ -12,11 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-c transforms for all text related operators
+Some basic function for nlp
 """
+from enum import IntEnum
 import mindspore._c_dataengine as cde
-from .validators import check_lookup, check_from_list, check_from_dict
+import numpy as np
+from .validators import check_from_file, check_from_list, check_from_dict
 
 
 class Vocab(cde.Vocab):
@@ -61,17 +64,43 @@ class Vocab(cde.Vocab):
         return super().from_dict(word_dict)
 
-class Lookup(cde.LookupOp):
-    """
-    Lookup operator that looks up a word to an id
-    Args:
-        vocab(Vocab): a Vocab object
-        unknown(None,int): default id to lookup a word that is out of vocab
-    """
-    @check_lookup
-    def __init__(self, vocab, unknown=None):
-        if unknown is None:
-            super().__init__(vocab)
-        else:
-            super().__init__(vocab, unknown)
+
+def to_str(array, encoding='utf8'):
+    """
+    Convert numpy array of `bytes` to array of `str` by decoding each element based on charset `encoding`.
+
+    Args:
+        array (numpy array): Array of type `bytes` representing strings.
+        encoding (string): Indicating the charset for decoding.
+
+    Returns:
+        Numpy array of `str`.
+    """
+    if not isinstance(array, np.ndarray):
+        raise ValueError('input should be a numpy array')
+    return np.char.decode(array, encoding)
+
+
+def to_bytes(array, encoding='utf8'):
+    """
+    Convert numpy array of `str` to array of `bytes` by encoding each element based on charset `encoding`.
+
+    Args:
+        array (numpy array): Array of type `str` representing strings.
+        encoding (string): Indicating the charset for encoding.
+
+    Returns:
+        Numpy array of `bytes`.
+    """
+    if not isinstance(array, np.ndarray):
+        raise ValueError('input should be a numpy array')
+    return np.char.encode(array, encoding)
+
+
+class JiebaMode(IntEnum):
+    MIX = 0
+    MP = 1
+    HMM = 2
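The new `to_str`/`to_bytes` helpers are thin wrappers over numpy's vectorized codecs (`np.char.decode`/`np.char.encode`), so they round-trip cleanly. A quick check:

    import numpy as np
    from mindspore.dataset.text import to_str, to_bytes

    raw = np.array([b'hello', b'world'])
    s = to_str(raw)                    # array(['hello', 'world'], dtype='<U5')
    assert (to_bytes(s) == raw).all()  # encoding the decode recovers the bytes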

@@ -17,8 +17,11 @@ validators for text ops
 """
 from functools import wraps
 import mindspore._c_dataengine as cde
+
+from ..transforms.validators import check_uint32
+
 
 def check_lookup(method):
     """A wrapper that wraps a parameter checker to the original function."""
@@ -106,3 +109,67 @@ def check_from_dict(method):
         return method(self, **kwargs)
 
     return new_method
+
+
+def check_jieba_init(method):
+    """Wrapper method to check the parameters of jieba init."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        hmm_path, mp_path, model = (list(args) + 3 * [None])[:3]
+        if "hmm_path" in kwargs:
+            hmm_path = kwargs.get("hmm_path")
+        if "mp_path" in kwargs:
+            mp_path = kwargs.get("mp_path")
+        if hmm_path is None:
+            raise ValueError(
+                "the dict of HMMSegment in cppjieba is not provided")
+        kwargs["hmm_path"] = hmm_path
+        if mp_path is None:
+            raise ValueError(
+                "the dict of MPSegment in cppjieba is not provided")
+        kwargs["mp_path"] = mp_path
+        if model is not None:
+            kwargs["model"] = model
+        return method(self, **kwargs)
+
+    return new_method
+
+
+def check_jieba_add_word(method):
+    """Wrapper method to check the parameters of jieba add word."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        word, freq = (list(args) + 2 * [None])[:2]
+        if "word" in kwargs:
+            word = kwargs.get("word")
+        if "freq" in kwargs:
+            freq = kwargs.get("freq")
+        if word is None:
+            raise ValueError("word is not provided")
+        kwargs["word"] = word
+        if freq is not None:
+            check_uint32(freq)
+            kwargs["freq"] = freq
+        return method(self, **kwargs)
+
+    return new_method
+
+
+def check_jieba_add_dict(method):
+    """Wrapper method to check the parameters of jieba add dict."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        user_dict = (list(args) + [None])[0]
+        if "user_dict" in kwargs:
+            user_dict = kwargs.get("user_dict")
+        if user_dict is None:
+            raise ValueError("user_dict is not provided")
+        kwargs["user_dict"] = user_dict
+        return method(self, **kwargs)
+
+    return new_method
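All three validators lean on the same padding idiom to normalize positional arguments into keywords before re-dispatching. In isolation it behaves like this (the path string is hypothetical, purely for illustration):

    # only hmm_path passed positionally; the idiom pads the rest with None
    args = ("hmm_dict.utf8",)
    hmm_path, mp_path, model = (list(args) + 3 * [None])[:3]
    assert (hmm_path, mp_path, model) == ("hmm_dict.utf8", None, None)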

@@ -1,21 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This module is to support nlp augmentations. It includes two parts:
-c_transforms and py_transforms. C_transforms is a high performance
-image augmentation module which is developed with c++ opencv. Py_transforms
-provide more kinds of image augmentations which is developed with python PIL.
-"""
-from .utils import as_text, JiebaMode
-from . import c_transforms

@@ -1,43 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Some basic function for nlp
-"""
-from enum import IntEnum
-import numpy as np
-
-
-def as_text(array, encoding='utf8'):
-    """
-    Convert data of array to unicode.
-
-    Args:
-        array (numpy array): Data of array should be ASCII values of each character after converted.
-        encoding (string): Indicating the charset for decoding.
-
-    Returns:
-        A 'str' object.
-    """
-    if not isinstance(array, np.ndarray):
-        raise ValueError('input should be a numpy array')
-
-    decode = np.vectorize(lambda x: x.decode(encoding))
-    return decode(array)
-
-
-class JiebaMode(IntEnum):
-    MIX = 0
-    MP = 1
-    HMM = 2

@@ -1,79 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Validators for TensorOps.
-"""
-from functools import wraps
-
-from ...transforms.validators import check_uint32
-
-
-def check_jieba_init(method):
-    """Wrapper method to check the parameters of jieba add word."""
-
-    @wraps(method)
-    def new_method(self, *args, **kwargs):
-        hmm_path, mp_path, model = (list(args) + 3 * [None])[:3]
-        if "hmm_path" in kwargs:
-            hmm_path = kwargs.get("hmm_path")
-        if "mp_path" in kwargs:
-            mp_path = kwargs.get("mp_path")
-        if hmm_path is None:
-            raise ValueError(
-                "the dict of HMMSegment in cppjieba is not provided")
-        kwargs["hmm_path"] = hmm_path
-        if mp_path is None:
-            raise ValueError(
-                "the dict of MPSegment in cppjieba is not provided")
-        kwargs["mp_path"] = mp_path
-        if model is not None:
-            kwargs["model"] = model
-        return method(self, **kwargs)
-
-    return new_method
-
-
-def check_jieba_add_word(method):
-    """Wrapper method to check the parameters of jieba add word."""
-
-    @wraps(method)
-    def new_method(self, *args, **kwargs):
-        word, freq = (list(args) + 2 * [None])[:2]
-        if "word" in kwargs:
-            word = kwargs.get("word")
-        if "freq" in kwargs:
-            freq = kwargs.get("freq")
-        if word is None:
-            raise ValueError("word is not provided")
-        kwargs["word"] = word
-        if freq is not None:
-            check_uint32(freq)
-            kwargs["freq"] = freq
-        return method(self, **kwargs)
-
-    return new_method
-
-
-def check_jieba_add_dict(method):
-    """Wrapper method to check the parameters of add dict"""
-
-    @wraps(method)
-    def new_method(self, *args, **kwargs):
-        user_dict = (list(args) + [None])[0]
-        if "user_dict" in kwargs:
-            user_dict = kwargs.get("user_dict")
-        if user_dict is None:
-            raise ValueError("user_dict is not provided")
-        kwargs["user_dict"] = user_dict
-        return method(self, **kwargs)
-
-    return new_method

@@ -18,7 +18,7 @@
 #include <string_view>
 #include "common/common.h"
-#include "dataset/kernels/text/jieba_tokenizer_op.h"
+#include "dataset/text/kernels/jieba_tokenizer_op.h"
 #include "gtest/gtest.h"
 #include "utils/log_adapter.h"

@@ -18,7 +18,7 @@
 #include <string_view>
 #include "common/common.h"
-#include "dataset/kernels/text/unicode_char_tokenizer_op.h"
+#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
 #include "gtest/gtest.h"
 #include "utils/log_adapter.h"

Some files were not shown because too many files have changed in this diff.
