diff --git a/mindspore/ccsrc/dataset/api/python_bindings.cc b/mindspore/ccsrc/dataset/api/python_bindings.cc
index 8a8e88be57..5391ad7cb3 100644
--- a/mindspore/ccsrc/dataset/api/python_bindings.cc
+++ b/mindspore/ccsrc/dataset/api/python_bindings.cc
@@ -40,6 +40,7 @@
 #include "dataset/kernels/image/resize_op.h"
 #include "dataset/kernels/image/uniform_aug_op.h"
 #include "dataset/kernels/image/bounding_box_augment_op.h"
+#include "dataset/kernels/data/duplicate_op.h"
 #include "dataset/kernels/data/fill_op.h"
 #include "dataset/kernels/data/mask_op.h"
 #include "dataset/kernels/data/pad_end_op.h"
@@ -443,6 +444,9 @@ void bindTensorOps2(py::module *m) {
                                                               "Tensor mask operation using relational comparator")
     .def(py::init<RelationalOp, std::shared_ptr<Tensor>, DataType>());
 
+  (void)py::class_<DuplicateOp, TensorOp, std::shared_ptr<DuplicateOp>>(*m, "DuplicateOp", "Duplicate tensor.")
+    .def(py::init<>());
+
   (void)py::class_<TruncateSequencePairOp, TensorOp, std::shared_ptr<TruncateSequencePairOp>>(
     *m, "TruncateSequencePairOp", "Tensor operation to truncate two tensors to a max_length")
     .def(py::init<int64_t>());
diff --git a/mindspore/ccsrc/dataset/core/tensor.h b/mindspore/ccsrc/dataset/core/tensor.h
index 0aec84f77b..a3dbb391e5 100644
--- a/mindspore/ccsrc/dataset/core/tensor.h
+++ b/mindspore/ccsrc/dataset/core/tensor.h
@@ -115,6 +115,16 @@ class Tensor {
   static Status CreateTensor(std::shared_ptr<Tensor> *, TensorImpl tensor_impl, const TensorShape &shape,
                              DataType type, const unsigned char *data = nullptr);
 
+  /// Create a copy of the input tensor
+  /// \param out [out] output tensor to be generated
+  /// \param in [in] original tensor to be copied
+  /// \return Status
+  static Status CreateTensor(std::shared_ptr<Tensor> *out, const std::shared_ptr<Tensor> &in) {
+    const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
+    *out = std::allocate_shared<Tensor>(*alloc, in->shape(), in->type(), in->GetBuffer(), in->SizeInBytes());
+    return Status::OK();
+  }
+
   // A static factory method to create a Tensor from a given py::array.
   // @param ptr output argument to hold the created Tensor
   // @param arr py::array
diff --git a/mindspore/ccsrc/dataset/kernels/data/CMakeLists.txt b/mindspore/ccsrc/dataset/kernels/data/CMakeLists.txt
index 1df952f351..9131c9c667 100644
--- a/mindspore/ccsrc/dataset/kernels/data/CMakeLists.txt
+++ b/mindspore/ccsrc/dataset/kernels/data/CMakeLists.txt
@@ -10,4 +10,5 @@ add_library(kernels-data OBJECT
     slice_op.cc
     mask_op.cc
     concatenate_op.cc
+    duplicate_op.cc
 )
diff --git a/mindspore/ccsrc/dataset/kernels/data/duplicate_op.cc b/mindspore/ccsrc/dataset/kernels/data/duplicate_op.cc
new file mode 100644
index 0000000000..959516a4aa
--- /dev/null
+++ b/mindspore/ccsrc/dataset/kernels/data/duplicate_op.cc
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dataset/kernels/data/duplicate_op.h"
+
+#include "dataset/core/tensor.h"
+#include "dataset/kernels/tensor_op.h"
+
+namespace mindspore {
+namespace dataset {
+
+Status DuplicateOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
+  std::shared_ptr<Tensor> out;
+  RETURN_IF_NOT_OK(Tensor::CreateTensor(&out, input[0]));
+  output->push_back(input[0]);
+  output->push_back(out);
+  return Status::OK();
+}
+}  // namespace dataset
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/dataset/kernels/data/duplicate_op.h b/mindspore/ccsrc/dataset/kernels/data/duplicate_op.h
new file mode 100644
index 0000000000..4c9d6d36c9
--- /dev/null
+++ b/mindspore/ccsrc/dataset/kernels/data/duplicate_op.h
@@ -0,0 +1,42 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef DATASET_KERNELS_DATA_DUPLICATE_OP_H_
+#define DATASET_KERNELS_DATA_DUPLICATE_OP_H_
+
+#include <memory>
+#include <vector>
+
+#include "dataset/core/tensor.h"
+#include "dataset/kernels/tensor_op.h"
+
+namespace mindspore {
+namespace dataset {
+
+class DuplicateOp : public TensorOp {
+ public:
+  DuplicateOp() = default;
+
+  ~DuplicateOp() override = default;
+
+  void Print(std::ostream &out) const override { out << "DuplicateOp"; }
+
+  Status Compute(const TensorRow &input, TensorRow *output) override;
+
+  uint32_t NumOutput() override { return 2; }
+};
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // DATASET_KERNELS_DATA_DUPLICATE_OP_H_
diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py
index 4482f029cd..70e9b763f6 100644
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ -4869,10 +4869,10 @@ class BuildVocabDataset(DatasetOp):
             top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
                 taken. The top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None,
                 all words are included).
-            special_tokens(list): a list of strings, each one is a special token. for e.g. ["<pad>","<unk>"]
-                (default=None, no special tokens will be added).
-            special_first(bool): whether special_tokens will be prepended/appended to vocab, If special_tokens is
-                specified and special_first is set to None, special_tokens will be prepended. (default=None).
+            special_tokens(list, optional): a list of strings, each one is a special token, for example
+                special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
+            special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If
+                special_tokens is specified and special_first is set to None, special_tokens will be prepended
+                (default=None).
             prefetch_size (int, optional): prefetch number of records ahead of the user's request (default=None).
 
""" diff --git a/mindspore/dataset/text/utils.py b/mindspore/dataset/text/utils.py index c09b6d71d7..483e91bead 100644 --- a/mindspore/dataset/text/utils.py +++ b/mindspore/dataset/text/utils.py @@ -50,8 +50,8 @@ class Vocab(cde.Vocab): top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None all words are included). - special_tokens(list): a list of strings, each one is a special token. for e.g. ["",""] - (default=None, no special tokens will be added). + special_tokens(list, optional): a list of strings, each one is a special token. for example + special_tokens=["",""] (default=None, no special tokens will be added). special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens is specified and special_first is set to None, special_tokens will be prepended. (default=None). return: @@ -72,8 +72,8 @@ class Vocab(cde.Vocab): build a vocab object from a list of word. Args: word_list(list): a list of string where each element is a word of type string. - special_tokens(list): a list of strings, each one is a special token. for e.g. ["",""] - (default=None, no special tokens will be added). + special_tokens(list, optional): a list of strings, each one is a special token. for example + special_tokens=["",""] (default=None, no special tokens will be added). special_first(bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens is specified and special_first is set to None, special_tokens will be prepended. (default=None). """ @@ -89,8 +89,8 @@ class Vocab(cde.Vocab): delimiter(str, optional): a delimiter to break up each line in file, the first element is taken to be the word (default=None). vocab_size(int, optional): number of words to read from file_path (default=None, all words are taken). - special_tokens(list): a list of strings, each one is a special token. for e.g. ["",""] - (default=None, no special tokens will be added). + special_tokens(list, optional): a list of strings, each one is a special token. for example + special_tokens=["",""] (default=None, no special tokens will be added). special_first(bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens is specified and special_first is set to None, special_tokens will be prepended. (default=None). """ diff --git a/mindspore/dataset/transforms/c_transforms.py b/mindspore/dataset/transforms/c_transforms.py index 903315ef0b..e69f9371c9 100644 --- a/mindspore/dataset/transforms/c_transforms.py +++ b/mindspore/dataset/transforms/c_transforms.py @@ -203,3 +203,22 @@ class Concatenate(cde.ConcatenateOp): def __init__(self, axis=0, prepend=None, append=None): # add some validations here later super().__init__(axis, prepend, append) + + +class Duplicate(cde.DuplicateOp): + """ + Duplicate the input tensor to a new output tensor. The input tensor is carried over to the output list. 
diff --git a/mindspore/dataset/transforms/c_transforms.py b/mindspore/dataset/transforms/c_transforms.py
index 903315ef0b..e69f9371c9 100644
--- a/mindspore/dataset/transforms/c_transforms.py
+++ b/mindspore/dataset/transforms/c_transforms.py
@@ -203,3 +203,22 @@ class Concatenate(cde.ConcatenateOp):
     def __init__(self, axis=0, prepend=None, append=None):
         # add some validations here later
         super().__init__(axis, prepend, append)
+
+
+class Duplicate(cde.DuplicateOp):
+    """
+    Duplicate the input tensor to a new output tensor. The input tensor is carried over to the output list.
+    Examples:
+        >>> # Data before
+        >>> # |  x      |
+        >>> # +---------+
+        >>> # | [1,2,3] |
+        >>> # +---------+
+        >>> data = data.map(input_columns=["x"], operations=Duplicate(),
+        >>>                 output_columns=["x", "y"], columns_order=["x", "y"])
+        >>> # Data after
+        >>> # |  x      |  y      |
+        >>> # +---------+---------+
+        >>> # | [1,2,3] | [1,2,3] |
+        >>> # +---------+---------+
+    """
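
The pattern this op enables is "keep the raw column, transform only the copy". A hedged sketch of that pattern (it mirrors the NumpySlicesDataset usage in the Python test below, and assumes Fill is available in this build's c_transforms):

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as ops

    # Duplicate column "x" into "y", then overwrite only the copy.
    data = ds.NumpySlicesDataset([[1, 2, 3]], column_names="x")
    data = data.map(input_columns=["x"], output_columns=["x", "y"],
                    columns_order=["x", "y"], operations=ops.Duplicate())
    data = data.map(input_columns=["y"], operations=ops.Fill(0))  # Fill assumed available
    for d in data.create_dict_iterator():
        print(d["x"], d["y"])  # "x" stays raw, "y" is all zeros
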
diff --git a/tests/ut/cpp/dataset/duplicate_op_test.cc b/tests/ut/cpp/dataset/duplicate_op_test.cc
new file mode 100644
index 0000000000..6c9c00a30e
--- /dev/null
+++ b/tests/ut/cpp/dataset/duplicate_op_test.cc
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "dataset/core/client.h"
+#include "common/common.h"
+#include "gtest/gtest.h"
+#include "dataset/core/tensor.h"
+#include "dataset/util/de_error.h"
+#include "dataset/kernels/data/duplicate_op.h"
+
+using namespace mindspore::dataset;
+
+namespace py = pybind11;
+
+class MindDataTestDuplicateOp : public UT::Common {
+ public:
+  MindDataTestDuplicateOp() {}
+
+  void SetUp() { GlobalInit(); }
+};
+
+TEST_F(MindDataTestDuplicateOp, Basics) {
+  std::shared_ptr<Tensor> t;
+  Tensor::CreateTensor(&t, std::vector<uint32_t>({1, 2, 3, 4, 5, 6}));
+  std::shared_ptr<Tensor> v;
+  Tensor::CreateTensor(&v, std::vector<uint32_t>({3}), TensorShape::CreateScalar());
+  std::shared_ptr<TensorOp> op = std::make_shared<DuplicateOp>();
+  TensorRow in;
+  in.push_back(t);
+  TensorRow out;
+  ASSERT_TRUE(op->Compute(in, &out).IsOk());
+
+  ASSERT_TRUE(*t == *out[0]);
+  ASSERT_TRUE(*t == *out[1]);
+  ASSERT_TRUE(t->GetBuffer() == out[0]->GetBuffer());
+  ASSERT_TRUE(t->GetBuffer() != out[1]->GetBuffer());
+}
diff --git a/tests/ut/python/dataset/test_duplicate_op.py b/tests/ut/python/dataset/test_duplicate_op.py
new file mode 100644
index 0000000000..9de3453a7e
--- /dev/null
+++ b/tests/ut/python/dataset/test_duplicate_op.py
@@ -0,0 +1,40 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Testing Duplicate op in DE
+"""
+import numpy as np
+
+import mindspore.dataset as ds
+import mindspore.dataset.transforms.c_transforms as ops
+
+
+def compare(array):
+    data = ds.NumpySlicesDataset([array], column_names="x")
+    array = np.array(array)
+    data = data.map(input_columns=["x"], output_columns=["x", "y"], columns_order=["x", "y"],
+                    operations=ops.Duplicate())
+    for d in data.create_dict_iterator():
+        np.testing.assert_array_equal(array, d["x"])
+        np.testing.assert_array_equal(array, d["y"])
+
+
+def test_duplicate_basics():
+    compare([1, 2, 3])
+    compare([b"1", b"2", b"3"])
+
+
+if __name__ == "__main__":
+    test_duplicate_basics()
diff --git a/tests/ut/python/dataset/test_vocab.py b/tests/ut/python/dataset/test_vocab.py
index 425a79e069..35411e5c80 100644
--- a/tests/ut/python/dataset/test_vocab.py
+++ b/tests/ut/python/dataset/test_vocab.py
@@ -1,4 +1,4 @@
-# Copyright 2019 Huawei Technologies Co., Ltd
+# Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -94,9 +94,10 @@ def test_from_file():
         for word in texts.split(" "):
             yield (np.array(word, dtype='S'),)
 
-    def test_config(lookup_str, special_tokens, special_first):
+    def test_config(lookup_str, vocab_size, special_tokens, special_first):
         try:
-            vocab = text.Vocab.from_file(SIMPLE_VOCAB_FILE, special_tokens=special_tokens, special_first=special_first)
+            vocab = text.Vocab.from_file(SIMPLE_VOCAB_FILE, vocab_size=vocab_size, special_tokens=special_tokens,
+                                         special_first=special_first)
             data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
             data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
             res = []
@@ -106,9 +107,14 @@ def test_from_file():
         except ValueError as e:
             return str(e)
 
-    assert test_config("w1 w2 w3", ["s1", "s2", "s3"], True) == [3, 4, 5]
-    assert test_config("w1 w2 w3", ["s1", "s2", "s3"], False) == [0, 1, 2]
-    assert "special_tokens contains duplicate" in test_config("w1", ["s1", "s1"], True)
+    # test special tokens are prepended
+    assert test_config("w1 w2 w3 s1 s2 s3", None, ["s1", "s2", "s3"], True) == [3, 4, 5, 0, 1, 2]
+    # test special tokens are appended
+    assert test_config("w1 w2 w3 s1 s2 s3", None, ["s1", "s2", "s3"], False) == [0, 1, 2, 8, 9, 10]
+    # test special tokens are appended when not all words in the file are used
+    assert test_config("w1 w2 w3 s1 s2 s3", 3, ["s1", "s2", "s3"], False) == [0, 1, 2, 3, 4, 5]
+    # test exception when special_tokens contains duplicate words
+    assert "special_tokens contains duplicate" in test_config("w1", None, ["s1", "s1"], True)
 
 
 if __name__ == '__main__':
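
The new from_file assertions encode a simple id-assignment rule: words read from the file get consecutive ids (shifted past any prepended specials), and special tokens occupy either the first or the last ids depending on special_first. The helper below is hypothetical (not part of the patch) and merely reproduces the expected lists above; it implies SIMPLE_VOCAB_FILE holds 8 words:

    def expected_ids(num_words, num_specials, special_first):
        """Sketch of the id layout implied by the assertions above."""
        if special_first:
            words = list(range(num_specials, num_specials + num_words))
            specials = list(range(num_specials))
        else:
            words = list(range(num_words))
            specials = list(range(num_words, num_words + num_specials))
        return words, specials

    print(expected_ids(8, 3, True))   # specials 0..2, words start at 3 -> [3, 4, 5] + [0, 1, 2]
    print(expected_ids(8, 3, False))  # words 0..7, specials 8..10 -> [0, 1, 2] + [8, 9, 10]
    print(expected_ids(3, 3, False))  # vocab_size=3: words 0..2, specials 3..5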