@@ -33,7 +33,7 @@ import copy
import numpy as np

from mindspore._c_dataengine import DataType, TFReaderOp, ImageFolderOp, CifarOp, MnistOp, ManifestOp, \
-    MindRecordOp, TextFileOp, VOCOp, CocoOp, CBatchInfo
+    MindRecordOp, TextFileOp, ClueOp, VOCOp, CocoOp, CBatchInfo
from mindspore._c_expression import typing
from mindspore import log as logger

@@ -44,7 +44,7 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che
    check_take, check_project, check_imagefolderdatasetv2, check_mnist_cifar_dataset, check_manifestdataset, \
    check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \
    check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \
-    check_split
+    check_split, check_cluedataset
from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist

try:

@@ -4317,6 +4317,222 @@ class CelebADataset(MappableDataset):
        return self.sampler.is_sharded()


class CLUEDataset(SourceDataset):
    """
    A source dataset that reads and parses CLUE datasets.

    CLUE, the Chinese Language Understanding Evaluation benchmark, is a collection of datasets,
    baselines, pre-trained models, corpora and a leaderboard. This class supports CLUE's
    classification tasks: AFQMC, TNEWS, IFLYTEK, CMNLI, WSC and CSL.

    Args:
        dataset_files (str or list[str]): String or list of files to be read, or glob strings to
            search for a pattern of files. The list will be sorted in lexicographical order.
        task (str, optional): The kind of task, one of 'AFQMC', 'TNEWS', 'IFLYTEK', 'CMNLI', 'WSC'
            and 'CSL' (default='AFQMC').
        usage (str, optional): Which split of the task to read, one of 'train', 'test' or 'eval'
            (default='train').
        num_samples (int, optional): Number of samples (rows) to read (default=None, reads the
            full dataset).
        num_parallel_workers (int, optional): Number of workers to read the data
            (default=None, the number set in the config).
        shuffle (Union[bool, Shuffle], optional): Perform reshuffling of the data every epoch
            (default=Shuffle.GLOBAL). If shuffle is False, no shuffling will be performed.
            If shuffle is True, the behavior is the same as setting shuffle to Shuffle.GLOBAL.
            Otherwise, there are two levels of shuffling:

            - Shuffle.GLOBAL: Shuffle both the files and the samples.

            - Shuffle.FILES: Shuffle files only.

        num_shards (int, optional): Number of shards that the dataset should be divided into
            (default=None).
        shard_id (int, optional): The shard ID within num_shards (default=None). This argument
            should be specified only when num_shards is also specified.

    Examples:
        >>> import mindspore.dataset as ds
        >>> dataset_files = ["/path/to/1", "/path/to/2"]  # contains one or more text files
        >>> dataset = ds.CLUEDataset(dataset_files=dataset_files, task='AFQMC', usage='train')
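        >>> # The produced columns depend on task and usage; for task='AFQMC' with
        >>> # usage='train', each row carries 'sentence1', 'sentence2' and 'label'.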
    """

    @check_cluedataset
    def __init__(self, dataset_files, task='AFQMC', usage='train', num_samples=None,
                 num_parallel_workers=None, shuffle=Shuffle.GLOBAL, num_shards=None, shard_id=None):
        super().__init__(num_parallel_workers)
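        # _find_files (inherited from SourceDataset) expands glob patterns into concrete
        # file paths, per the dataset_files description above; sorting keeps the file
        # order deterministic.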
        self.dataset_files = self._find_files(dataset_files)
        self.dataset_files.sort()
        self.num_samples = num_samples
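
        # For each supported (task, usage) pair, map every output column name to the
        # key that holds it in the raw CLUE json lines; a path such as
        # 'target/span1_index' points into a nested json object.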
        self.task_dict = {
            'AFQMC': {
                'train': {
                    'sentence1': 'sentence1',
                    'sentence2': 'sentence2',
                    'label': 'label'
                },
                'test': {
                    'id': 'id',
                    'sentence1': 'sentence1',
                    'sentence2': 'sentence2'
                },
                'eval': {
                    'sentence1': 'sentence1',
                    'sentence2': 'sentence2',
                    'label': 'label'
                }
            },
            'CMNLI': {
                'train': {
                    'sentence1': 'sentence1',
                    'sentence2': 'sentence2',
                    'label': 'label'
                },
                'test': {
                    'id': 'id',
                    'sentence1': 'sentence1',
                    'sentence2': 'sentence2'
                },
                'eval': {
                    'sentence1': 'sentence1',
                    'sentence2': 'sentence2',
                    'label': 'label'
                }
            },
            'CSL': {
                'train': {
                    'id': 'id',
                    'abst': 'abst',
                    'keyword': 'keyword',
                    'label': 'label'
                },
                'test': {
                    'id': 'id',
                    'abst': 'abst',
                    'keyword': 'keyword'
                },
                'eval': {
                    'id': 'id',
                    'abst': 'abst',
                    'keyword': 'keyword',
                    'label': 'label'
                }
            },
            'IFLYTEK': {
                'train': {
                    'label': 'label',
                    'label_des': 'label_des',
                    'sentence': 'sentence'
                },
                'test': {
                    'id': 'id',
                    'sentence': 'sentence'
                },
                'eval': {
                    'label': 'label',
                    'label_des': 'label_des',
                    'sentence': 'sentence'
                }
            },
            'TNEWS': {
                'train': {
                    'label': 'label',
                    'label_desc': 'label_desc',
                    'sentence': 'sentence',
                    'keywords': 'keywords'
                },
                'test': {
                    'id': 'id',
                    'sentence': 'sentence',
                    'keywords': 'keywords'
                },
                'eval': {
                    'label': 'label',
                    'label_desc': 'label_desc',
                    'sentence': 'sentence',
                    'keywords': 'keywords'
                }
            },
            'WSC': {
                'train': {
                    'span1_index': 'target/span1_index',
                    'span2_index': 'target/span2_index',
                    'span1_text': 'target/span1_text',
                    'span2_text': 'target/span2_text',
                    'idx': 'idx',
                    'label': 'label',
                    'text': 'text'
                },
                'test': {
                    'span1_index': 'target/span1_index',
                    'span2_index': 'target/span2_index',
                    'span1_text': 'target/span1_text',
                    'span2_text': 'target/span2_text',
                    'idx': 'idx',
                    'text': 'text'
                },
                'eval': {
                    'span1_index': 'target/span1_index',
                    'span2_index': 'target/span2_index',
                    'span1_text': 'target/span1_text',
                    'span2_text': 'target/span2_text',
                    'idx': 'idx',
                    'label': 'label',
                    'text': 'text'
                }
            }
        }
        self.cols_to_keyword = self.task_dict[task][usage]
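
        # Normalize `shuffle` into a shuffle level plus a file-shuffling flag:
        # True maps to Shuffle.GLOBAL, False disables shuffling entirely, and an
        # explicit Shuffle enum value is kept as-is (with file shuffling enabled).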
        if not isinstance(shuffle, (bool, Shuffle)):
            raise TypeError("shuffle should be of boolean or enum 'Shuffle'.")
        if not isinstance(shuffle, Shuffle):
            if shuffle:
                self.shuffle_level = Shuffle.GLOBAL
                self.shuffle_files = True
            else:
                self.shuffle_level = None
                self.shuffle_files = False
        else:
            self.shuffle_level = shuffle
            self.shuffle_files = True

        self.num_shards = num_shards
        self.shard_id = shard_id

    def get_args(self):
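        # Gather this node's construction parameters into a dict, extending the
        # args collected from the SourceDataset base class.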
        args = super().get_args()
        args["dataset_files"] = self.dataset_files
        args["num_samples"] = self.num_samples
        if self.shuffle_files is not None:
            args["shuffle_files"] = self.shuffle_files
        args["shuffle"] = self.shuffle_level
        args["num_shards"] = self.num_shards
        args["shard_id"] = self.shard_id
        args["cols_to_keyword"] = self.cols_to_keyword
        return args

    def get_dataset_size(self):
        """
        Get the number of rows in the dataset.

        Return:
            Number, number of rows.
        """
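        # The row count is computed on demand: ClueOp counts the rows in the
        # dataset files, the get_num_rows helper adjusts that total for the shard
        # count, and num_samples (if given) caps the result.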
        if self._dataset_size is None:
            num_rows = ClueOp.get_num_rows(self.dataset_files)
            num_rows = get_num_rows(num_rows, self.num_shards)
            if self.num_samples is None:
                return num_rows
            return min(self.num_samples, num_rows)
        return self._dataset_size

    def is_shuffled(self):
        return self.shuffle_files

    def is_sharded(self):
        if self.num_shards is not None:
            return self.num_shards > 1

        return False


class TextFileDataset(SourceDataset):
    """
    A source dataset that reads and parses datasets stored on disk in text format.