add CLUE dataset

pull/1932/head
jiangzhiwen 5 years ago
parent 3536185f5b
commit e0e167a000

@@ -31,6 +31,7 @@
#include "dataset/engine/datasetops/source/celeba_op.h"
#include "dataset/engine/datasetops/source/random_data_op.h"
#include "dataset/engine/datasetops/source/text_file_op.h"
#include "dataset/engine/datasetops/source/clue_op.h"
#include "dataset/engine/datasetops/filter_op.h"
#include "mindrecord/include/shard_category.h"
#include "mindrecord/include/shard_distributed_sample.h"
@@ -72,7 +73,8 @@ static std::unordered_map<uint32_t, pFunction> g_parse_op_func_ = {{kStorage, &D
{kCelebA, &DEPipeline::ParseCelebAOp},
{kRandomData, &DEPipeline::ParseRandomDataOp},
{kTextFile, &DEPipeline::ParseTextFileOp},
{kBuildVocab, &DEPipeline::ParseBuildVocabOp}};
{kBuildVocab, &DEPipeline::ParseBuildVocabOp},
{kClue, &DEPipeline::ParseClueOp}};
DEPipeline::DEPipeline() : iterator_(nullptr) {
try {
@@ -1210,6 +1212,7 @@ Status DEPipeline::ParseTextFileOp(const py::dict &args, std::shared_ptr<Dataset
*ptr = op;
return Status::OK();
}
Status DEPipeline::ParsePadInfo(py::handle value, PadInfo *pad_info) {
for (auto p : py::reinterpret_borrow<py::dict>(value)) {
if (!p.second.is_none()) {
@@ -1236,6 +1239,7 @@ Status DEPipeline::ParsePadInfo(py::handle value, PadInfo *pad_info) {
}
return Status::OK();
}
Status DEPipeline::ParseBuildVocabOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
std::shared_ptr<BuildVocabOp::Builder> builder = std::make_shared<BuildVocabOp::Builder>();
for (auto arg : args) {
@@ -1267,5 +1271,45 @@ Status DEPipeline::ParseBuildVocabOp(const py::dict &args, std::shared_ptr<Datas
return Status::OK();
}
Status DEPipeline::ParseClueOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
std::shared_ptr<ClueOp::Builder> builder = std::make_shared<ClueOp::Builder>();
if (!args["dataset_files"].is_none()) {
(void)builder->SetClueFilesList(ToStringVector(args["dataset_files"]));
} else {
RETURN_STATUS_UNEXPECTED("Error: dataset_files is missing");
}
// Optional arguments
for (auto arg : args) {
std::string key = py::str(arg.first);
py::handle value = arg.second;
if (!value.is_none()) {
if (key == "num_parallel_workers") {
(void)builder->SetNumWorkers(ToInt(value));
} else if (key == "shuffle_files") {
(void)builder->SetShuffleFiles(ToBool(value));
} else if (key == "num_samples") {
(void)builder->SetNumSamples(ToInt(value));
} else if (key == "num_shards") {
(void)builder->SetNumDevices(ToInt(value));
} else if (key == "shard_id") {
(void)builder->SetDeviceId(ToInt(value));
} else if (key == "cols_to_keyword") {
std::map<std::string, std::string> map_dict;
for (auto p : py::reinterpret_borrow<py::dict>(value)) {
if (!p.second.is_none()) {
map_dict.insert({ToString(p.first), ToString(p.second)});
} else {
map_dict.insert({ToString(p.first), ToString(p.first)});
}
}
(void)builder->SetColsKeyMap(map_dict);
}
}
}
std::shared_ptr<ClueOp> op;
RETURN_IF_NOT_OK(builder->Build(&op));
*ptr = op;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
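For orientation, a minimal Python-side sketch (not part of this commit) of the kwargs dict that ParseClueOp above reads: "dataset_files" is required, the other keys are optional, and a None value in "cols_to_keyword" makes the column name double as the JSON keyword. Paths and values here are placeholders.

clue_args = {
    "dataset_files": ["/path/to/afqmc/train.json"],  # required; placeholder path
    "num_parallel_workers": 4,
    "shuffle_files": True,
    "num_shards": 1,
    "shard_id": 0,
    # column name -> keyword inside each JSON line; None falls back to the column name itself
    "cols_to_keyword": {"sentence1": None, "sentence2": None, "label": "label"},
}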

@@ -64,7 +64,8 @@ enum OpName {
kCelebA,
kRandomData,
kTextFile,
kBuildVocab
kBuildVocab,
kClue
};
// The C++ binder class that we expose to the python script.
@@ -166,6 +167,8 @@ class DEPipeline {
Status ParseBuildVocabOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
Status ParseClueOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
private:
// Execution tree that links the dataset operators.
std::shared_ptr<ExecutionTree> tree_;

@@ -55,6 +55,7 @@
#include "dataset/engine/datasetops/source/tf_reader_op.h"
#include "dataset/engine/jagged_connector.h"
#include "dataset/engine/datasetops/source/text_file_op.h"
#include "dataset/engine/datasetops/source/clue_op.h"
#include "dataset/engine/datasetops/source/voc_op.h"
#include "dataset/engine/datasetops/source/coco_op.h"
#include "dataset/engine/gnn/graph.h"
@@ -201,6 +202,18 @@ void bindDatasetOps(py::module *m) {
THROW_IF_ERROR(TextFileOp::CountAllFileRows(filenames, &count));
return count;
});
(void)py::class_<ClueOp, DatasetOp, std::shared_ptr<ClueOp>>(*m, "ClueOp")
.def_static("get_num_rows", [](const py::list &files) {
int64_t count = 0;
std::vector<std::string> filenames;
for (auto file : files) {
file.is_none() ? (void)filenames.emplace_back("") : filenames.push_back(py::str(file));
}
THROW_IF_ERROR(ClueOp::CountAllFileRows(filenames, &count));
return count;
});
(void)py::class_<VOCOp, DatasetOp, std::shared_ptr<VOCOp>>(*m, "VOCOp")
.def_static("get_num_rows",
[](const std::string &dir, const std::string &task_type, const std::string &task_mode,
@@ -629,7 +642,8 @@ PYBIND11_MODULE(_c_dataengine, m) {
.value("RANDOMDATA", OpName::kRandomData)
.value("BUILDVOCAB", OpName::kBuildVocab)
.value("CELEBA", OpName::kCelebA)
.value("TEXTFILE", OpName::kTextFile);
.value("TEXTFILE", OpName::kTextFile)
.value("CLUE", OpName::kClue);
(void)py::enum_<JiebaMode>(m, "JiebaMode", py::arithmetic())
.value("DE_JIEBA_MIX", JiebaMode::kMix)

@@ -19,4 +19,5 @@ add_library(engine-datasetops-source OBJECT
random_data_op.cc
celeba_op.cc
text_file_op.cc
clue_op.cc
)

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -43,7 +43,7 @@ TextFileOp::Builder::Builder()
Status TextFileOp::Builder::ValidateInputs() const {
std::string err_msg;
err_msg += builder_num_workers_ <= 0 ? "Number of parallel workers should be greate than 0\n" : "";
err_msg += builder_num_workers_ <= 0 ? "Number of parallel workers should be greater than 0\n" : "";
err_msg += builder_device_id_ >= builder_num_devices_ || builder_num_devices_ < 1 ? "Wrong sharding configs\n" : "";
return err_msg.empty() ? Status::OK() : Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg);
}

@@ -21,7 +21,7 @@ can also create samplers with this module to sample data.
from .core.configuration import config
from .engine.datasets import TFRecordDataset, ImageFolderDatasetV2, MnistDataset, MindDataset, NumpySlicesDataset, \
GeneratorDataset, ManifestDataset, Cifar10Dataset, Cifar100Dataset, VOCDataset, CocoDataset, CelebADataset,\
TextFileDataset, Schema, Shuffle, zip, RandomDataset
TextFileDataset, CLUEDataset, Schema, Shuffle, zip, RandomDataset
from .engine.samplers import DistributedSampler, PKSampler, RandomSampler, SequentialSampler, SubsetRandomSampler, \
WeightedRandomSampler, Sampler
from .engine.serializer_deserializer import serialize, deserialize, show
@@ -29,6 +29,6 @@ from .engine.graphdata import GraphData
__all__ = ["config", "ImageFolderDatasetV2", "MnistDataset",
"MindDataset", "GeneratorDataset", "TFRecordDataset",
"ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset", "NumpySlicesDataset",
"VOCDataset", "CocoDataset", "TextFileDataset", "Schema", "DistributedSampler", "PKSampler", "RandomSampler",
"SequentialSampler", "SubsetRandomSampler", "WeightedRandomSampler", "zip", "GraphData"]
"ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset", "NumpySlicesDataset", "VOCDataset",
"CocoDataset", "TextFileDataset", "CLUEDataset", "Schema", "DistributedSampler", "PKSampler",
"RandomSampler", "SequentialSampler", "SubsetRandomSampler", "WeightedRandomSampler", "zip", "GraphData"]

@@ -30,7 +30,7 @@ from ..core.configuration import config, ConfigurationManager
__all__ = ["config", "ConfigurationManager", "zip",
"ImageFolderDatasetV2", "MnistDataset",
"MindDataset", "GeneratorDataset", "TFRecordDataset",
"MindDataset", "GeneratorDataset", "TFRecordDataset", "CLUEDataset",
"ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset",
"VOCDataset", "CocoDataset", "TextFileDataset", "BuildVocabDataset", "Schema", "Schema",
"DistributedSampler", "PKSampler",

@@ -33,7 +33,7 @@ import copy
import numpy as np
from mindspore._c_dataengine import DataType, TFReaderOp, ImageFolderOp, CifarOp, MnistOp, ManifestOp, \
MindRecordOp, TextFileOp, VOCOp, CocoOp, CBatchInfo
MindRecordOp, TextFileOp, ClueOp, VOCOp, CocoOp, CBatchInfo
from mindspore._c_expression import typing
from mindspore import log as logger
@@ -44,7 +44,7 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che
check_take, check_project, check_imagefolderdatasetv2, check_mnist_cifar_dataset, check_manifestdataset, \
check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \
check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \
check_split
check_split, check_cluedataset
from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist
try:
@@ -4317,6 +4317,222 @@ class CelebADataset(MappableDataset):
return self.sampler.is_sharded()
class CLUEDataset(SourceDataset):
"""
A source dataset that reads and parses CLUE datasets.
CLUE, the Chinese Language Understanding Evaluation benchmark, is a collection of datasets, baselines, pre-trained
models, corpora and a leaderboard. This class brings in the classification tasks of CLUE: AFQMC, TNEWS, IFLYTEK,
CMNLI, WSC and CSL.
Args:
dataset_files (str or list[str]): String or list of files to be read, or glob strings to search for a pattern of
files. The list will be sorted in lexicographical order.
task (str, optional): The kind of task, one of 'AFQMC', 'TNEWS', 'IFLYTEK', 'CMNLI', 'WSC' or 'CSL'
(default='AFQMC').
usage (str, optional): Which split of the data to read, one of 'train', 'test' or 'eval' (default='train').
num_samples (int, optional): Number of samples (rows) to read (default=None, reads the full dataset).
num_parallel_workers (int, optional): Number of workers to read the data
(default=None, uses the number set in the config).
shuffle (bool or Shuffle level, optional): Perform reshuffling of the data every epoch (default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed.
If shuffle is True, the behavior is the same as setting shuffle to Shuffle.GLOBAL.
Otherwise, there are two levels of shuffling:
- Shuffle.GLOBAL: Shuffle both the files and the samples.
- Shuffle.FILES: Shuffle files only.
num_shards (int, optional): Number of shards that the dataset should be divided into (default=None).
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument should be specified only when num_shards is also specified.
Examples:
>>> import mindspore.dataset as ds
>>> dataset_files = ["/path/to/1", "/path/to/2"] # contains one or multiple text files
>>> dataset = ds.CLUEDataset(dataset_files=dataset_files, task='AFQMC', usage='train')
"""
@check_cluedataset
def __init__(self, dataset_files, task='AFQMC', usage='train', num_samples=None,
num_parallel_workers=None, shuffle=Shuffle.GLOBAL, num_shards=None, shard_id=None):
super().__init__(num_parallel_workers)
self.dataset_files = self._find_files(dataset_files)
self.dataset_files.sort()
self.num_samples = num_samples
self.task_dict = {
'AFQMC': {
'train': {
'sentence1': 'sentence1',
'sentence2': 'sentence2',
'label': 'label'
},
'test': {
'id': 'id',
'sentence1': 'sentence1',
'sentence2': 'sentence2'
},
'eval': {
'sentence1': 'sentence1',
'sentence2': 'sentence2',
'label': 'label'
}
},
'CMNLI': {
'train': {
'sentence1': 'sentence1',
'sentence2': 'sentence2',
'label': 'label'
},
'test': {
'id': 'id',
'sentence1': 'sentence1',
'sentence2': 'sentence2'
},
'eval': {
'sentence1': 'sentence1',
'sentence2': 'sentence2',
'label': 'label'
}
},
'CSL': {
'train': {
'id': 'id',
'abst': 'abst',
'keyword': 'keyword',
'label': 'label'
},
'test': {
'id': 'id',
'abst': 'abst',
'keyword': 'keyword'
},
'eval': {
'id': 'id',
'abst': 'abst',
'keyword': 'keyword',
'label': 'label'
}
},
'IFLYTEK': {
'train': {
'label': 'label',
'label_des': 'label_des',
'sentence': 'sentence'
},
'test': {
'id': 'id',
'sentence': 'sentence',
},
'eval': {
'label': 'label',
'label_des': 'label_des',
'sentence': 'sentence'
}
},
'TNEWS': {
'train': {
'label': 'label',
'label_desc': 'label_desc',
'sentence': 'sentence',
'keywords': 'keywords'
},
'test': {
'id': 'id',
'sentence': 'sentence',
'keywords': 'keywords'
},
'eval': {
'label': 'label',
'label_desc': 'label_desc',
'sentence': 'sentence',
'keywords': 'keywords'
}
},
'WSC': {
'train': {
'span1_index': 'target/span1_index',
'span2_index': 'target/span2_index',
'span1_text': 'target/span1_text',
'span2_text': 'target/span2_text',
'idx': 'idx',
'label': 'label',
'text': 'text'
},
'test': {
'span1_index': 'target/span1_index',
'span2_index': 'target/span2_index',
'span1_text': 'target/span1_text',
'span2_text': 'target/span2_text',
'idx': 'idx',
'text': 'text'
},
'eval': {
'span1_index': 'target/span1_index',
'span2_index': 'target/span2_index',
'span1_text': 'target/span1_text',
'span2_text': 'target/span2_text',
'idx': 'idx',
'label': 'label',
'text': 'text'
}
}
}
self.cols_to_keyword = self.task_dict[task][usage]
if not isinstance(shuffle, (bool, Shuffle)):
raise TypeError("shuffle should be of type boolean or enum 'Shuffle'.")
if not isinstance(shuffle, Shuffle):
if shuffle:
self.shuffle_level = Shuffle.GLOBAL
self.shuffle_files = True
else:
self.shuffle_level = None
self.shuffle_files = False
else:
self.shuffle_level = shuffle
self.shuffle_files = True
self.num_shards = num_shards
self.shard_id = shard_id
def get_args(self):
args = super().get_args()
args["dataset_files"] = self.dataset_files
args["num_samples"] = self.num_samples
if self.shuffle_files is not None:
args["shuffle_files"] = self.shuffle_files
args["shuffle"] = self.shuffle_level
args["num_shards"] = self.num_shards
args["shard_id"] = self.shard_id
args["cols_to_keyword"] = self.cols_to_keyword
return args
def get_dataset_size(self):
"""
Get the number of batches in an epoch.
Return:
Number, number of batches.
"""
if self._dataset_size is None:
num_rows = ClueOp.get_num_rows(self.dataset_files)
num_rows = get_num_rows(num_rows, self.num_shards)
if self.num_samples is None:
return num_rows
return min(self.num_samples, num_rows)
return self._dataset_size
def is_shuffled(self):
return self.shuffle_files
def is_sharded(self):
if self.num_shards is not None:
return self.num_shards > 1
return False
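Building on the docstring example, a short usage sketch (not part of this commit; paths are placeholders): with task='AFQMC' and usage='train', the output columns follow task_dict above, i.e. sentence1, sentence2 and label.

import mindspore.dataset as ds

# Read an AFQMC training file, shuffling only at file level.
dataset = ds.CLUEDataset(dataset_files=["/path/to/afqmc/train.json"],
                         task='AFQMC', usage='train', shuffle=ds.Shuffle.FILES)
print(dataset.get_dataset_size())
for row in dataset.create_dict_iterator():
    print(row["sentence1"], row["sentence2"], row["label"])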
class TextFileDataset(SourceDataset):
"""
A source dataset that reads and parses datasets stored on disk in text format.

@@ -50,7 +50,8 @@ def alter_tree(node):
def _alter_node(node):
"""Performing some alteration to a dataset node. A common alteration is to insert a node."""
if isinstance(node, (de.TFRecordDataset, de.TextFileDataset)) and node.shuffle_level == de.Shuffle.GLOBAL:
if isinstance(node, (de.TFRecordDataset, de.TextFileDataset, de.CLUEDataset)) \
and node.shuffle_level == de.Shuffle.GLOBAL:
# Remove the connection between the parent's node to the current node because we are inserting a node.
if node.output:
node.output.pop()
@@ -179,6 +180,8 @@ class Iterator:
op_type = OpName.TEXTFILE
elif isinstance(dataset, de.BuildVocabDataset):
op_type = OpName.BUILDVOCAB
elif isinstance(dataset, de.CLUEDataset):
op_type = OpName.CLUE
else:
raise ValueError("Unsupported DatasetOp")

@@ -1075,6 +1075,41 @@ def check_add_column(method):
return new_method
def check_cluedataset(method):
"""A wrapper that wrap a parameter checker to the original Dataset(CLUEDataset)."""
@wraps(method)
def new_method(*args, **kwargs):
param_dict = make_param_dict(method, args, kwargs)
nreq_param_int = ['num_samples', 'num_parallel_workers', 'num_shards', 'shard_id']
# check dataset_files; required argument
dataset_files = param_dict.get('dataset_files')
if dataset_files is None:
raise ValueError("dataset_files is not provided.")
if not isinstance(dataset_files, (str, list)):
raise TypeError("dataset_files should be of type str or a list of strings.")
# check task
task_param = param_dict.get('task')
if task_param not in ['AFQMC', 'TNEWS', 'IFLYTEK', 'CMNLI', 'WSC', 'CSL']:
raise ValueError("task should be AFQMC, TNEWS, IFLYTEK, CMNLI, WSC or CSL")
# check usage
usage_param = param_dict.get('usage')
if usage_param not in ['train', 'test', 'eval']:
raise ValueError("usage should be train, test or eval")
check_param_type(nreq_param_int, param_dict, int)
check_sampler_shuffle_shard_options(param_dict)
return method(*args, **kwargs)
return new_method
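For illustration (not part of this commit), the checker above rejects an unsupported task with a ValueError; a minimal sketch with a deliberately invalid task name:

import mindspore.dataset as ds

try:
    ds.CLUEDataset(dataset_files="/path/to/afqmc/train.json", task='QA', usage='train')
except ValueError as err:
    print(err)  # task should be AFQMC, TNEWS, IFLYTEK, CMNLI, WSC or CSL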
def check_textfiledataset(method):
"""A wrapper that wrap a parameter checker to the original Dataset(TextFileDataset)."""

@@ -65,6 +65,7 @@ SET(DE_UT_SRCS
cifar_op_test.cc
celeba_op_test.cc
take_op_test.cc
clue_op_test.cc
text_file_op_test.cc
filter_op_test.cc
concat_op_test.cc

@@ -0,0 +1,117 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include <memory>
#include <vector>
#include "dataset/core/client.h"
#include "common/common.h"
#include "common/utils.h"
#include "gtest/gtest.h"
#include "utils/log_adapter.h"
#include "dataset/engine/datasetops/source/clue_op.h"
#include "dataset/util/status.h"
namespace common = mindspore::common;
using namespace mindspore::dataset;
using mindspore::MsLogLevel::INFO;
using mindspore::ExceptionType::NoExceptionType;
using mindspore::LogStream;
class MindDataTestCLUEOp : public UT::DatasetOpTesting {
};
TEST_F(MindDataTestCLUEOp, TestCLUEBasic) {
// Start with an empty execution tree
auto tree = std::make_shared<ExecutionTree>();
std::string dataset_path;
dataset_path = datasets_root_path_ + "/testCLUE/afqmc/train.json";
std::map<std::string, std::string> key_map;
key_map["sentence1"] = "sentence1";
key_map["sentence2"] = "sentence2";
key_map["label"] = "label";
std::shared_ptr<ClueOp> op;
ClueOp::Builder builder;
builder.SetClueFilesList({dataset_path})
.SetRowsPerBuffer(16)
.SetNumWorkers(16)
.SetOpConnectorSize(2)
.SetColsKeyMap(key_map);
Status rc = builder.Build(&op);
ASSERT_TRUE(rc.IsOk());
rc = tree->AssociateNode(op);
ASSERT_TRUE(rc.IsOk());
rc = tree->AssignRoot(op);
ASSERT_TRUE(rc.IsOk());
MS_LOG(INFO) << "Launching tree and begin iteration.";
rc = tree->Prepare();
ASSERT_TRUE(rc.IsOk());
rc = tree->Launch();
ASSERT_TRUE(rc.IsOk());
// Start the loop of reading tensors from our pipeline
DatasetIterator di(tree);
TensorRow tensor_list;
rc = di.FetchNextTensorRow(&tensor_list);
ASSERT_TRUE(rc.IsOk());
int row_count = 0;
while (!tensor_list.empty()) {
// Display the tensor by calling the printer on it
for (int i = 0; i < tensor_list.size(); i++) {
std::ostringstream ss;
ss << "(" << tensor_list[i] << "): " << *tensor_list[i] << std::endl;
MS_LOG(INFO) << "Tensor print: " << ss.str() << ".";
}
rc = di.FetchNextTensorRow(&tensor_list);
ASSERT_TRUE(rc.IsOk());
row_count++;
}
ASSERT_EQ(row_count, 3);
}
TEST_F(MindDataTestCLUEOp, TestTotalRows) {
std::string tf_file1 = datasets_root_path_ + "/testCLUE/afqmc/train.json";
std::string tf_file2 = datasets_root_path_ + "/testCLUE/afqmc/dev.json";
std::vector<std::string> files;
files.push_back(tf_file1);
int64_t total_rows = 0;
ClueOp::CountAllFileRows(files, &total_rows);
ASSERT_EQ(total_rows, 3);
files.clear();
files.push_back(tf_file2);
ClueOp::CountAllFileRows(files, &total_rows);
ASSERT_EQ(total_rows, 3);
files.clear();
files.push_back(tf_file1);
files.push_back(tf_file2);
ClueOp::CountAllFileRows(files, &total_rows);
ASSERT_EQ(total_rows, 6);
files.clear();
}

@@ -0,0 +1,3 @@
{"sentence1": "你有花呗吗", "sentence2": "我的花呗没额度了", "label": "0"}
{"sentence1": "吃饭能用花呗吗", "sentence2": "花呗太方便了", "label": "0"}
{"sentence1": "蚂蚁花呗支付金额有什么限制", "sentence2": "我到实体店消费用花呗支付受金额限制", "label": "1"}

@@ -0,0 +1,3 @@
{"id": 0, "sentence1": "借呗取消的时间", "sentence2": "蚂蚁借呗恢复的月数"}
{"id": 1, "sentence1": "网商贷用什么方法转变成借呗", "sentence2": "什么手段能将网商贷切换为借呗"}
{"id": 2, "sentence1": "我的借呗为什么开通不了", "sentence2": "我为啥没法开通借呗"}

@@ -0,0 +1,3 @@
{"sentence1": "蚂蚁借呗等额还款能否换成先息后本", "sentence2": "借呗可以先息到期还本吗", "label": "0"}
{"sentence1": "蚂蚁花呗说我违约了", "sentence2": "蚂蚁花呗违约行为是啥", "label": "0"}
{"sentence1": "帮我看看本月花呗账单结清了没", "sentence2": "上月的花呗账单", "label": "0"}

@@ -0,0 +1,3 @@
{"sentence1": "每个人都有权利", "sentence2": "每个人都有福利", "label": "neutral"}
{"sentence1": "有时候我喜欢他,但我也喜欢看到有人打他", "sentence2": "说实话,我有点喜欢他,但还是喜欢看到有人打他。", "label": "entailment"}
{"sentence1": "我最喜欢的餐馆是离你最近的一家", "sentence2": "我最喜欢的餐馆离你家至少一百英里远。", "label": "contradiction"}

@@ -0,0 +1,3 @@
{"id": 0, "sentence1": "今天,全球都在看着最新航天飞机的处女航。", "sentence2": "全世界都在看最新的航天飞机发射。"}
{"id": 1, "sentence1": "而我们把竹篮放在一个地方,把玻璃瓶放在另一处,把书放在另一处,满了要把它放到车里", "sentence2": "我们没有分开任何东西,都把它全扔进一个箱子里。"}
{"id": 2, "sentence1": "她占用了我的很多时间,她给我读了很多关于灵异的故事,我觉得很无聊。", "sentence2": "我喜欢和她一起读鬼故事。"}

@@ -0,0 +1,3 @@
{"sentence1": "你应该给这件衣服定一个价格。", "sentence2": "不同的衣服有不同的价格。", "label": "neutral"}
{"sentence1": "我怎么知道他要说什么", "sentence2": "他说什么我并不知道。", "label": "entailment"}
{"sentence1": "向左。", "sentence2": "向右。", "label": "contradiction"}

@@ -0,0 +1,3 @@
{"id": 1, "abst": "这是第一段很长的文本", "keyword": ["关键词1", "关键词2", "关键词3", "关键词4"], "label": "1"}
{"id": 2, "abst": "这是第二段很长的文本", "keyword": ["关键词1", "关键词2", "关键词3", "关键词4"], "label": "1"}
{"id": 3, "abst": "这是第三段很长的文本", "keyword": ["1", "2", "3"], "label": "0"}

@@ -0,0 +1,3 @@
{"id": 2415, "abst": "长文本1", "keyword": ["关键词1", "关键词2"]}
{"id": 2565, "abst": "长文本2", "keyword": ["关键词1", "关键词2", "关键词3"]}
{"id": 2625, "abst": "长文本3", "keyword": ["关键词1", "关键词2", "关键词3", "关键词4"]}

@@ -0,0 +1,3 @@
{"id": 1, "abst": "这是一段长文本", "keyword": ["关键词1", "关键词2", "关键词3", "关键词4"], "label": "0"}
{"id": 2, "abst": "这是一段长文本", "keyword": ["关键词5", "关键词6", "关键词7", "关键词8"], "label": "0"}
{"id": 3, "abst": "这是一段长文本", "keyword": ["关键词9", "关键词10", "关键词11", "关键词12"], "label": "0"}

@@ -0,0 +1,3 @@
{"label": "110", "label_des": "社区超市", "sentence": "这是第一段文本"}
{"label": "70", "label_des": "工具", "sentence": "这是第二段文本"}
{"label": "10", "label_des": "社区服务", "sentence": "这是第三段文本"}

@@ -0,0 +1,3 @@
{"id": 0, "sentence": "文本1"}
{"id": 1, "sentence": "文本2"}
{"id": 2, "sentence": "文本3"}

Some files were not shown because too many files have changed in this diff.
