You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Paddle/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py

200 lines
7.9 KiB

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class Segment(object):
def __init__(self, chunk_type, start_idx, end_idx):
self.chunk_type = chunk_type
self.start_idx = start_idx
self.end_idx = end_idx
def __str__(self):
return '(Segment: %s, %s, %s)' % (self.chunk_type, self.start_idx,
self.end_idx)
__repr__ = __str__
class TestChunkEvalOp(OpTest):
num_sequences = 5
batch_size = 50
def parse_scheme(self):
if self.scheme == 'IOB':
self.num_tag_types = 2
elif self.scheme == 'IOE':
self.num_tag_types = 2
def fill_with_chunks(self, data, chunks):
for chunk in chunks:
if self.scheme == 'IOB':
data[chunk.start_idx] = chunk.chunk_type * self.num_tag_types
data[chunk.start_idx + 1:
chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
self.num_tag_types - 1)
data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
self.num_tag_types - 1
) if chunk.start_idx < chunk.end_idx else data[chunk.start_idx]
elif self.scheme == 'IOE':
data[chunk.start_idx:
chunk.end_idx] = chunk.chunk_type * self.num_tag_types
data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
self.num_tag_types - 1)
def rand_chunks(self, starts, num_chunks):
if num_chunks < 0:
num_chunks = np.random.randint(starts[-1])
chunks = []
# generate chunk beginnings
chunk_begins = sorted(
np.random.choice(
range(starts[-1]), num_chunks, replace=False))
seq_chunk_begins = []
begin_idx = 0
# divide chunks into sequences
for i in range(len(starts) - 1):
tmp_chunk_begins = []
while begin_idx < len(chunk_begins) and chunk_begins[
begin_idx] < starts[i + 1]:
tmp_chunk_begins.append(chunk_begins[begin_idx])
begin_idx += 1
seq_chunk_begins.append(tmp_chunk_begins)
# generate chunk ends
chunk_ends = []
for i in range(len(seq_chunk_begins)):
for j in range(len(seq_chunk_begins[i])):
low = seq_chunk_begins[i][j]
high = seq_chunk_begins[i][j + 1] if j < len(seq_chunk_begins[
i]) - 1 else starts[i + 1]
chunk_ends.append(np.random.randint(low, high))
# generate chunks
for chunk_pos in zip(chunk_begins, chunk_ends):
chunk_type = np.random.randint(self.num_chunk_types)
chunks.append(Segment(chunk_type, *chunk_pos))
return chunks
def gen_chunks(self, infer, label, starts):
chunks = self.rand_chunks(starts,
self.num_infer_chunks + self.num_label_chunks
- self.num_correct_chunks)
correct_chunks = np.random.choice(
range(len(chunks)), self.num_correct_chunks, replace=False)
infer_chunks = np.random.choice(
[x for x in range(len(chunks)) if x not in correct_chunks],
self.num_infer_chunks - self.num_correct_chunks,
replace=False)
infer_chunks = sorted(correct_chunks.tolist() + infer_chunks.tolist())
label_chunks = np.random.choice(
[x for x in range(len(chunks)) if x not in infer_chunks],
self.num_label_chunks - self.num_correct_chunks,
replace=False)
label_chunks = sorted(correct_chunks.tolist() + label_chunks.tolist())
self.fill_with_chunks(infer, [chunks[idx] for idx in infer_chunks])
self.fill_with_chunks(label, [chunks[idx] for idx in label_chunks])
# exclude types in excluded_chunk_types
if len(self.excluded_chunk_types) > 0:
for idx in correct_chunks:
if chunks[idx].chunk_type in self.excluded_chunk_types:
self.num_correct_chunks -= 1
for idx in infer_chunks:
if chunks[idx].chunk_type in self.excluded_chunk_types:
self.num_infer_chunks -= 1
for idx in label_chunks:
if chunks[idx].chunk_type in self.excluded_chunk_types:
self.num_label_chunks -= 1
return self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks
def set_confs(self):
# Use the IOB scheme and labels with 2 chunk types
self.scheme = 'IOB'
self.num_chunk_types = 2
self.excluded_chunk_types = []
self.other_chunk_type = self.num_chunk_types
self.attrs = {
'num_chunk_types': self.num_chunk_types,
'chunk_scheme': self.scheme,
'excluded_chunk_types': self.excluded_chunk_types
}
self.parse_scheme()
self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 4, 5, 9
def set_data(self):
infer = np.zeros((self.batch_size, )).astype('int64')
infer.fill(self.num_chunk_types * self.num_tag_types)
label = np.copy(infer)
starts = np.random.choice(
range(1, self.batch_size), self.num_sequences - 1,
replace=False).tolist()
starts.extend([0, self.batch_size])
starts = sorted(starts)
self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = self.gen_chunks(
infer, label, starts)
self.inputs = {
'Inference': (infer, [starts]),
'Label': (label, [starts])
}
precision = float(
self.num_correct_chunks
) / self.num_infer_chunks if self.num_infer_chunks else 0
recall = float(self.num_correct_chunks
) / self.num_label_chunks if self.num_label_chunks else 0
f1 = float(2 * precision * recall) / (
precision + recall) if self.num_correct_chunks else 0
self.outputs = {
'Precision': np.asarray(
[precision], dtype='float32'),
'Recall': np.asarray(
[recall], dtype='float32'),
'F1-Score': np.asarray(
[f1], dtype='float32'),
'NumInferChunks': np.asarray(
[self.num_infer_chunks], dtype='int64'),
'NumLabelChunks': np.asarray(
[self.num_label_chunks], dtype='int64'),
'NumCorrectChunks': np.asarray(
[self.num_correct_chunks], dtype='int64')
}
def setUp(self):
self.op_type = 'chunk_eval'
self.set_confs()
self.set_data()
def test_check_output(self):
self.check_output()
class TestChunkEvalOpWithExclude(TestChunkEvalOp):
def set_confs(self):
# Use the IOE scheme and labels with 3 chunk types
self.scheme = 'IOE'
self.num_chunk_types = 3
self.excluded_chunk_types = [1]
self.other_chunk_type = self.num_chunk_types
self.attrs = {
'num_chunk_types': self.num_chunk_types,
'chunk_scheme': self.scheme,
'excluded_chunk_types': self.excluded_chunk_types
}
self.parse_scheme()
self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 15, 18, 20
if __name__ == '__main__':
unittest.main()