Update conll05.py

the label file in the conll05 dataset has a wrong order
fix_conll05_bug
jiaozhenyu 7 years ago committed by GitHub
parent c490f1b38b
commit 0d8192fbec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -41,6 +41,28 @@ EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
UNK_IDX = 0
def load_label_dict(filename):
d = dict()
tag_dict = set()
with open(filename, 'r') as f:
for i, line in enumerate(f):
line = line.strip()
if line.startswith("B-"):
tag_dict.add(line[2:])
elif line.startswith("I-"):
tag_dict.add(line[2:])
else:
continue
index = 0
for tag in tag_dict:
d["B-" + tag] = index
index += 1
d["I-" + tag] = index
index += 1
d["O"] = index
return d
def load_dict(filename):
d = dict()
with open(filename, 'r') as f:
@ -188,7 +210,7 @@ def get_dict():
verb_dict = load_dict(
paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
VERBDICT_MD5))
label_dict = load_dict(
label_dict = load_label_dict(
paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
TRGDICT_MD5))
return word_dict, verb_dict, label_dict

Loading…
Cancel
Save