You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
46 lines
1.5 KiB
46 lines
1.5 KiB
9 years ago
|
from paddle.trainer.PyDataProvider2 import *
|
||
|
|
||
|
|
||
|
def on_init(settings, dictionary, **kwargs):
    """Initialization hook invoked when the data provider is initialized.

    Sets the provider's input types at runtime (the same effect as passing
    ``input_types`` to ``@provider``, but decided dynamically) and saves the
    dictionary on ``settings`` so the ``process`` method can use it.

    Args:
        settings: Provider settings object; ``input_types`` and
            ``dictionary`` are assigned on it.
        dictionary: Dict passed from trainer_config, mapping
            word string => word id.
        **kwargs: Any other init arguments; not used here.
    """
    settings.input_types = [
        # The text is a sequence of integer values, and each value is a word
        # id. The whole sequence is the sentence whose sentiment we want to
        # predict. seq_type must be a SequenceType member (SEQUENCE), not the
        # SequenceType class itself.
        integer_value(len(dictionary), seq_type=SequenceType.SEQUENCE),

        # label positive/negative
        integer_value(2),
    ]

    # Save dictionary as settings.dictionary. It will be used in the process
    # method.
    settings.dictionary = dictionary
|
||
|
|
||
|
|
||
|
@provider(init_hook=on_init)
def process(settings, filename):
    """Yield one ``(word_ids, label)`` sample per data line of ``filename``.

    Each non-blank line is expected to hold a label and a sentence separated
    by a single tab character; the sentence is split into words on single
    spaces. Words that are not in ``settings.dictionary`` are ignored.

    Args:
        settings: Provider settings object; ``settings.dictionary`` (word
            string => word id) is read for the word lookup.
        filename: Path of the text file to read.

    Yields:
        A tuple ``(word_ids, label)`` where ``word_ids`` is the list of ids
        of the known words in the sentence and ``label`` is the line's label
        converted with ``int``.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields, or its label is not an integer.
    """
    # The context manager closes the file even if the consumer stops
    # iterating early or a line fails to parse.
    with open(filename, 'r') as f:
        for line in f:  # read each line of file
            # Drop the line terminator; otherwise the last word would carry
            # a trailing newline and never match a dictionary key.
            line = line.rstrip('\r\n')
            if not line:
                # Skip blank lines (e.g. a trailing empty line at EOF).
                continue

            label, sentence = line.split('\t')  # get label and sentence
            words = sentence.split(' ')  # get words

            # Convert word strings to word ids; words that are not in the
            # dictionary are ignored.
            word_ids = [
                settings.dictionary[each_word]
                for each_word in words
                if each_word in settings.dictionary
            ]

            # give data to paddle.
            yield word_ids, int(label)
|