Paddle/demo/quick_start/preprocess.py

# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Preprocess raw review data:
1. remove HTML before tokenizing
2. positive samples: rating score 5; negative samples: rating score 1-2.
3. size of pos : neg = 1:1.
4. size of the test set = min(25k, len(all_data) * 0.1); the rest is the training set.
5. the train set and test set are de-duplicated and disjoint.
Usage:
python preprocess.py -i data_file [-s seed]
'''
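
# Example invocation (the input path below is illustrative; any gzipped
# review file in the expected format works):
#   python preprocess.py -i ./data/reviews.json.gz -s 1024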
import sys
import os
import re
import operator
import gzip
import math
import random
import numpy as np
from bs4 import BeautifulSoup
from subprocess import Popen, PIPE
from optparse import OptionParser

def parse(path):
    """
    Open a .gz file and yield one parsed record per line.
    """
    g = gzip.open(path, 'r')
    for l in g:
        # Each line holds one review as a Python dict literal
        # (with keys such as "overall" and "reviewText").
        yield eval(l)
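
# Illustrative input line (field values are made up):
#   {"overall": 5.0, "reviewText": "Works great.", ...}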

def clean(review):
    """
    Clean an input review: remove HTML, convert words to lower case.
    """
    # Remove HTML
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Convert words to lower case
    review_text = review_text.lower()
    return review_text
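
# For example, clean('<b>Great phone!</b>') returns 'great phone!'.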

def tokenize(sentences):
    """
    Use tokenizer.perl to tokenize input sentences.
    tokenizer.perl is a tool from Moses.
    sentences: a list of input sentences.
    return: a list of processed text.
    """
    tokenizer_path = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
    tokenizer_cmd = [tokenizer_path, '-l', 'en', '-q', '-']
    assert isinstance(sentences, list)
    text = "\n".join(sentences)
    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text)
    # Drop the trailing empty string produced by the final newline.
    toks = tok_text.split('\n')[:-1]
    return toks
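
# The Moses tokenizer typically separates punctuation and clitics,
# e.g. (output may vary by tokenizer version):
#   tokenize(["it's great."]) -> ["it 's great ."]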

def create_dict(data, data_dir):
    """
    Create a word-count dictionary from data and save it to data_dir/dict.txt.
    The first line is unk \t -1.
    data: list, input data.
    data_dir: path to save the dict.
    """
    word_count = {}
    for seq in data:
        try:
            for w in seq.lower().split():
                if w not in word_count:
                    word_count[w] = 1
                else:
                    word_count[w] += 1
        except Exception:
            sys.stderr.write(seq + "\tERROR\n")
    f = open(os.path.join(data_dir, 'dict.txt'), 'w')
    f.write('%s\t%s\n' % ('unk', '-1'))
    # Write words sorted by frequency, most frequent first.
    for k, v in sorted(word_count.items(), key=operator.itemgetter(1),
                       reverse=True):
        f.write('%s\t%s\n' % (k, v))
    f.close()
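
# dict.txt layout: one "word<TAB>count" pair per line, sorted by count
# (the counts below are illustrative):
#   unk -1
#   the 120393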

def save_data(data, data_dir, prefix=""):
    """Save data to data_dir/<prefix>.txt and list its path in <prefix>.list."""
    file_name = os.path.join(data_dir, "%s.txt" % prefix)
    with open(file_name, 'w') as f:
        f.write('\n'.join(data) + '\n')
    with open(os.path.join(data_dir, prefix + '.list'), 'w') as f:
        f.write('%s\n' % file_name)

def split_data(raw_txt):
    """
    Extract positive and negative samples from the raw records.
    """
    pos = []
    neg = []
    count = 0
    sys.stderr.write("extract raw data")
    for l in raw_txt:
        rating = l["overall"]
        text = clean(l["reviewText"])
        if rating == 5.0 and text:
            pos.append(text)
        if rating < 3.0 and text:
            neg.append(text)
        count += 1
        # Print a progress dot every 20000 records.
        if count % 20000 == 0:
            sys.stderr.write(".")
    sys.stderr.write("\n")
    return pos, neg

def preprocess(pos_in, neg_in, data_dir, rand_seed):
    # tokenize
    sys.stderr.write("tokenize...\n")
    tmppos = tokenize(pos_in)
    tmpneg = tokenize(neg_in)
    cnt = len(tmppos) + len(tmpneg)
    # de-duplicate samples
    tmppos = list(set(tmppos))
    tmpneg = list(set(tmpneg))
    dup_cnt = cnt - len(tmppos) - len(tmpneg)
    sys.stderr.write("\ntotal size of data set: %d, duplicate data: %d\n" %
                     (cnt, dup_cnt))
    # keep the same number of positive and negative samples
    min_len = min(len(tmppos), len(tmpneg))
    tmppos = tmppos[0:min_len]
    tmpneg = tmpneg[0:min_len]
    # create dictionary
    sys.stderr.write("create dict with train and test data...\n")
    all_data = tmppos + tmpneg
    create_dict(all_data, data_dir)
    # label the samples: 1 for positive, 0 for negative
    sys.stderr.write("split data...\n")
    pos = ["1\t" + i for i in tmppos]
    neg = ["0\t" + i for i in tmpneg]
    random.seed(rand_seed)
    random.shuffle(pos)
    random.shuffle(neg)
    # split into test set and train set
    test_len = min(12500, int(min_len * 0.1))
    test = pos[0:test_len] + neg[0:test_len]
    train = pos[test_len:] + neg[test_len:]
    # save data
    sys.stderr.write("save data...\n")
    save_data(train, data_dir, prefix='train')
    save_data(test, data_dir, prefix='test')
    with open(os.path.join(data_dir, 'labels.list'), 'w') as f:
        f.write('neg\t0\npos\t1\n')
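
# After preprocess() runs, data_dir contains train.txt, test.txt,
# train.list, test.list, dict.txt and labels.list.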

def option_parser():
    parser = OptionParser(usage="usage: python preprocess.py "
                                "-i data_path [options]")
    parser.add_option("-i", "--data", action="store",
                      dest="input", help="Input data path.")
    parser.add_option("-s", "--seed", action="store",
                      dest="seed", default=1024,
                      help="Set random seed.")
    return parser.parse_args()

def main():
    # Python 2: make utf-8 the default encoding for implicit conversions.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    options, args = option_parser()
    data = options.input
    seed = options.seed
    data_dir = os.path.dirname(data)
    pos, neg = split_data(parse(data))
    preprocess(pos, neg, data_dir, seed)
    sys.stderr.write("Done.\n")

if __name__ == '__main__':
    main()