# -*- coding: UTF-8 -*-

# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
|
|
1. (remove HTML before or not)tokensizing
|
|
2. pos sample : rating score 5; neg sample: rating score 1-2.
|
|
|
|
Usage:
|
|
python preprocess.py -i data_file [random seed]
|
|
"""

import sys
import os
import operator
import gzip
from subprocess import Popen, PIPE
from optparse import OptionParser
import json
from multiprocessing import Queue
from multiprocessing import Pool
import multiprocessing

batch_size = 5000
word_count = {}
num_tokenize = max(1, multiprocessing.cpu_count() - 2)  # parse + tokenize + save
max_queue_size = 8
parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)
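
# Pipeline overview: a single parse_batch() producer reads the gzipped reviews
# and puts (batch_id, sentences, 'pos'/'neg') tuples onto parse_queue;
# num_tokenize tokenize_batch() workers move tokenized batches onto
# tokenize_queue; one save_batch() consumer writes each batch to disk and
# builds dict.txt. A batch_id of -1 is the end-of-stream sentinel on both queues.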


def create_dict(data):
    """
    Update the word-count dictionary from one batch of data; the counts are
    later saved to data_dir/dict.txt, whose first line is "unk \t -1".
    data: list, one batch of input sentences.
    """
    for seq in data:
        try:
            for w in seq.lower().split():
                if w not in word_count:
                    word_count[w] = 1
                else:
                    word_count[w] += 1
        except Exception:
            sys.stderr.write(seq + "\tERROR\n")
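
# Note: word_count is a module-level dict. With multiprocessing every worker
# process gets its own copy; the counts stay consistent because create_dict()
# is only called from the save_batch() process, which also writes dict.txt.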


def parse(path):
    """
    Open a .gz file and yield one parsed JSON object per line.
    """
    sys.stderr.write(path)
    g = gzip.open(path, 'r')
    for l in g:
        yield json.loads(l)
    g.close()


def tokenize(sentences):
    """
    Use tokenizer.perl to tokenize input sentences.
    tokenizer.perl is a tool from Moses.
    sentences: a list of input sentences.
    return: a list of tokenized sentences.
    """
    tokenizer_path = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
    tokenizer_cmd = [tokenizer_path, '-l', 'en', '-q', '-']
    assert isinstance(sentences, list)
    text = "\n".join(sentences)
    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text)
    toks = tok_text.split('\n')[:-1]
    return toks
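
# Note (assumption): tokenizer.perl must exist and be executable at the path
# above, i.e. the Moses scripts unpacked under ./data/. Illustrative call:
# tokenize(["It's a good book."]) returns roughly ["It 's a good book ."],
# one tokenized string per input sentence.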


def save_data(instance, data_dir, pre_fix, batch_num):
    """
    Save one labeled batch to data_dir/<pre_fix>_<batch_num>.txt.
    """
    label = ['1' if pre_fix == 'pos' else '0' for i in range(len(instance))]
    lines = ['%s\t%s' % (label[i], instance[i]) for i in range(len(label))]
    file_name = os.path.join(data_dir, "%s_%s.txt" % (pre_fix, batch_num))
    with open(file_name, 'w') as f:
        f.write('\n'.join(lines) + '\n')
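
# Each output line is "<label>\t<tokenized review text>", where the label is
# 1 for positive batches and 0 for negative ones.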


def tokenize_batch(thread_id):
    """
    Tokenize batches pulled from parse_queue and push them onto tokenize_queue.
    """
    while True:
        num_batch, instance, pre_fix = parse_queue.get()
        if num_batch == -1:  # parse_queue finished
            tokenize_queue.put((-1, None, None))
            sys.stderr.write("tokenize thread %s finished\n" % thread_id)
            break
        tokenize_instance = tokenize(instance)
        tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
        sys.stderr.write('.')


def save_batch(data_dir, num_tokenize, data_dir_dict):
    """
    Save tokenized batches pulled from tokenize_queue and build dict.txt.
    """
    token_count = 0
    while True:
        num_batch, instance, pre_fix = tokenize_queue.get()
        if num_batch == -1:
            token_count += 1
            if token_count == num_tokenize:  # all tokenize workers finished
                break
            else:
                continue
        save_data(instance, data_dir, pre_fix, num_batch)
        create_dict(instance)  # update the word-count dict

    sys.stderr.write("save file finished\n")
    with open(data_dir_dict, 'w') as f:
        f.write('%s\t%s\n' % ('unk', '-1'))
        for k, v in sorted(
                word_count.items(), key=operator.itemgetter(1), reverse=True):
            f.write('%s\t%s\n' % (k, v))
    sys.stderr.write("build dict finished\n")


def parse_batch(data, num_tokenize):
    """
    Parse data by batch and feed the pipeline: parse -> tokenize -> save.
    """
    raw_txt = parse(data)
    neg, pos = [], []
    count = 0
    sys.stderr.write("extract raw data\n")
    for l in raw_txt:
        rating = l["overall"]
        text = l["reviewText"].lower()  # convert words to lower case
        if rating == 5.0 and text:
            pos.append(text)
        if rating < 3.0 and text:
            neg.append(text)
        if len(pos) == batch_size or len(neg) == batch_size:
            if len(pos) == batch_size:
                batch = pos
                pre_fix = 'pos'
            else:
                batch = neg
                pre_fix = 'neg'

            parse_queue.put((count, batch, pre_fix))
            count += 1
            if pre_fix == 'pos':
                pos = []
            else:
                neg = []

    if len(pos) > 0:
        parse_queue.put((count, pos, 'pos'))
        count += 1
    if len(neg) > 0:
        parse_queue.put((count, neg, 'neg'))
        count += 1
    for i in range(num_tokenize):
        parse_queue.put((-1, None, None))  # tell each tokenize worker that input is finished
    sys.stderr.write("parsing finished\n")


def option_parser():
    parser = OptionParser(usage="usage: python preprocess.py "
                          "-i data_path [options]")
    parser.add_option(
        "-i", "--data", action="store", dest="input", help="Input data path.")
    parser.add_option(
        "-s",
        "--seed",
        action="store",
        dest="seed",
        default=1024,
        help="Set random seed.")
    return parser.parse_args()


def main():
    reload(sys)
    sys.setdefaultencoding('utf-8')
    options, args = option_parser()
    data = options.input
    seed = options.seed  # parsed for reproducibility; currently unused below
    data_dir_dict = os.path.join(os.path.dirname(data), 'dict.txt')
    data_dir = os.path.join(os.path.dirname(data), 'tmp')
    # Assumption: the tmp output directory may not exist yet, so create it
    # before the workers try to write batch files into it.
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    pool = Pool(processes=num_tokenize + 2)
    pool.apply_async(parse_batch, args=(data, num_tokenize))
    for i in range(num_tokenize):
        pool.apply_async(tokenize_batch, args=(str(i), ))
    pool.apply_async(save_batch, args=(data_dir, num_tokenize, data_dir_dict))
    pool.close()
    pool.join()

    with open(os.path.join(os.path.dirname(data), 'labels.list'), 'w') as f:
        f.write('neg\t0\npos\t1\n')


if __name__ == '__main__':
    main()