Paddle/demo/quick_start/preprocess.py

# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Preprocess raw review data:
1. remove HTML before tokenizing
2. positive samples: rating score 5; negative samples: rating score 1-2.
3. size of pos : neg = 1:1.
4. size of the test set = min(25k, len(all_data) * 0.1); the rest is the training set.
5. the train set and test set are de-duplicated and disjoint.
Usage:
python preprocess.py -i data_file [-s seed]
'''
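
# Example invocation (the input path below is illustrative; any gzipped
# review file in the expected format works):
#   python preprocess.py -i ./data/reviews.json.gz -s 1024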
import sys
import os
import re
import operator
import gzip
import math
import random
import numpy as np
from bs4 import BeautifulSoup
from subprocess import Popen, PIPE
from optparse import OptionParser

def parse(path):
    """
    Open a .gz file and yield one parsed record per line.
    """
    g = gzip.open(path, 'r')
    for l in g:
        # Each line holds one review as a Python dict literal
        # (with keys such as "overall" and "reviewText").
        yield eval(l)
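
# Illustrative input line (field values are made up):
#   {"overall": 5.0, "reviewText": "Works great.", ...}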

def clean(review):
    """
    Clean an input review: remove HTML, convert words to lower case.
    """
    # Remove HTML
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Convert words to lower case
    review_text = review_text.lower()
    return review_text
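
# For example, clean('<b>Great phone!</b>') returns 'great phone!'.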

def tokenize(sentences):
    """
    Use tokenizer.perl to tokenize input sentences.
    tokenizer.perl is a tool from Moses.
    sentences: a list of input sentences.
    return: a list of processed text.
    """
    tokenizer_path = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
    tokenizer_cmd = [tokenizer_path, '-l', 'en', '-q', '-']
    assert isinstance(sentences, list)
    text = "\n".join(sentences)
    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text)
    # Drop the trailing empty string produced by the final newline.
    toks = tok_text.split('\n')[:-1]
    return toks
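
# The Moses tokenizer typically separates punctuation and clitics,
# e.g. (output may vary by tokenizer version):
#   tokenize(["it's great."]) -> ["it 's great ."]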

def create_dict(data, data_dir):
    """
    Create a word-count dictionary from data and save it to data_dir/dict.txt.
    The first line is unk \t -1.
    data: list, input data.
    data_dir: path to save the dict.
    """
    word_count = {}
    for seq in data:
        try:
            for w in seq.lower().split():
                if w not in word_count:
                    word_count[w] = 1
                else:
                    word_count[w] += 1
        except Exception:
            sys.stderr.write(seq + "\tERROR\n")
    f = open(os.path.join(data_dir, 'dict.txt'), 'w')
    f.write('%s\t%s\n' % ('unk', '-1'))
    # Write words sorted by frequency, most frequent first.
    for k, v in sorted(word_count.items(), key=operator.itemgetter(1),
                       reverse=True):
        f.write('%s\t%s\n' % (k, v))
    f.close()
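
# dict.txt layout: one "word<TAB>count" pair per line, sorted by count
# (the counts below are illustrative):
#   unk -1
#   the 120393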

def save_data(data, data_dir, prefix=""):
    """Save data to data_dir/<prefix>.txt and list its path in <prefix>.list."""
    file_name = os.path.join(data_dir, "%s.txt" % prefix)
    with open(file_name, 'w') as f:
        f.write('\n'.join(data) + '\n')
    with open(os.path.join(data_dir, prefix + '.list'), 'w') as f:
        f.write('%s\n' % file_name)

def split_data(raw_txt):
    """
    Extract positive and negative samples from the raw records.
    """
    pos = []
    neg = []
    count = 0
    sys.stderr.write("extract raw data")
    for l in raw_txt:
        rating = l["overall"]
        text = clean(l["reviewText"])
        if rating == 5.0 and text:
            pos.append(text)
        if rating < 3.0 and text:
            neg.append(text)
        count += 1
        # Print a progress dot every 20000 records.
        if count % 20000 == 0:
            sys.stderr.write(".")
    sys.stderr.write("\n")
    return pos, neg

def preprocess(pos_in, neg_in, data_dir, rand_seed):
    # tokenize
    sys.stderr.write("tokenize...\n")
    tmppos = tokenize(pos_in)
    tmpneg = tokenize(neg_in)
    cnt = len(tmppos) + len(tmpneg)
    # de-duplicate samples
    tmppos = list(set(tmppos))
    tmpneg = list(set(tmpneg))
    dup_cnt = cnt - len(tmppos) - len(tmpneg)
    sys.stderr.write("\ntotal size of data set: %d, duplicate data: %d\n" %
                     (cnt, dup_cnt))
    # keep the same number of positive and negative samples
    min_len = min(len(tmppos), len(tmpneg))
    tmppos = tmppos[0:min_len]
    tmpneg = tmpneg[0:min_len]
    # create dictionary
    sys.stderr.write("create dict with train and test data...\n")
    all_data = tmppos + tmpneg
    create_dict(all_data, data_dir)
    # label the samples: 1 for positive, 0 for negative
    sys.stderr.write("split data...\n")
    pos = ["1\t" + i for i in tmppos]
    neg = ["0\t" + i for i in tmpneg]
    random.seed(rand_seed)
    random.shuffle(pos)
    random.shuffle(neg)
    # split into test set and train set
    test_len = min(12500, int(min_len * 0.1))
    test = pos[0:test_len] + neg[0:test_len]
    train = pos[test_len:] + neg[test_len:]
    # save data
    sys.stderr.write("save data...\n")
    save_data(train, data_dir, prefix='train')
    save_data(test, data_dir, prefix='test')
    with open(os.path.join(data_dir, 'labels.list'), 'w') as f:
        f.write('neg\t0\npos\t1\n')
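
# After preprocess() runs, data_dir contains train.txt, test.txt,
# train.list, test.list, dict.txt and labels.list.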

def option_parser():
    parser = OptionParser(usage="usage: python preprocess.py "
                                "-i data_path [options]")
    parser.add_option("-i", "--data", action="store",
                      dest="input", help="Input data path.")
    parser.add_option("-s", "--seed", action="store",
                      dest="seed", default=1024,
                      help="Set random seed.")
    return parser.parse_args()

def main():
    # Python 2: make utf-8 the default encoding for implicit conversions.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    options, args = option_parser()
    data = options.input
    seed = options.seed
    data_dir = os.path.dirname(data)
    pos, neg = split_data(parse(data))
    preprocess(pos, neg, data_dir, seed)
    sys.stderr.write("Done.\n")

if __name__ == '__main__':
    main()