Paddle/python/paddle/v2/dataset/imdb.py

# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
IMDB dataset.

This module downloads IMDB dataset from
http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
of 25,000 highly polar movie reviews for training, and 25,000 for testing.
Besides, this module also provides API for building dictionary.
"""

import paddle.v2.dataset.common
import collections
import tarfile
import re
import string

__all__ = ['build_dict', 'train', 'test', 'convert']

URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
MD5 = '7c2ac02c03563afcf9b574c7e56c153a'


def tokenize(pattern):
    """
    Read files that match the given pattern.  Tokenize and yield each file.
    """

    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
                                                        MD5)) as tarf:
        # Note that we should use tarfile.next(), which does
        # sequential access of member files, other than
        # tarfile.extractfile, which does random access and might
        # destroy hard disks.
        tf = tarf.next()
        while tf != None:
            if bool(pattern.match(tf.name)):
                # newline and punctuations removal and ad-hoc tokenization.
                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
                    None, string.punctuation).lower().split()
            tf = tarf.next()


def build_dict(pattern, cutoff):
    """
    Build a word dictionary from the corpus. Keys of the dictionary are words,
    and values are zero-based IDs of these words.
    """
    word_freq = collections.defaultdict(int)
    for doc in tokenize(pattern):
        for word in doc:
            word_freq[word] += 1

    # Not sure if we should prune less-frequent words here.
    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())

    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*dictionary))
    word_idx = dict(zip(words, xrange(len(words))))
    word_idx['<unk>'] = len(words)
    return word_idx


def reader_creator(pos_pattern, neg_pattern, word_idx):
    UNK = word_idx['<unk>']
    INS = []

    def load(pattern, out, label):
        for doc in tokenize(pattern):
            out.append(([word_idx.get(w, UNK) for w in doc], label))

    load(pos_pattern, INS, 0)
    load(neg_pattern, INS, 1)

    def reader():
        for doc, label in INS:
            yield doc, label

    return reader


def train(word_idx):
    """
    IMDB training set creator.

    It returns a reader creator, each sample in the reader is an zero-based ID
    sequence and label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        re.compile("aclImdb/train/pos/.*\.txt$"),
        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)


def test(word_idx):
    """
    IMDB test set creator.

    It returns a reader creator, each sample in the reader is an zero-based ID
    sequence and label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Test reader creator
    :rtype: callable
    """
    return reader_creator(
        re.compile("aclImdb/test/pos/.*\.txt$"),
        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)


def word_dict(cutoff=150):
    """
    Build a word dictionary from the corpus.

    :return: Word dictionary
    :rtype: dict
    """
    return build_dict(
        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), cutoff)


def fetch():
    paddle.v2.dataset.common.download(URL, 'imdb', MD5)


def convert(path):
    """
    Converts dataset to recordio format
    """
    w = word_dict()
    paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
    paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
Add early draft of imdb.py and unit test 8 years ago			`# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`"""`
add doc for some v2/dataset 8 years ago			`IMDB dataset.`
Complete documentation for v2. 8 years ago
follow comments 8 years ago			`This module downloads IMDB dataset from`
			`http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set`
			`of 25,000 highly polar movie reviews for training, and 25,000 for testing.`
			`Besides, this module also provides API for building dictionary.`
Add early draft of imdb.py and unit test 8 years ago			`"""`
add copyright 8 years ago
Add imdb and unit test 8 years ago			`import paddle.v2.dataset.common`
Remove unecessary code to generate freq_dict. 8 years ago			`import collections`
Add imdb and unit test 8 years ago			`import tarfile`
			`import re`
			`import string`

fix bugs 8 years ago			`__all__ = ['build_dict', 'train', 'test', 'convert']`
Add imdb and unit test 8 years ago
			`URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'`
			`MD5 = '7c2ac02c03563afcf9b574c7e56c153a'`


			`def tokenize(pattern):`
add doc for some v2/dataset 8 years ago			`"""`
follow comments 8 years ago			`Read files that match the given pattern. Tokenize and yield each file.`
add doc for some v2/dataset 8 years ago			`"""`

Add imdb and unit test 8 years ago			`with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',`
			`MD5)) as tarf:`
			`# Note that we should use tarfile.next(), which does`
			`# sequential access of member files, other than`
			`# tarfile.extractfile, which does random access and might`
			`# destroy hard disks.`
			`tf = tarf.next()`
			`while tf != None:`
			`if bool(pattern.match(tf.name)):`
			`# newline and punctuations removal and ad-hoc tokenization.`
			`yield tarf.extractfile(tf).read().rstrip("\n\r").translate(`
			`None, string.punctuation).lower().split()`
			`tf = tarf.next()`


			`def build_dict(pattern, cutoff):`
add doc for some v2/dataset 8 years ago			`"""`
follow comments 8 years ago			`Build a word dictionary from the corpus. Keys of the dictionary are words,`
			`and values are zero-based IDs of these words.`
add doc for some v2/dataset 8 years ago			`"""`
Remove unecessary code to generate freq_dict. 8 years ago			`word_freq = collections.defaultdict(int)`
Add imdb and unit test 8 years ago			`for doc in tokenize(pattern):`
			`for word in doc:`
Remove unecessary code to generate freq_dict. 8 years ago			`word_freq[word] += 1`
Add imdb and unit test 8 years ago
			`# Not sure if we should prune less-frequent words here.`
			`word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())`

			`dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))`
			`words, _ = list(zip(*dictionary))`
			`word_idx = dict(zip(words, xrange(len(words))))`
			`word_idx['<unk>'] = len(words)`
			`return word_idx`


Remove the buffer size. 7 years ago			`def reader_creator(pos_pattern, neg_pattern, word_idx):`
Add imdb and unit test 8 years ago			`UNK = word_idx['<unk>']`
Speed data reader for imdb dataset. 7 years ago			`INS = []`
Add imdb and unit test 8 years ago
Speed data reader for imdb dataset. 7 years ago			`def load(pattern, out, label):`
Add imdb and unit test 8 years ago			`for doc in tokenize(pattern):`
Speed data reader for imdb dataset. 7 years ago			`out.append(([word_idx.get(w, UNK) for w in doc], label))`

			`load(pos_pattern, INS, 0)`
			`load(neg_pattern, INS, 1)`
Add imdb and unit test 8 years ago
			`def reader():`
Speed data reader for imdb dataset. 7 years ago			`for doc, label in INS:`
			`yield doc, label`
Add imdb and unit test 8 years ago
fix bug (#5233) 7 years ago			`return reader`
Add imdb and unit test 8 years ago

			`def train(word_idx):`
add doc for some v2/dataset 8 years ago			`"""`
fix typo error 8 years ago			`IMDB training set creator.`
add doc for some v2/dataset 8 years ago
follow comments 8 years ago			`It returns a reader creator, each sample in the reader is an zero-based ID`
add doc for some v2/dataset 8 years ago			`sequence and label in [0, 1].`

			`:param word_idx: word dictionary`
			`:type word_idx: dict`
fix typo error 8 years ago			`:return: Training reader creator`
add doc for some v2/dataset 8 years ago			`:rtype: callable`
			`"""`
Add imdb and unit test 8 years ago			`return reader_creator(`
			`re.compile("aclImdb/train/pos/.*\.txt$"),`
Remove the buffer size. 7 years ago			`re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)`
Add imdb and unit test 8 years ago

			`def test(word_idx):`
add doc for some v2/dataset 8 years ago			`"""`
			`IMDB test set creator.`

follow comments 8 years ago			`It returns a reader creator, each sample in the reader is an zero-based ID`
add doc for some v2/dataset 8 years ago			`sequence and label in [0, 1].`

			`:param word_idx: word dictionary`
			`:type word_idx: dict`
			`:return: Test reader creator`
			`:rtype: callable`
			`"""`
Add imdb and unit test 8 years ago			`return reader_creator(`
			`re.compile("aclImdb/test/pos/.*\.txt$"),`
Remove the buffer size. 7 years ago			`re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)`
Use reader in dataset imdb.py 8 years ago

Add cutoff parameter to build_dict. 7 years ago			`def word_dict(cutoff=150):`
add doc for some v2/dataset 8 years ago			`"""`
follow comments 8 years ago			`Build a word dictionary from the corpus.`
add doc for some v2/dataset 8 years ago
			`:return: Word dictionary`
			`:rtype: dict`
			`"""`
Use reader in dataset imdb.py 8 years ago			`return build_dict(`
Add cutoff parameter to build_dict. 7 years ago			`re.compile("aclImdb/((train)\|(test))/((pos)\|(neg))/.*\.txt$"), cutoff)`
add download api for dataset 8 years ago

rename fetch_all to fetch; add fetch_all function 8 years ago			`def fetch():`
add download api for dataset 8 years ago			`paddle.v2.dataset.common.download(URL, 'imdb', MD5)`
add convert function 8 years ago

fix bugs 8 years ago			`def convert(path):`
add convert function 8 years ago			`"""`
			`Converts dataset to recordio format`
			`"""`
fix bugs 8 years ago			`w = word_dict()`
convert dataset into recordio format 8 years ago			`paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")`
			`paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")`