parent 991b582efc
commit f837eee724
@@ -0,0 +1,46 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Dataset package.
"""

import mnist
import imikolov
import imdb
import cifar
import movielens
import conll05
import uci_housing
import sentiment
import wmt14
import wmt16
import mq2007
import flowers
import voc2012

__all__ = [
    'mnist',
    'imikolov',
    'imdb',
    'cifar',
    'movielens',
    'conll05',
    'sentiment',
    'uci_housing',
    'wmt14',
    'wmt16',
    'mq2007',
    'flowers',
    'voc2012',
]
@@ -0,0 +1,139 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CIFAR dataset.

This module will download dataset from
https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
paddle reader creators.

The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
with 6000 images per class. There are 50000 training images and 10000 test
images.

The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
containing 600 images each. There are 500 training images and 100 testing
images per class.

"""

import cPickle
import itertools
import numpy
import paddle.v2.dataset.common
import tarfile

__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']

URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'


def reader_creator(filename, sub_name):
    def read_batch(batch):
        data = batch['data']
        labels = batch.get('labels', batch.get('fine_labels', None))
        assert labels is not None
        for sample, label in itertools.izip(data, labels):
            yield (sample / 255.0).astype(numpy.float32), int(label)

    def reader():
        with tarfile.open(filename, mode='r') as f:
            names = (each_item.name for each_item in f
                     if sub_name in each_item.name)

            for name in names:
                batch = cPickle.load(f.extractfile(name))
                for item in read_batch(batch):
                    yield item

    return reader


def train100():
    """
    CIFAR-100 training set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 99].

    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
        'train')


def test100():
    """
    CIFAR-100 test set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 99].

    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
        'test')


def train10():
    """
    CIFAR-10 training set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].

    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
        'data_batch')


def test10():
    """
    CIFAR-10 test set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].

    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
        'test_batch')


def fetch():
    paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
    paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)


def convert(path):
    """
    Converts dataset to recordio format
    """
    paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
    paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
    paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
    paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
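A minimal usage sketch of the readers above, assuming the paddle.v2 package is installed (the first call downloads and caches the CIFAR-10 tarball under ~/.cache/paddle/dataset):

# Every function above returns a reader creator: a zero-argument callable
# that, when called, yields (image, label) tuples.
import paddle.v2.dataset.cifar as cifar

reader = cifar.train10()
for image, label in reader():
    print image.shape, label   # image: (3072,) float32 in [0, 1]; label: 0-9
    break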
@@ -0,0 +1,236 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
import hashlib
import os
import errno
import shutil
import sys
import importlib
import paddle.v2.dataset
import cPickle
import glob
import cPickle as pickle

__all__ = [
    'DATA_HOME',
    'download',
    'md5file',
    'split',
    'cluster_files_reader',
    'convert',
]

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')


# When running unit tests, there could be multiple processes trying to create
# the DATA_HOME directory simultaneously, so we cannot use an if condition to
# check for the existence of the directory; instead, we use the filesystem as
# the synchronization mechanism by catching the returned errors.
def must_mkdirs(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise


must_mkdirs(DATA_HOME)


def md5file(fname):
    hash_md5 = hashlib.md5()
    f = open(fname, "rb")
    for chunk in iter(lambda: f.read(4096), b""):
        hash_md5.update(chunk)
    f.close()
    return hash_md5.hexdigest()


def download(url, module_name, md5sum, save_name=None):
    dirname = os.path.join(DATA_HOME, module_name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    filename = os.path.join(dirname,
                            url.split('/')[-1]
                            if save_name is None else save_name)

    retry = 0
    retry_limit = 3
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
        if os.path.exists(filename):
            print "file md5", md5file(filename), md5sum
        if retry < retry_limit:
            retry += 1
        else:
            raise RuntimeError("Cannot download {0} within retry limit {1}".
                               format(url, retry_limit))
        print "Cache file %s not found, downloading %s" % (filename, url)
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')

        if total_length is None:
            with open(filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
        else:
            with open(filename, 'wb') as f:
                dl = 0
                total_length = int(total_length)
                for data in r.iter_content(chunk_size=4096):
                    dl += len(data)
                    f.write(data)
                    done = int(50 * dl / total_length)
                    sys.stdout.write("\r[%s%s]" % ('=' * done,
                                                   ' ' * (50 - done)))
                    sys.stdout.flush()

    return filename


def fetch_all():
    for module_name in filter(lambda x: not x.startswith("__"),
                              dir(paddle.v2.dataset)):
        if "fetch" in dir(
                importlib.import_module("paddle.v2.dataset.%s" % module_name)):
            getattr(
                importlib.import_module("paddle.v2.dataset.%s" % module_name),
                "fetch")()


def fetch_all_recordio(path):
    for module_name in filter(lambda x: not x.startswith("__"),
                              dir(paddle.v2.dataset)):
        if "convert" in dir(
                importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \
                not module_name == "common":
            ds_path = os.path.join(path, module_name)
            must_mkdirs(ds_path)
            getattr(
                importlib.import_module("paddle.v2.dataset.%s" % module_name),
                "convert")(ds_path)


def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
    """
    You can call this function as follows:

        split(paddle.v2.dataset.cifar.train10(), line_count=1000,
              suffix="imikolov-train-%05d.pickle")

    The output files will be:

        |-imikolov-train-00000.pickle
        |-imikolov-train-00001.pickle
        |- ...
        |-imikolov-train-00480.pickle

    :param reader: a reader creator
    :param line_count: line count for each file
    :param suffix: the suffix for the output files; it should contain "%d",
        which will be replaced by the id of each file. Default is
        "%05d.pickle".
    :param dumper: a callable that dumps an object to a file; it will be
        called as dumper(obj, f), where obj is the object to be dumped and f
        is a file object. Default is cPickle.dump.
    """
    if not callable(dumper):
        raise TypeError("dumper should be callable.")
    lines = []
    indx_f = 0
    for i, d in enumerate(reader()):
        lines.append(d)
        if i >= line_count and i % line_count == 0:
            with open(suffix % indx_f, "w") as f:
                dumper(lines, f)
                lines = []
                indx_f += 1
    if lines:
        with open(suffix % indx_f, "w") as f:
            dumper(lines, f)


def cluster_files_reader(files_pattern,
                         trainer_count,
                         trainer_id,
                         loader=cPickle.load):
    """
    Create a reader that yields elements from the given files, selecting a
    file subset according to the trainer count and trainer_id.

    :param files_pattern: the files generated by split(...)
    :param trainer_count: total trainer count
    :param trainer_id: the trainer rank id
    :param loader: a callable that loads an object from a file; it will be
        called as loader(f), where f is a file object. Default is
        cPickle.load.
    """

    def reader():
        if not callable(loader):
            raise TypeError("loader should be callable.")
        file_list = glob.glob(files_pattern)
        file_list.sort()
        my_file_list = []
        for idx, fn in enumerate(file_list):
            if idx % trainer_count == trainer_id:
                print "append file: %s" % fn
                my_file_list.append(fn)
        for fn in my_file_list:
            with open(fn, "r") as f:
                lines = loader(f)
                for line in lines:
                    yield line

    return reader


def convert(output_path, reader, line_count, name_prefix):
    """
    Convert data from reader to recordio format files.

    :param output_path: directory in which output files will be saved.
    :param reader: a data reader, from which the convert program will read
        data instances.
    :param line_count: the number of instances written to each output file.
    :param name_prefix: the name prefix of generated files.
    """
    import recordio

    assert line_count >= 1
    indx_f = 0

    def write_data(indx_f, lines):
        filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
        writer = recordio.writer(filename)
        for l in lines:
            # FIXME(Yancey1989):
            # dumps with protocol: pickle.HIGHEST_PROTOCOL
            writer.write(cPickle.dumps(l))
        writer.close()

    lines = []
    for i, d in enumerate(reader()):
        lines.append(d)
        if i % line_count == 0 and i >= line_count:
            write_data(indx_f, lines)
            lines = []
            indx_f += 1
            continue

    write_data(indx_f, lines)
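A sketch of how split() and cluster_files_reader() pair up in a multi-trainer job, assuming paddle.v2 is installed (the shard names here are hypothetical):

import paddle.v2.dataset.common as common
import paddle.v2.dataset.uci_housing as uci_housing

# Shard the training data into pickle files of 100 samples each.
common.split(uci_housing.train(), 100, suffix="uci-train-%05d.pickle")

# Trainer 0 of 4 then reads only every 4th shard (files sorted by name).
reader = common.cluster_files_reader(
    "uci-train-*.pickle", trainer_count=4, trainer_id=0)
for sample in reader():
    pass  # samples exactly as yielded by the original reader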
File diff suppressed because it is too large
@@ -0,0 +1,199 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module will download dataset from
http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
and parse train/test set into paddle reader creators.

This set contains images of flowers belonging to 102 different categories.
The images were acquired by searching the web and taking pictures. There are a
minimum of 40 images for each category.

The database was used in:

Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
number of classes. Proceedings of the Indian Conference on Computer Vision,
Graphics and Image Processing (2008)
http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.

"""
import cPickle
import itertools
import functools
from common import download
import tarfile
import scipy.io as scio
from paddle.v2.image import *
from paddle.v2.reader import *
import os
import numpy as np
from multiprocessing import cpu_count
__all__ = ['train', 'test', 'valid']

DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
# In the official 'readme', tstid is the flag of the test data and trnid is
# the flag of the train data. But there is more test data than train data,
# so we exchange the train data and the test data.
TRAIN_FLAG = 'tstid'
TEST_FLAG = 'trnid'
VALID_FLAG = 'valid'


def default_mapper(is_train, sample):
    '''
    map image bytes data to type needed by model input layer
    '''
    img, label = sample
    img = load_image_bytes(img)
    img = simple_transform(
        img, 256, 224, is_train, mean=[103.94, 116.78, 123.68])
    return img.flatten().astype('float32'), label


train_mapper = functools.partial(default_mapper, True)
test_mapper = functools.partial(default_mapper, False)


def reader_creator(data_file,
                   label_file,
                   setid_file,
                   dataset_name,
                   mapper,
                   buffered_size=1024,
                   use_xmap=True):
    '''
    1. read images from tar file and
       merge images into batch files in 102flowers.tgz_batch/
    2. get a reader to read sample from batch file

    :param data_file: downloaded data file
    :type data_file: string
    :param label_file: downloaded label file
    :type label_file: string
    :param setid_file: downloaded setid file containing information
                       about how to split dataset
    :type setid_file: string
    :param dataset_name: data set name (tstid|trnid|valid)
    :type dataset_name: string
    :param mapper: a function to map image bytes data to type
                   needed by model input layer
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
    :return: data reader
    :rtype: callable
    '''
    labels = scio.loadmat(label_file)['labels'][0]
    indexes = scio.loadmat(setid_file)[dataset_name][0]
    img2label = {}
    for i in indexes:
        img = "jpg/image_%05d.jpg" % i
        img2label[img] = labels[i - 1]
    file_list = batch_images_from_tar(data_file, dataset_name, img2label)

    def reader():
        for file in open(file_list):
            file = file.strip()
            batch = None
            with open(file, 'r') as f:
                batch = cPickle.load(f)
            data = batch['data']
            for sample, label in itertools.izip(data, batch['label']):
                yield sample, int(label) - 1

    if use_xmap:
        return xmap_readers(mapper, reader, cpu_count(), buffered_size)
    else:
        return map_readers(mapper, reader)


def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
    '''
    Create flowers training set reader.
    It returns a reader, each sample in the reader is
    image pixels and a label in [0, 101],
    translated from the original color image by steps:
    1. resize to 256*256
    2. random crop to 224*224
    3. flatten
    :param mapper: a function to map sample.
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
    :return: train data reader
    :rtype: callable
    '''
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
        download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper,
        buffered_size, use_xmap)


def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
    '''
    Create flowers test set reader.
    It returns a reader, each sample in the reader is
    image pixels and a label in [0, 101],
    translated from the original color image by steps:
    1. resize to 256*256
    2. random crop to 224*224
    3. flatten
    :param mapper: a function to map sample.
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
    :return: test data reader
    :rtype: callable
    '''
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
        download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper,
        buffered_size, use_xmap)


def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
    '''
    Create flowers validation set reader.
    It returns a reader, each sample in the reader is
    image pixels and a label in [0, 101],
    translated from the original color image by steps:
    1. resize to 256*256
    2. random crop to 224*224
    3. flatten
    :param mapper: a function to map sample.
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
    :return: validation data reader
    :rtype: callable
    '''
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
        download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
        buffered_size, use_xmap)


def fetch():
    download(DATA_URL, 'flowers', DATA_MD5)
    download(LABEL_URL, 'flowers', LABEL_MD5)
    download(SETID_URL, 'flowers', SETID_MD5)
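A usage sketch, assuming paddle.v2, scipy and an image backend (PIL) are installed; the first call downloads the large image tarball. use_xmap=False keeps the example single-process:

import paddle.v2.dataset.flowers as flowers

reader = flowers.train(use_xmap=False)
for image, label in reader():
    # image: flattened 3x224x224 float32 array; label: int in [0, 101]
    print image.shape, label
    break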
@@ -0,0 +1,148 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
IMDB dataset.

This module downloads the IMDB dataset from
http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
of 25,000 highly polar movie reviews for training, and 25,000 for testing.
It also provides an API for building a word dictionary.
"""

import paddle.v2.dataset.common
import collections
import tarfile
import re
import string

__all__ = ['build_dict', 'train', 'test', 'convert']

URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
MD5 = '7c2ac02c03563afcf9b574c7e56c153a'


def tokenize(pattern):
    """
    Read files that match the given pattern. Tokenize and yield each file.
    """

    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
                                                        MD5)) as tarf:
        # Note that we should use tarfile.next(), which does
        # sequential access of member files, rather than
        # tarfile.extractfile, which does random access and might
        # destroy hard disks.
        tf = tarf.next()
        while tf is not None:
            if bool(pattern.match(tf.name)):
                # newline and punctuations removal and ad-hoc tokenization.
                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
                    None, string.punctuation).lower().split()
            tf = tarf.next()


def build_dict(pattern, cutoff):
    """
    Build a word dictionary from the corpus. Keys of the dictionary are words,
    and values are zero-based IDs of these words.
    """
    word_freq = collections.defaultdict(int)
    for doc in tokenize(pattern):
        for word in doc:
            word_freq[word] += 1

    # Not sure if we should prune less-frequent words here.
    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())

    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*dictionary))
    word_idx = dict(zip(words, xrange(len(words))))
    word_idx['<unk>'] = len(words)
    return word_idx


def reader_creator(pos_pattern, neg_pattern, word_idx):
    UNK = word_idx['<unk>']
    INS = []

    def load(pattern, out, label):
        for doc in tokenize(pattern):
            out.append(([word_idx.get(w, UNK) for w in doc], label))

    load(pos_pattern, INS, 0)
    load(neg_pattern, INS, 1)

    def reader():
        for doc, label in INS:
            yield doc, label

    return reader


def train(word_idx):
    """
    IMDB training set creator.

    It returns a reader creator, each sample in the reader is a zero-based ID
    sequence and label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        re.compile("aclImdb/train/pos/.*\.txt$"),
        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)


def test(word_idx):
    """
    IMDB test set creator.

    It returns a reader creator, each sample in the reader is a zero-based ID
    sequence and label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Test reader creator
    :rtype: callable
    """
    return reader_creator(
        re.compile("aclImdb/test/pos/.*\.txt$"),
        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)


def word_dict():
    """
    Build a word dictionary from the corpus.

    :return: Word dictionary
    :rtype: dict
    """
    return build_dict(
        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)


def fetch():
    paddle.v2.dataset.common.download(URL, 'imdb', MD5)


def convert(path):
    """
    Converts dataset to recordio format
    """
    w = word_dict()
    paddle.v2.dataset.common.convert(path, train(w), 1000, "imdb_train")
    paddle.v2.dataset.common.convert(path, test(w), 1000, "imdb_test")
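A usage sketch, assuming paddle.v2 is installed (building the dictionary scans the whole tarball, so the first call takes a while):

import paddle.v2.dataset.imdb as imdb

w = imdb.word_dict()   # word -> id, built with a frequency cutoff of 150
for doc, label in imdb.train(w)():
    # doc: list of word ids; label: 0 for pos/ files, 1 for neg/ files,
    # following the load() calls in reader_creator above.
    print len(doc), label
    break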
@@ -0,0 +1,161 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
imikolov's simple dataset.

This module will download dataset from
http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
into paddle reader creators.
"""
import paddle.v2.dataset.common
import collections
import tarfile

__all__ = ['train', 'test', 'build_dict', 'convert']

URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
MD5 = '30177ea32e27c525793142b6bf2c8e2d'


class DataType(object):
    NGRAM = 1
    SEQ = 2


def word_count(f, word_freq=None):
    if word_freq is None:
        word_freq = collections.defaultdict(int)

    for l in f:
        for w in l.strip().split():
            word_freq[w] += 1
        word_freq['<s>'] += 1
        word_freq['<e>'] += 1

    return word_freq


def build_dict(min_word_freq=50):
    """
    Build a word dictionary from the corpus. Keys of the dictionary are words,
    and values are zero-based IDs of these words.
    """
    train_filename = './simple-examples/data/ptb.train.txt'
    test_filename = './simple-examples/data/ptb.valid.txt'
    with tarfile.open(
            paddle.v2.dataset.common.download(
                paddle.v2.dataset.imikolov.URL, 'imikolov',
                paddle.v2.dataset.imikolov.MD5)) as tf:
        trainf = tf.extractfile(train_filename)
        testf = tf.extractfile(test_filename)
        word_freq = word_count(testf, word_count(trainf))
        if '<unk>' in word_freq:
            # remove <unk> for now, since we will set it as last index
            del word_freq['<unk>']

        word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())

        word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
        words, _ = list(zip(*word_freq_sorted))
        word_idx = dict(zip(words, xrange(len(words))))
        word_idx['<unk>'] = len(words)

    return word_idx


def reader_creator(filename, word_idx, n, data_type):
    def reader():
        with tarfile.open(
                paddle.v2.dataset.common.download(
                    paddle.v2.dataset.imikolov.URL, 'imikolov',
                    paddle.v2.dataset.imikolov.MD5)) as tf:
            f = tf.extractfile(filename)

            UNK = word_idx['<unk>']
            for l in f:
                if DataType.NGRAM == data_type:
                    assert n > -1, 'Invalid gram length'
                    l = ['<s>'] + l.strip().split() + ['<e>']
                    if len(l) >= n:
                        l = [word_idx.get(w, UNK) for w in l]
                        for i in range(n, len(l) + 1):
                            yield tuple(l[i - n:i])
                elif DataType.SEQ == data_type:
                    l = l.strip().split()
                    l = [word_idx.get(w, UNK) for w in l]
                    src_seq = [word_idx['<s>']] + l
                    trg_seq = l + [word_idx['<e>']]
                    if n > 0 and len(src_seq) > n: continue
                    yield src_seq, trg_seq
                else:
                    assert False, 'Unknown data type'

    return reader


def train(word_idx, n, data_type=DataType.NGRAM):
    """
    imikolov training set creator.

    It returns a reader creator, each sample in the reader is a word ID
    tuple.

    :param word_idx: word dictionary
    :type word_idx: dict
    :param n: sliding window size if type is ngram, otherwise max length of sequence
    :type n: int
    :param data_type: data type (ngram or sequence)
    :type data_type: member variable of DataType (NGRAM or SEQ)
    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n,
                          data_type)


def test(word_idx, n, data_type=DataType.NGRAM):
    """
    imikolov test set creator.

    It returns a reader creator, each sample in the reader is a word ID
    tuple.

    :param word_idx: word dictionary
    :type word_idx: dict
    :param n: sliding window size if type is ngram, otherwise max length of sequence
    :type n: int
    :param data_type: data type (ngram or sequence)
    :type data_type: member variable of DataType (NGRAM or SEQ)
    :return: Test reader creator
    :rtype: callable
    """
    return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n,
                          data_type)


def fetch():
    paddle.v2.dataset.common.download(URL, "imikolov", MD5)


def convert(path):
    """
    Converts dataset to recordio format
    """
    N = 5
    word_dict = build_dict()
    paddle.v2.dataset.common.convert(path,
                                     train(word_dict, N), 1000,
                                     "imikolov_train")
    paddle.v2.dataset.common.convert(path,
                                     test(word_dict, N), 1000, "imikolov_test")
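A sketch of the two reading modes selected by DataType above, assuming paddle.v2 is installed (the first call downloads the PTB archive):

import paddle.v2.dataset.imikolov as imikolov

word_dict = imikolov.build_dict()   # word -> id; '<unk>' gets the last id

# NGRAM mode: each sample is a sliding window of n word ids.
for sample in imikolov.train(word_dict, 5)():
    print sample                    # a 5-tuple of word ids
    break

# SEQ mode: each sample is a (source, target) pair shifted by one token:
# '<s>' prepended to the source, '<e>' appended to the target.
seq_reader = imikolov.train(
    word_dict, n=-1, data_type=imikolov.DataType.SEQ)
for src_seq, trg_seq in seq_reader():
    print len(src_seq), len(trg_seq)   # equal lengths
    break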
@@ -0,0 +1,123 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
MNIST dataset.

This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
parse training set and test set into paddle reader creators.
"""
import paddle.v2.dataset.common
import subprocess
import numpy
import platform
__all__ = ['train', 'test', 'convert']

URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'


def reader_creator(image_filename, label_filename, buffer_size):
    def reader():
        if platform.system() == 'Darwin':
            zcat_cmd = 'gzcat'
        elif platform.system() == 'Linux':
            zcat_cmd = 'zcat'
        else:
            raise NotImplementedError()

        # According to http://stackoverflow.com/a/38061619/724872, we
        # cannot use standard package gzip here.
        m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
        m.stdout.read(16)  # skip some magic bytes

        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
        l.stdout.read(8)  # skip some magic bytes

        try:  # the loop below may be exited early by a break.
            while True:
                labels = numpy.fromfile(
                    l.stdout, 'ubyte', count=buffer_size).astype("int")

                if labels.size != buffer_size:
                    break  # numpy.fromfile returns empty slice after EOF.

                images = numpy.fromfile(
                    m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
                        (buffer_size, 28 * 28)).astype('float32')

                images = images / 255.0 * 2.0 - 1.0

                for i in xrange(buffer_size):
                    yield images[i, :], int(labels[i])
        finally:
            m.terminate()
            l.terminate()

    return reader


def train():
    """
    MNIST training set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [-1, 1] and label in [0, 9].

    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
                                          TRAIN_IMAGE_MD5),
        paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
                                          TRAIN_LABEL_MD5), 100)


def test():
    """
    MNIST test set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [-1, 1] and label in [0, 9].

    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist',
                                          TEST_IMAGE_MD5),
        paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
                                          TEST_LABEL_MD5), 100)


def fetch():
    paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
    paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
    paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)


def convert(path):
    """
    Converts dataset to recordio format
    """
    paddle.v2.dataset.common.convert(path, train(), 1000, "mnist_train")
    paddle.v2.dataset.common.convert(path, test(), 1000, "mnist_test")
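A usage sketch, assuming paddle.v2 is installed and zcat/gzcat is on the PATH (the reader above shells out to it):

import paddle.v2.dataset.mnist as mnist

for image, label in mnist.train()():
    print image.shape, label    # image: (784,) float32 scaled to [-1, 1]
    break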
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,141 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script fetches and preprocesses the movie_reviews data set provided by
NLTK.

TODO(yuyang18): Complete dataset.
"""

import collections
from itertools import chain

import nltk
from nltk.corpus import movie_reviews

import paddle.v2.dataset.common

__all__ = ['train', 'test', 'get_word_dict', 'convert']
NUM_TRAINING_INSTANCES = 1600
NUM_TOTAL_INSTANCES = 2000


def download_data_if_not_yet():
    """
    Download the data set if it has not been downloaded yet.
    """
    try:
        # make sure that nltk can find the data
        if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path:
            nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME)
        movie_reviews.categories()
    except LookupError:
        print "Downloading movie_reviews data set, please wait....."
        nltk.download(
            'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
        print "Download data set success....."
        print "Path is " + nltk.data.find('corpora/movie_reviews').path


def get_word_dict():
    """
    Sort words by their frequency of occurrence in the samples.
    :return:
        words_freq_sorted
    """
    words_freq_sorted = list()
    word_freq_dict = collections.defaultdict(int)
    download_data_if_not_yet()

    for category in movie_reviews.categories():
        for field in movie_reviews.fileids(category):
            for words in movie_reviews.words(field):
                word_freq_dict[words] += 1
    words_sort_list = word_freq_dict.items()
    words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
    for index, word in enumerate(words_sort_list):
        words_freq_sorted.append((word[0], index))
    return words_freq_sorted


def sort_files():
    """
    Sort the sample files so that negative and positive samples can be read
    alternately.
    :return:
        files_list
    """
    files_list = list()
    neg_file_list = movie_reviews.fileids('neg')
    pos_file_list = movie_reviews.fileids('pos')
    files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
    return files_list


def load_sentiment_data():
    """
    Load the data set.
    :return:
        data_set
    """
    data_set = list()
    download_data_if_not_yet()
    words_ids = dict(get_word_dict())
    for sample_file in sort_files():
        words_list = list()
        category = 0 if 'neg' in sample_file else 1
        for word in movie_reviews.words(sample_file):
            words_list.append(words_ids[word.lower()])
        data_set.append((words_list, category))
    return data_set


def reader_creator(data):
    """
    Reader creator: generate an iterator over the given data set.
    :param data:
        train data set or test data set
    """
    for each in data:
        yield each[0], each[1]


def train():
    """
    Default training set reader creator.
    """
    data_set = load_sentiment_data()
    return reader_creator(data_set[0:NUM_TRAINING_INSTANCES])


def test():
    """
    Default test set reader creator.
    """
    data_set = load_sentiment_data()
    return reader_creator(data_set[NUM_TRAINING_INSTANCES:])


def fetch():
    nltk.download(
        'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)


def convert(path):
    """
    Converts dataset to recordio format
    """
    paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train")
    paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test")
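Note that, unlike the other modules, reader_creator() here is itself a generator function, so train() and test() return iterables directly instead of zero-argument reader creators. A usage sketch, assuming paddle.v2 and nltk are installed (the first call downloads the NLTK movie_reviews corpus):

import paddle.v2.dataset.sentiment as sentiment

for word_ids, category in sentiment.train():
    print len(word_ids), category    # category: 0 for 'neg', 1 for 'pos'
    break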
@@ -0,0 +1,56 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.v2.dataset.cifar
import unittest


class TestCIFAR(unittest.TestCase):
    def check_reader(self, reader):
        sum = 0
        label = 0
        for l in reader():
            self.assertEqual(l[0].size, 3072)
            if l[1] > label:
                label = l[1]
            sum += 1
        return sum, label

    def test_test10(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.cifar.test10())
        self.assertEqual(instances, 10000)
        self.assertEqual(max_label_value, 9)

    def test_train10(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.cifar.train10())
        self.assertEqual(instances, 50000)
        self.assertEqual(max_label_value, 9)

    def test_test100(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.cifar.test100())
        self.assertEqual(instances, 10000)
        self.assertEqual(max_label_value, 99)

    def test_train100(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.cifar.train100())
        self.assertEqual(instances, 50000)
        self.assertEqual(max_label_value, 99)


if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,94 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.v2.dataset.common
import unittest
import tempfile
import glob
import recordio


class TestCommon(unittest.TestCase):
    def test_md5file(self):
        _, temp_path = tempfile.mkstemp()
        with open(temp_path, 'w') as f:
            f.write("Hello\n")
        self.assertEqual('09f7e02f1290be211da707a266f153b3',
                         paddle.v2.dataset.common.md5file(temp_path))

    def test_download(self):
        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
        self.assertEqual(
            paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
            paddle.v2.dataset.common.download(
                yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))

    def test_split(self):
        def test_reader():
            def reader():
                for x in xrange(10):
                    yield x

            return reader

        temp_path = tempfile.mkdtemp()
        paddle.v2.dataset.common.split(
            test_reader(), 4, suffix=temp_path + '/test-%05d.pickle')
        files = glob.glob(temp_path + '/test-*.pickle')
        self.assertEqual(len(files), 3)

    def test_cluster_file_reader(self):
        temp_path = tempfile.mkdtemp()
        for x in xrange(5):
            with open(temp_path + '/%05d.test' % x, 'w') as f:
                f.write('%d\n' % x)
        reader = paddle.v2.dataset.common.cluster_files_reader(
            temp_path + '/*.test', 5, 0)
        for idx, e in enumerate(reader()):
            self.assertEqual(e, str("0"))

    def test_convert(self):
        record_num = 10
        num_shards = 4

        def test_reader():
            def reader():
                for x in xrange(record_num):
                    yield x

            return reader

        path = tempfile.mkdtemp()
        paddle.v2.dataset.common.convert(path,
                                         test_reader(), num_shards,
                                         'random_images')

        files = glob.glob(path + '/random_images-*')
        self.assertEqual(len(files), num_shards)

        recs = []
        for i in range(0, num_shards):
            n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
            r = recordio.reader(n)
            while True:
                d = r.read()
                if d is None:
                    break
                recs.append(d)

        recs.sort()
        self.assertEqual(len(recs), record_num)


if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,51 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.v2.dataset.flowers
import unittest


class TestFlowers(unittest.TestCase):
    def check_reader(self, reader):
        sum = 0
        label = 0
        size = 224 * 224 * 3
        for l in reader():
            self.assertEqual(l[0].size, size)
            if l[1] > label:
                label = l[1]
            sum += 1
        return sum, label

    def test_train(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.flowers.train())
        self.assertEqual(instances, 6149)
        self.assertEqual(max_label_value, 101)

    def test_test(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.flowers.test())
        self.assertEqual(instances, 1020)
        self.assertEqual(max_label_value, 101)

    def test_valid(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.flowers.valid())
        self.assertEqual(instances, 1020)
        self.assertEqual(max_label_value, 101)


if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,57 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.v2.dataset.imdb
import unittest
import re

TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")

TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")


class TestIMDB(unittest.TestCase):
    word_idx = None

    def test_build_dict(self):
        if self.word_idx is None:
            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
                                                              150)

        self.assertEqual(len(self.word_idx), 7036)

    def check_dataset(self, dataset, expected_size):
        if self.word_idx is None:
            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
                                                              150)

        sum = 0
        for l in dataset(self.word_idx)():
            self.assertEqual(l[1], sum % 2)
            sum += 1
        self.assertEqual(sum, expected_size)

    def test_train(self):
        self.check_dataset(paddle.v2.dataset.imdb.train, 25000)

    def test_test(self):
        self.check_dataset(paddle.v2.dataset.imdb.test, 25000)


if __name__ == '__main__':
    unittest.main()
@ -0,0 +1,67 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.v2.dataset.imikolov
import unittest

WORD_DICT = paddle.v2.dataset.imikolov.build_dict()


class TestMikolov(unittest.TestCase):
    def check_reader(self, reader, n):
        for l in reader():
            self.assertEqual(len(l), n)

    def test_train(self):
        n = 5
        self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n)

        first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\
            'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\
            'rake regatta rubens sim snack-food ssangyong swapo wachter'
        first_line = [
            WORD_DICT.get(w, WORD_DICT['<unk>'])
            for w in first_line.split(' ')
        ]
        for l in paddle.v2.dataset.imikolov.train(
                WORD_DICT, n=-1,
                data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
            # Drop the leading start mark before comparing.
            read_line = l[0][1:]
            break
        self.assertEqual(first_line, read_line)

    def test_test(self):
        n = 5
        self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n)

        first_line = 'consumers may want to move their telephones a little '\
            'closer to the tv set'
        first_line = [
            WORD_DICT.get(w, WORD_DICT['<unk>'])
            for w in first_line.split(' ')
        ]
        for l in paddle.v2.dataset.imikolov.test(
                WORD_DICT, n=-1,
                data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
            read_line = l[0][1:]
            break
        self.assertEqual(first_line, read_line)

    def test_total(self):
        # Word indices should run contiguously from 0 to len(WORD_DICT) - 1.
        _, idx = zip(*WORD_DICT.items())
        self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)


if __name__ == '__main__':
    unittest.main()
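As a quick illustration of the reader contract tested above: with the default NGRAM data type each sample is an n-tuple of word indices, while DataType.SEQ yields whole sentences wrapped in start/end marks (hence the l[0][1:] slice). A minimal sketch:

import paddle.v2.dataset.imikolov as imikolov

word_dict = imikolov.build_dict()
for sample in imikolov.train(word_dict, 5)():
    assert len(sample) == 5  # one 5-gram of word indices
    break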
@ -0,0 +1,44 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.v2.dataset.mnist
import unittest


class TestMNIST(unittest.TestCase):
    def check_reader(self, reader):
        # Count samples and track the largest label seen.
        count = 0
        label = 0
        for l in reader():
            self.assertEqual(l[0].size, 784)  # flattened 28x28 image
            if l[1] > label:
                label = l[1]
            count += 1
        return count, label

    def test_train(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.mnist.train())
        self.assertEqual(instances, 60000)
        self.assertEqual(max_label_value, 9)

    def test_test(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.mnist.test())
        self.assertEqual(instances, 10000)
        self.assertEqual(max_label_value, 9)


if __name__ == '__main__':
    unittest.main()
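The same reader can be consumed directly; a minimal sketch (each sample is a 784-float image vector plus a digit label, as asserted above):

import paddle.v2.dataset.mnist as mnist

for image, label in mnist.train()():
    assert image.size == 784 and 0 <= label <= 9
    break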
@ -0,0 +1,33 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.v2.dataset.mq2007
import unittest


class TestMQ2007(unittest.TestCase):
    def test_pairwise(self):
        for label, query_left, query_right in paddle.v2.dataset.mq2007.test(
                format="pairwise"):
            self.assertEqual(query_left.shape(), (46, ))
            self.assertEqual(query_right.shape(), (46, ))

    def test_listwise(self):
        for label_array, query_array in paddle.v2.dataset.mq2007.test(
                format="listwise"):
            self.assertEqual(len(label_array), len(query_array))


if __name__ == "__main__":
    unittest.main()
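For context, MQ2007 is a learning-to-rank dataset: in "pairwise" format each sample is (label, query_left, query_right) with 46 ranking features per document, and in "listwise" format it is (label_array, query_array). A minimal sketch mirroring the calls above:

import paddle.v2.dataset.mq2007 as mq2007

for label, left, right in mq2007.test(format="pairwise"):
    # `left` and `right` each carry a 46-dimensional feature vector.
    break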
@ -0,0 +1,55 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import nltk
import paddle.v2.dataset.sentiment as st
from nltk.corpus import movie_reviews


class TestSentimentMethods(unittest.TestCase):
    def test_get_word_dict(self):
        word_dict = st.get_word_dict()[0:10]
        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
                          (u'is', 8), (u'in', 9)]
        for idx, each in enumerate(word_dict):
            self.assertEqual(each, test_word_list[idx])
        self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)

    def test_sort_files(self):
        # sort_files() should interleave the two polarity classes, so no
        # two consecutive files share a label.
        last_label = ''
        for sample_file in st.sort_files():
            current_label = sample_file.split("/")[0]
            self.assertNotEqual(current_label, last_label)
            last_label = current_label

    def test_data_set(self):
        data_set = st.load_sentiment_data()
        last_label = -1
        for each in st.test():
            self.assertNotEqual(each[1], last_label)
            last_label = each[1]
        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
        self.assertEqual(
            len(list(st.test())),
            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))


if __name__ == '__main__':
    unittest.main()
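Usage follows the same shape as the assertions above; a minimal sketch, assuming samples are (word-index list, polarity label) pairs and that the NLTK movie_reviews corpus is fetched on first use:

import paddle.v2.dataset.sentiment as st

for ids, label in st.train():  # labels alternate between the two classes
    break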
@ -0,0 +1,42 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.v2.dataset.voc2012
import unittest


class TestVOC(unittest.TestCase):
    def check_reader(self, reader):
        count = 0
        for l in reader():
            # An RGB image is exactly three times the size of its
            # one-channel segmentation label.
            self.assertEqual(l[0].size, 3 * l[1].size)
            count += 1
        return count

    def test_train(self):
        count = self.check_reader(paddle.v2.dataset.voc2012.train())
        self.assertEqual(count, 2913)

    def test_test(self):
        count = self.check_reader(paddle.v2.dataset.voc2012.test())
        self.assertEqual(count, 1464)

    def test_val(self):
        count = self.check_reader(paddle.v2.dataset.voc2012.val())
        self.assertEqual(count, 1449)


if __name__ == '__main__':
    unittest.main()
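A minimal consumption sketch (assuming each sample pairs an RGB image with a single-channel segmentation mask, which is what the 3:1 size assertion above encodes):

import paddle.v2.dataset.voc2012 as voc2012

for image, mask in voc2012.train()():
    assert image.size == 3 * mask.size
    break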
@ -0,0 +1,66 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.v2.dataset.wmt16
import unittest


class TestWMT16(unittest.TestCase):
    def checkout_one_sample(self, sample):
        # A training sample has three fields: source language word
        # indices, target language word indices, and target next-word
        # indices.
        self.assertEqual(len(sample), 3)

        # Check the start mark and end mark in the source word indices.
        self.assertEqual(sample[0][0], 0)
        self.assertEqual(sample[0][-1], 1)

        # Check the start mark in the target word indices.
        self.assertEqual(sample[1][0], 0)

        # Check the end mark in the target next-word indices.
        self.assertEqual(sample[2][-1], 1)

    def test_train(self):
        for idx, sample in enumerate(
                paddle.v2.dataset.wmt16.train(
                    src_dict_size=100000, trg_dict_size=100000)()):
            if idx >= 10: break
            self.checkout_one_sample(sample)

    def test_test(self):
        for idx, sample in enumerate(
                paddle.v2.dataset.wmt16.test(
                    src_dict_size=1000, trg_dict_size=1000)()):
            if idx >= 10: break
            self.checkout_one_sample(sample)

    def test_val(self):
        for idx, sample in enumerate(
                paddle.v2.dataset.wmt16.validation(
                    src_dict_size=1000, trg_dict_size=1000)()):
            if idx >= 10: break
            self.checkout_one_sample(sample)

    def test_get_dict(self):
        dict_size = 1000
        word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True)
        self.assertEqual(len(word_dict), dict_size)
        self.assertEqual(word_dict[0], "<s>")
        self.assertEqual(word_dict[1], "<e>")
        self.assertEqual(word_dict[2], "<unk>")


if __name__ == "__main__":
    unittest.main()
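A minimal sketch of the reader contract these tests pin down (index 0 is <s>, index 1 is <e>, index 2 is <unk>, per test_get_dict):

import paddle.v2.dataset.wmt16 as wmt16

for src, trg, trg_next in wmt16.train(src_dict_size=1000,
                                      trg_dict_size=1000)():
    assert src[0] == 0 and src[-1] == 1  # <s> ... <e>
    break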
@ -0,0 +1,134 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
UCI Housing dataset.

This module will download the dataset from
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
parse the training set and test set into paddle reader creators.
"""

import numpy as np
import os
import paddle.v2.dataset.common
from paddle.v2.parameters import Parameters

__all__ = ['train', 'test']

URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
MD5 = 'd4accdce7a25600298819f8e28e8d593'
# The 13 input attributes plus the target column (median home value).
feature_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]

UCI_TRAIN_DATA = None
UCI_TEST_DATA = None
URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar'
MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b'

def feature_range(maximums, minimums):
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    feature_num = len(maximums)
    ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
    ax.set_title('feature scale')
    plt.xticks(range(feature_num), feature_names)
    plt.xlim([-1, feature_num])
    fig.set_figheight(6)
    fig.set_figwidth(10)
    if not os.path.exists('./image'):
        os.makedirs('./image')
    fig.savefig('image/ranges.png', dpi=48)
    plt.close(fig)


def load_data(filename, feature_num=14, ratio=0.8):
    global UCI_TRAIN_DATA, UCI_TEST_DATA
    if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None:
        return

    data = np.fromfile(filename, sep=' ')
    data = data.reshape(data.shape[0] // feature_num, feature_num)
    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
        axis=0) / data.shape[0]
    feature_range(maximums[:-1], minimums[:-1])
    # Rescale each input feature by centering on its mean and dividing by
    # its range; the last column (the price) is left untouched.
    for i in xrange(feature_num - 1):
        data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
    offset = int(data.shape[0] * ratio)
    UCI_TRAIN_DATA = data[:offset]
    UCI_TEST_DATA = data[offset:]
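To make the rescaling step concrete, here is a small numeric sketch with a hypothetical feature column (not taken from the dataset):

import numpy as np

col = np.array([1.0, 2.0, 4.0, 5.0])  # hypothetical feature values
print((col - col.mean()) / (col.max() - col.min()))
# -> [-0.5  -0.25  0.25  0.5 ]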

def train():
    """
    UCI_HOUSING training set creator.

    It returns a reader creator; each sample in the reader is a
    (features, price) pair: the 13 normalized features and the house
    price as a length-1 array.

    :return: Training reader creator
    :rtype: callable
    """
    global UCI_TRAIN_DATA
    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))

    def reader():
        for d in UCI_TRAIN_DATA:
            yield d[:-1], d[-1:]

    return reader


def test():
    """
    UCI_HOUSING test set creator.

    It returns a reader creator; each sample in the reader is a
    (features, price) pair: the 13 normalized features and the house
    price as a length-1 array.

    :return: Test reader creator
    :rtype: callable
    """
    global UCI_TEST_DATA
    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))

    def reader():
        for d in UCI_TEST_DATA:
            yield d[:-1], d[-1:]

    return reader


def model():
    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
                                                 MD5_MODEL)
    with open(tar_file, 'r') as f:
        parameters = Parameters.from_tar(f)
    return parameters


def fetch():
    paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)


def convert(path):
    """
    Converts dataset to recordio format.
    """
    paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train")
    paddle.v2.dataset.common.convert(path, test(), 1000, "uci_housing_test")
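Finally, a minimal sketch of consuming the readers defined above (13 normalized features plus a length-1 price array per sample):

import paddle.v2.dataset.uci_housing as uci_housing

for features, price in uci_housing.train()():
    assert len(features) == 13 and len(price) == 1
    break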