add doc for some v2/dataset

9 years ago · 67d4d89cc4
parent 9f417f129d
commit 67d4d89cc4
13 changed files with 197 additions and 41 deletions
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
@ -49,7 +49,6 @@ mnist
    :members:
    :noindex:

-
 cifar
 +++++

@ -61,7 +60,7 @@ conll05
 +++++++

 ..  automodule:: paddle.v2.dataset.conll05
-    :members:
+    :members: get_dict,get_embedding,test
    :noindex:

 imdb
@ -79,12 +78,18 @@ imikolov
    :noindex:

 movielens
-+++++++++
++++++++++

 ..  automodule:: paddle.v2.dataset.movielens
    :members:
    :noindex:

+..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
+    :noindex:
+    
+..  autoclass:: paddle.v2.dataset.movielens.UserInfo
+    :noindex:
+
 sentiment
 +++++++++

@ -102,7 +107,7 @@ uci_housing
 wmt14
 +++++

-..  automodule:: paddle.v2.dataset.uci_housing
+..  automodule:: paddle.v2.dataset.wmt14
    :members:
    :noindex:

--- a/doc/api/v2/run_logic.rst
+++ b/doc/api/v2/run_logic.rst
@ -13,25 +13,18 @@ Trainer
 =======

 ..  automodule:: paddle.v2.trainer
-    :members: Trainer
+    :members: SGD
    :noindex:

 Event
 =====

 ..  automodule:: paddle.v2.event
-    :members: Event
+    :members: 
    :noindex:

 Inference
 =========

-..  automodule:: paddle.v2.inference
-    :members: Inference
-    :noindex:
-
 ..  autofunction:: paddle.v2.infer
-    :members:
-    :noindex:
-
-
+    :noindex:
--- a/python/paddle/v2/data_feeder.py
+++ b/python/paddle/v2/data_feeder.py
@ -52,7 +52,7 @@ class DataFeeder(DataProviderConverter):
        #                       [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ],  # first sample
        #                       [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ]   # second sample
        #                     ]
-        arg = feeder(minibatch_data)
+        arg = feeder.convert(minibatch_data)

    ..  note::

--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@ -15,7 +15,7 @@
 CIFAR dataset.

 This module will download dataset from https://www.cs.toronto.edu/~kriz/cifar.html and
-parse train set and test set into paddle reader creators.
+parse train/test set into paddle reader creators.

 The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 
 images per class. There are 50000 training images and 10000 test images.
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@ -12,12 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Conll 2005 dataset.  Paddle semantic role labeling Book and demo use this
-dataset as an example. Because Conll 2005 is not free in public, the default
-downloaded URL is test set of Conll 2005 (which is public). Users can change
-URL and MD5 to their Conll dataset.
-
-TODO(yuyang18): Complete comments.
+Conll05 dataset.
+Paddle semantic role labeling Book and demo use this dataset as an example. Because
+Conll05 is not freely available to the public, the default download URL is the test set of
+Conll05 (which is public). Users can change URL and MD5 to their Conll dataset.
+In addition, a pre-trained word vector model based on the Wikipedia corpus is used to initialize the SRL model.
 """

 import tarfile
@ -180,6 +179,9 @@ def reader_creator(corpus_reader,


 def get_dict():
+    """
+    Get the word, verb and label dictionaries of Wikipedia corpus.
+    """
    word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
    verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
    label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
@ -187,10 +189,23 @@ def get_dict():


 def get_embedding():
+    """
+    Get the trained word vector based on Wikipedia corpus.
+    """
    return download(EMB_URL, 'conll05st', EMB_MD5)


 def test():
+    """
+    Conll05 test set creator.
+
+    Because the train dataset is not free, the test dataset is used for training.
+    It returns a reader creator. Each sample in the reader has nine features, including sentence
+    sequence, predicate, predicate context, predicate context flag and tagged sequence.
+
+    :return: Train reader creator
+    :rtype: callable
+    """
    word_dict, verb_dict, label_dict = get_dict()
    reader = corpus_reader(
        download(DATA_URL, 'conll05st', DATA_MD5),
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@ -12,9 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
+IMDB dataset.

-TODO(yuyang18): Complete comments.
+This module downloads the IMDB dataset from
+http://ai.stanford.edu/%7Eamaas/data/sentiment/, which contains a set of 25,000
+highly polar movie reviews for training, and 25,000 for testing. Besides, this
+module also provides APIs for building a dictionary and parsing the train set and
+test set into paddle reader creators.
 """

 import paddle.v2.dataset.common
@ -30,8 +34,11 @@ URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
 MD5 = '7c2ac02c03563afcf9b574c7e56c153a'


-# Read files that match pattern.  Tokenize and yield each file.
 def tokenize(pattern):
+    """
+    Read files that match pattern.  Tokenize and yield each file.
+    """
+
    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
                                                        MD5)) as tarf:
        # Note that we should use tarfile.next(), which does
@ -48,6 +55,9 @@ def tokenize(pattern):


 def build_dict(pattern, cutoff):
+    """
+    Build a word dictionary. The key is the word, and the value is its index.
+    """
    word_freq = {}
    for doc in tokenize(pattern):
        for word in doc:
@ -109,18 +119,46 @@ def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):


 def train(word_idx):
+    """
+    IMDB train set creator.
+
+    It returns a reader creator, each sample in the reader is an index 
+    sequence and label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Train reader creator
+    :rtype: callable
+    """
    return reader_creator(
        re.compile("aclImdb/train/pos/.*\.txt$"),
        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000)


 def test(word_idx):
+    """
+    IMDB test set creator.
+
+    It returns a reader creator, each sample in the reader is an index 
+    sequence and label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Test reader creator
+    :rtype: callable
+    """
    return reader_creator(
        re.compile("aclImdb/test/pos/.*\.txt$"),
        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)


 def word_dict():
+    """
+    Build word dictionary.
+
+    :return: Word dictionary
+    :rtype: dict
+    """
    return build_dict(
        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)

--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
+imikolov's simple dataset.

-Complete comments.
+This module will download dataset from http://www.fit.vutbr.cz/~imikolov/rnnlm/ and
+parse train/test set into paddle reader creators.
 """
 import paddle.v2.dataset.common
 import tarfile
@ -40,6 +41,9 @@ def word_count(f, word_freq=None):


 def build_dict():
+    """
+    Build a word dictionary. The key is the word, and the value is its index.
+    """
    train_filename = './simple-examples/data/ptb.train.txt'
    test_filename = './simple-examples/data/ptb.valid.txt'
    with tarfile.open(
@ -84,10 +88,36 @@ def reader_creator(filename, word_idx, n):


 def train(word_idx, n):
+    """
+    imikolov train set creator.
+
+    It returns a reader creator, each sample in the reader is an index 
+    tuple.
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size
+    :type n: int
+    :return: Train reader creator
+    :rtype: callable
+    """
    return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n)


 def test(word_idx, n):
+    """
+    imikolov test set creator.
+
+    It returns a reader creator, each sample in the reader is an index 
+    tuple.
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size
+    :type n: int
+    :return: Test reader creator
+    :rtype: callable
+    """
    return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)


--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@ -15,7 +15,7 @@
 MNIST dataset.

 This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
-parse train set and test set into paddle reader creators.
+parse train/test set into paddle reader creators.
 """
 import paddle.v2.dataset.common
 import subprocess
--- a/python/paddle/v2/dataset/movielens.py
+++ b/python/paddle/v2/dataset/movielens.py
@ -14,11 +14,11 @@
 """
 Movielens 1-M dataset.

-GroupLens Research collected and made available rating data sets from the 
-MovieLens web site (http://movielens.org). Movielens 1-M dataset contains 1 million 
-ratings from 6000 users on 4000 movies. 
+Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 movies, which was 
+collected by GroupLens Research. This module will download Movielens 1-M dataset from 
+http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse train/test set 
+into paddle reader creators.

-TODO(yuyang18): Complete comments.
 """

 import zipfile
@ -39,12 +39,18 @@ MD5 = 'c4d9eecfca2ab87c1945afe126590906'


 class MovieInfo(object):
+    """
+    Movie id, title and categories information are stored in MovieInfo.
+    """
    def __init__(self, index, categories, title):
        self.index = int(index)
        self.categories = categories
        self.title = title

    def value(self):
+        """
+        Get information of a movie.
+        """
        return [
            self.index, [CATEGORIES_DICT[c] for c in self.categories],
            [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
@ -59,6 +65,9 @@ class MovieInfo(object):


 class UserInfo(object):
+    """
+    User id, gender, age, and job information are stored in UserInfo.
+    """
    def __init__(self, index, gender, age, job_id):
        self.index = int(index)
        self.is_male = gender == 'M'
@ -66,6 +75,9 @@ class UserInfo(object):
        self.job_id = int(job_id)

    def value(self):
+        """
+        Get information of a user.
+        """
        return [self.index, 0 if self.is_male else 1, self.age, self.job_id]

    def __str__(self):
@ -152,6 +164,9 @@ test = functools.partial(__reader_creator__, is_test=True)


 def get_movie_title_dict():
+    """
+    Get movie title dictionary.
+    """
    __initialize_meta_info__()
    return MOVIE_TITLE_DICT

@ -164,11 +179,17 @@ def __max_index_info__(a, b):


 def max_movie_id():
+    """
+    Get the maximum value of movie id.
+    """
    __initialize_meta_info__()
    return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index


 def max_user_id():
+    """
+    Get the maximum value of user id.
+    """
    __initialize_meta_info__()
    return reduce(__max_index_info__, USER_INFO.viewvalues()).index

@ -181,21 +202,33 @@ def __max_job_id_impl__(a, b):


 def max_job_id():
+    """
+    Get the maximum value of job id.
+    """
    __initialize_meta_info__()
    return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id


 def movie_categories():
+    """
+    Get movie categories dictionary.
+    """
    __initialize_meta_info__()
    return CATEGORIES_DICT


 def user_info():
+    """
+    Get user info dictionary.
+    """
    __initialize_meta_info__()
    return USER_INFO


 def movie_info():
+    """
+    Get movie info dictionary.
+    """
    __initialize_meta_info__()
    return MOVIE_INFO

--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@ -14,7 +14,9 @@
 """
 UCI Housing dataset.

-TODO(yuyang18): Complete comments.
+This module will download dataset from 
+https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
+parse train/test set into paddle reader creators.
 """

 import numpy as np
@ -70,6 +72,15 @@ def load_data(filename, feature_num=14, ratio=0.8):


 def train():
+    """
+    UCI_HOUSING train set creator.
+
+    It returns a reader creator. Each sample in the reader consists of the
+    normalized features and the price.
+
+    :return: Train reader creator
+    :rtype: callable
+    """
    global UCI_TRAIN_DATA
    load_data(download(URL, 'uci_housing', MD5))

@ -81,6 +92,15 @@ def train():


 def test():
+    """
+    UCI_HOUSING test set creator.
+
+    It returns a reader creator. Each sample in the reader consists of the
+    normalized features and the price.
+
+    :return: Test reader creator
+    :rtype: callable
+    """
    global UCI_TEST_DATA
    load_data(download(URL, 'uci_housing', MD5))

--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-wmt14 dataset
+WMT14 dataset.
+The original WMT14 dataset is too large, so a small subset of the data is provided.
+This module will download dataset from 
+http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
+parse train/test set into paddle reader creators.
+
 """
 import tarfile

@ -94,11 +99,29 @@ def reader_creator(tar_file, file_name, dict_size):


 def train(dict_size):
+    """
+    WMT14 train set creator.
+
+    It returns a reader creator, each sample in the reader is source language word index 
+    sequence, target language word index sequence and next word index sequence.
+
+    :return: Train reader creator
+    :rtype: callable
+    """
    return reader_creator(
        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size)


 def test(dict_size):
+    """
+    WMT14 test set creator.
+
+    It returns a reader creator, each sample in the reader is source language word index 
+    sequence, target language word index sequence and next word index sequence.
+
+    :return: Test reader creator
+    :rtype: callable
+    """
    return reader_creator(
        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size)

--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
@ -1,14 +1,13 @@
 """
-All training events.
+Testing and training events.

 There are:

+* TestResult
 * BeginIteration
 * EndIteration
 * BeginPass
 * EndPass
-
-TODO(yuyang18): Complete it!
 """
 import py_paddle.swig_paddle as api

--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@ -1,3 +1,6 @@
+"""
+Trainer package
+"""
 import collections

 import py_paddle.swig_paddle as api
@ -9,10 +12,7 @@ from . import optimizer as v2_optimizer
 from . import parameters as v2_parameters

 __all__ = ['SGD']
-"""
-Trainer package
-TODO(yuyang18): Complete comments.
-"""
+


 def default_event_handler(event):