follow comments

8 years ago · 1e29b12425
parent 5dd7586e68
commit 1e29b12425
8 changed files with 55 additions and 45 deletions
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@ -14,14 +14,17 @@
 """
 CIFAR dataset.

-This module will download dataset from https://www.cs.toronto.edu/~kriz/cifar.html and
-parse train/test set into paddle reader creators.
+This module will download dataset from
+https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
+paddle reader creators.

-The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000
-images per class. There are 50000 training images and 10000 test images.
+The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
+with 6000 images per class. There are 50000 training images and 10000 test
+images.

-The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes containing
-600 images each. There are 500 training images and 100 testing images per class.
+The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
+containing 600 images each. There are 500 training images and 100 testing
+images per class.

 """

--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@ -13,10 +13,11 @@
 # limitations under the License.
 """
 Conll05 dataset.
-Paddle semantic role labeling Book and demo use this dataset as an example. Because
-Conll05 is not free in public, the default downloaded URL is test set of
-Conll05 (which is public). Users can change URL and MD5 to their Conll dataset.
-And a pre-trained word vector model based on Wikipedia corpus is used to initialize SRL model.
+Paddle semantic role labeling Book and demo use this dataset as an example.
+Because Conll05 is not free in public, the default downloaded URL is test set
+of Conll05 (which is public). Users can change URL and MD5 to their Conll
+dataset. And a pre-trained word vector model based on Wikipedia corpus is used
+to initialize SRL model.
 """

 import tarfile
@ -198,9 +199,10 @@ def test():
    """
    Conll05 test set creator.

-    Because the train dataset is not free, the test dataset is used for training.
-    It returns a reader creator, each sample in the reader is nine features, including sentence
-    sequence, predicate, predicate context, predicate context flag and tagged sequence.
+    Because the train dataset is not free, the test dataset is used for
+    training. It returns a reader creator, each sample in the reader is nine
+    features, including sentence sequence, predicate, predicate context,
+    predicate context flag and tagged sequence.

    :return: Train reader creator
    :rtype: callable
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@ -14,11 +14,10 @@
 """
 IMDB dataset.

-This module download IMDB dataset from
-http://ai.stanford.edu/%7Eamaas/data/sentiment/, which contains a set of 25,000
-highly polar movie reviews for training, and 25,000 for testing. Besides, this
-module also provides API for build dictionary and parse train set and test set
-into paddle reader creators.
+This module downloads IMDB dataset from
+http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
+of 25,000 highly polar movie reviews for training, and 25,000 for testing.
+Besides, this module also provides an API for building the dictionary.
 """

 import paddle.v2.dataset.common
@ -37,7 +36,7 @@ MD5 = '7c2ac02c03563afcf9b574c7e56c153a'

 def tokenize(pattern):
    """
-    Read files that match pattern.  Tokenize and yield each file.
+    Read files that match the given pattern.  Tokenize and yield each file.
    """

    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
@ -57,7 +56,8 @@ def tokenize(pattern):

 def build_dict(pattern, cutoff):
    """
-    Build a word dictionary, the key is word, and the value is index.
+    Build a word dictionary from the corpus. Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
    """
    word_freq = collections.defaultdict(int)
    for doc in tokenize(pattern):
@ -123,7 +123,7 @@ def train(word_idx):
    """
    IMDB train set creator.

-    It returns a reader creator, each sample in the reader is an index
+    It returns a reader creator, each sample in the reader is a zero-based ID
    sequence and label in [0, 1].

    :param word_idx: word dictionary
@ -140,7 +140,7 @@ def test(word_idx):
    """
    IMDB test set creator.

-    It returns a reader creator, each sample in the reader is an index
+    It returns a reader creator, each sample in the reader is a zero-based ID
    sequence and label in [0, 1].

    :param word_idx: word dictionary
@ -155,7 +155,7 @@ def test(word_idx):

 def word_dict():
    """
-    Build word dictionary.
+    Build a word dictionary from the corpus.

    :return: Word dictionary
    :rtype: dict
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@ -14,8 +14,9 @@
 """
 imikolov's simple dataset.

-This module will download dataset from http://www.fit.vutbr.cz/~imikolov/rnnlm/ and
-parse train/test set into paddle reader creators.
+This module will download dataset from 
+http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse train/test set into paddle
+reader creators.
 """
 import paddle.v2.dataset.common
 import collections
@ -42,7 +43,8 @@ def word_count(f, word_freq=None):

 def build_dict():
    """
-    Build a word dictionary, the key is word, and the value is index.
+    Build a word dictionary from the corpus. Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
    """
    train_filename = './simple-examples/data/ptb.train.txt'
    test_filename = './simple-examples/data/ptb.valid.txt'
@ -91,7 +93,7 @@ def train(word_idx, n):
    """
    imikolov train set creator.

-    It returns a reader creator, each sample in the reader is an index
+    It returns a reader creator, each sample in the reader is a word ID
    tuple.

    :param word_idx: word dictionary
@ -108,7 +110,7 @@ def test(word_idx, n):
    """
    imikolov test set creator.

-    It returns a reader creator, each sample in the reader is an index
+    It returns a reader creator, each sample in the reader is a word ID
    tuple.

    :param word_idx: word dictionary
--- a/python/paddle/v2/dataset/movielens.py
+++ b/python/paddle/v2/dataset/movielens.py
@ -14,10 +14,11 @@
 """
 Movielens 1-M dataset.

-Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 movies, which was
-collected by GroupLens Research. This module will download Movielens 1-M dataset from
-http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse train/test set
-into paddle reader creators.
+Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
+movies, which was collected by GroupLens Research. This module will download
+Movielens 1-M dataset from 
+http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse train/test
+set into paddle reader creators.

 """

@ -50,7 +51,7 @@ class MovieInfo(object):

    def value(self):
        """
-        Get information of a movie.
+        Get information from a movie.
        """
        return [
            self.index, [CATEGORIES_DICT[c] for c in self.categories],
@ -78,7 +79,7 @@ class UserInfo(object):

    def value(self):
        """
-        Get information of a user.
+        Get information from a user.
        """
        return [self.index, 0 if self.is_male else 1, self.age, self.job_id]

--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@ -75,8 +75,8 @@ def train():
    """
    UCI_HOUSING train set creator.

-    It returns a reader creator, each sample in the reader is features after normalization
-    and price number.
+    It returns a reader creator, each sample in the reader is features after
+    normalization and price number.

    :return: Train reader creator
    :rtype: callable
@ -95,8 +95,8 @@ def test():
    """
    UCI_HOUSING test set creator.

-    It returns a reader creator, each sample in the reader is features after normalization
-    and price number.
+    It returns a reader creator, each sample in the reader is features after
+    normalization and price number.

    :return: Test reader creator
    :rtype: callable
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@ -13,8 +13,8 @@
 # limitations under the License.
 """
 WMT14 dataset.
-The original WMT14 dataset is too large and a small set of data for set is provided.
-This module will download dataset from
+The original WMT14 dataset is too large, so a small subset of the data is
+provided. This module will download dataset from
 http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
 parse train/test set into paddle reader creators.

@ -107,8 +107,9 @@ def train(dict_size):
    """
    WMT14 train set creator.

-    It returns a reader creator, each sample in the reader is source language word index
-    sequence, target language word index sequence and next word index sequence.
+    It returns a reader creator, each sample in the reader is source language
+    word ID sequence, target language word ID sequence and next word ID
+    sequence.

    :return: Train reader creator
    :rtype: callable
@ -121,8 +122,9 @@ def test(dict_size):
    """
    WMT14 test set creator.

-    It returns a reader creator, each sample in the reader is source language word index
-    sequence, target language word index sequence and next word index sequence.
+    It returns a reader creator, each sample in the reader is source language
+    word ID sequence, target language word ID sequence and next word ID
+    sequence.

    :return: Test reader creator
    :rtype: callable
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@ -1,5 +1,5 @@
 """
-Trainer package
+Module Trainer
 """
 import collections