From d1ab3c80744a851164dd4dc76a847193eb4c5562 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 23 Feb 2017 19:20:33 +0800 Subject: [PATCH 1/5] MNIST dataset reader implementation --- python/paddle/v2/data_set/__init__.py | 0 python/paddle/v2/data_set/mnist.py | 62 +++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 python/paddle/v2/data_set/__init__.py create mode 100644 python/paddle/v2/data_set/mnist.py diff --git a/python/paddle/v2/data_set/__init__.py b/python/paddle/v2/data_set/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/paddle/v2/data_set/mnist.py b/python/paddle/v2/data_set/mnist.py new file mode 100644 index 0000000000..34f61bb9f6 --- /dev/null +++ b/python/paddle/v2/data_set/mnist.py @@ -0,0 +1,62 @@ +import sklearn.datasets.mldata +import sklearn.model_selection +import numpy + +__all__ = ['MNISTReader', 'train_reader_creator', 'test_reader_creator'] + +DATA_HOME = None + + +def __mnist_reader__(data, target): + n_samples = data.shape[0] + for i in xrange(n_samples): + yield data[i].astype(numpy.float32), int(target[i]) + + +class MNISTReader(object): + """ + mnist dataset reader. The `train_reader` and `test_reader` method returns + a iterator of each sample. Each sample is combined by 784-dim float and a + one-dim label + """ + + def __init__(self, random_state): + data = sklearn.datasets.mldata.fetch_mldata( + "MNIST original", data_home=DATA_HOME) + n_train = 60000 + self.X_train, self.X_test, self.y_train, self.y_test = sklearn.model_selection.train_test_split( + data.data / 255.0, + data.target.astype("int"), + train_size=n_train, + random_state=random_state) + + def train_reader(self): + return __mnist_reader__(self.X_train, self.y_train) + + def test_reader(self): + return __mnist_reader__(self.X_test, self.y_test) + + +__default_instance__ = MNISTReader(0) + + +def train_reader_creator(): + """ + Default train set reader creator. + """ + return __default_instance__.train_reader + + +def test_reader_creator(): + """ + Default test set reader creator. + """ + return __default_instance__.test_reader + + +def unittest(): + assert len(list(train_reader_creator()())) == 60000 + + +if __name__ == '__main__': + unittest() From 38a792f20ed9e65d2920ded6ad42a5b68f2146ee Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 24 Feb 2017 13:52:31 +0800 Subject: [PATCH 2/5] Clean mnist code --- python/paddle/v2/data_set/config.py | 8 ++++ python/paddle/v2/data_set/mnist.py | 58 +++++++++++++---------------- 2 files changed, 33 insertions(+), 33 deletions(-) create mode 100644 python/paddle/v2/data_set/config.py diff --git a/python/paddle/v2/data_set/config.py b/python/paddle/v2/data_set/config.py new file mode 100644 index 0000000000..69e96d65ef --- /dev/null +++ b/python/paddle/v2/data_set/config.py @@ -0,0 +1,8 @@ +import os + +__all__ = ['DATA_HOME'] + +DATA_HOME = os.path.expanduser('~/.cache/paddle_data_set') + +if not os.path.exists(DATA_HOME): + os.makedirs(DATA_HOME) diff --git a/python/paddle/v2/data_set/mnist.py b/python/paddle/v2/data_set/mnist.py index 34f61bb9f6..6f35acf683 100644 --- a/python/paddle/v2/data_set/mnist.py +++ b/python/paddle/v2/data_set/mnist.py @@ -1,61 +1,53 @@ import sklearn.datasets.mldata import sklearn.model_selection import numpy +from config import DATA_HOME -__all__ = ['MNISTReader', 'train_reader_creator', 'test_reader_creator'] +__all__ = ['MNIST', 'train_creator', 'test_creator'] -DATA_HOME = None +def __mnist_reader_creator__(data, target): + def reader(): + n_samples = data.shape[0] + for i in xrange(n_samples): + yield (data[i] / 255.0).astype(numpy.float32), int(target[i]) -def __mnist_reader__(data, target): - n_samples = data.shape[0] - for i in xrange(n_samples): - yield data[i].astype(numpy.float32), int(target[i]) + return reader -class MNISTReader(object): +class MNIST(object): """ mnist dataset reader. The `train_reader` and `test_reader` method returns a iterator of each sample. Each sample is combined by 784-dim float and a one-dim label """ - def __init__(self, random_state): + def __init__(self, random_state=0, test_size=10000, **options): data = sklearn.datasets.mldata.fetch_mldata( "MNIST original", data_home=DATA_HOME) - n_train = 60000 self.X_train, self.X_test, self.y_train, self.y_test = sklearn.model_selection.train_test_split( - data.data / 255.0, - data.target.astype("int"), - train_size=n_train, - random_state=random_state) + data.data, + data.target, + test_size=test_size, + random_state=random_state, + **options) - def train_reader(self): - return __mnist_reader__(self.X_train, self.y_train) + def train_creator(self): + return __mnist_reader_creator__(self.X_train, self.y_train) - def test_reader(self): - return __mnist_reader__(self.X_test, self.y_test) + def test_creator(self): + return __mnist_reader_creator__(self.X_test, self.y_test) -__default_instance__ = MNISTReader(0) - - -def train_reader_creator(): - """ - Default train set reader creator. - """ - return __default_instance__.train_reader - - -def test_reader_creator(): - """ - Default test set reader creator. - """ - return __default_instance__.test_reader +__default_instance__ = MNIST() +train_creator = __default_instance__.train_creator +test_creator = __default_instance__.test_creator def unittest(): - assert len(list(train_reader_creator()())) == 60000 + size = 12045 + mnist = MNIST(test_size=size) + assert len(list(mnist.test_creator()())) == size if __name__ == '__main__': From ef9041c07bdf5d5f86b0b5b12045b4cec3719953 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 23 Feb 2017 19:20:33 +0800 Subject: [PATCH 3/5] MNIST dataset reader implementation --- python/paddle/v2/data_set/__init__.py | 0 python/paddle/v2/data_set/mnist.py | 62 +++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 python/paddle/v2/data_set/__init__.py create mode 100644 python/paddle/v2/data_set/mnist.py diff --git a/python/paddle/v2/data_set/__init__.py b/python/paddle/v2/data_set/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/paddle/v2/data_set/mnist.py b/python/paddle/v2/data_set/mnist.py new file mode 100644 index 0000000000..34f61bb9f6 --- /dev/null +++ b/python/paddle/v2/data_set/mnist.py @@ -0,0 +1,62 @@ +import sklearn.datasets.mldata +import sklearn.model_selection +import numpy + +__all__ = ['MNISTReader', 'train_reader_creator', 'test_reader_creator'] + +DATA_HOME = None + + +def __mnist_reader__(data, target): + n_samples = data.shape[0] + for i in xrange(n_samples): + yield data[i].astype(numpy.float32), int(target[i]) + + +class MNISTReader(object): + """ + mnist dataset reader. The `train_reader` and `test_reader` method returns + a iterator of each sample. Each sample is combined by 784-dim float and a + one-dim label + """ + + def __init__(self, random_state): + data = sklearn.datasets.mldata.fetch_mldata( + "MNIST original", data_home=DATA_HOME) + n_train = 60000 + self.X_train, self.X_test, self.y_train, self.y_test = sklearn.model_selection.train_test_split( + data.data / 255.0, + data.target.astype("int"), + train_size=n_train, + random_state=random_state) + + def train_reader(self): + return __mnist_reader__(self.X_train, self.y_train) + + def test_reader(self): + return __mnist_reader__(self.X_test, self.y_test) + + +__default_instance__ = MNISTReader(0) + + +def train_reader_creator(): + """ + Default train set reader creator. + """ + return __default_instance__.train_reader + + +def test_reader_creator(): + """ + Default test set reader creator. + """ + return __default_instance__.test_reader + + +def unittest(): + assert len(list(train_reader_creator()())) == 60000 + + +if __name__ == '__main__': + unittest() From befc3e066b633ae2a9e0c448037a93ede6de4ddf Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 24 Feb 2017 13:52:31 +0800 Subject: [PATCH 4/5] Clean mnist code --- python/paddle/v2/data_set/config.py | 8 ++++ python/paddle/v2/data_set/mnist.py | 58 +++++++++++++---------------- 2 files changed, 33 insertions(+), 33 deletions(-) create mode 100644 python/paddle/v2/data_set/config.py diff --git a/python/paddle/v2/data_set/config.py b/python/paddle/v2/data_set/config.py new file mode 100644 index 0000000000..69e96d65ef --- /dev/null +++ b/python/paddle/v2/data_set/config.py @@ -0,0 +1,8 @@ +import os + +__all__ = ['DATA_HOME'] + +DATA_HOME = os.path.expanduser('~/.cache/paddle_data_set') + +if not os.path.exists(DATA_HOME): + os.makedirs(DATA_HOME) diff --git a/python/paddle/v2/data_set/mnist.py b/python/paddle/v2/data_set/mnist.py index 34f61bb9f6..6f35acf683 100644 --- a/python/paddle/v2/data_set/mnist.py +++ b/python/paddle/v2/data_set/mnist.py @@ -1,61 +1,53 @@ import sklearn.datasets.mldata import sklearn.model_selection import numpy +from config import DATA_HOME -__all__ = ['MNISTReader', 'train_reader_creator', 'test_reader_creator'] +__all__ = ['MNIST', 'train_creator', 'test_creator'] -DATA_HOME = None +def __mnist_reader_creator__(data, target): + def reader(): + n_samples = data.shape[0] + for i in xrange(n_samples): + yield (data[i] / 255.0).astype(numpy.float32), int(target[i]) -def __mnist_reader__(data, target): - n_samples = data.shape[0] - for i in xrange(n_samples): - yield data[i].astype(numpy.float32), int(target[i]) + return reader -class MNISTReader(object): +class MNIST(object): """ mnist dataset reader. The `train_reader` and `test_reader` method returns a iterator of each sample. Each sample is combined by 784-dim float and a one-dim label """ - def __init__(self, random_state): + def __init__(self, random_state=0, test_size=10000, **options): data = sklearn.datasets.mldata.fetch_mldata( "MNIST original", data_home=DATA_HOME) - n_train = 60000 self.X_train, self.X_test, self.y_train, self.y_test = sklearn.model_selection.train_test_split( - data.data / 255.0, - data.target.astype("int"), - train_size=n_train, - random_state=random_state) + data.data, + data.target, + test_size=test_size, + random_state=random_state, + **options) - def train_reader(self): - return __mnist_reader__(self.X_train, self.y_train) + def train_creator(self): + return __mnist_reader_creator__(self.X_train, self.y_train) - def test_reader(self): - return __mnist_reader__(self.X_test, self.y_test) + def test_creator(self): + return __mnist_reader_creator__(self.X_test, self.y_test) -__default_instance__ = MNISTReader(0) - - -def train_reader_creator(): - """ - Default train set reader creator. - """ - return __default_instance__.train_reader - - -def test_reader_creator(): - """ - Default test set reader creator. - """ - return __default_instance__.test_reader +__default_instance__ = MNIST() +train_creator = __default_instance__.train_creator +test_creator = __default_instance__.test_creator def unittest(): - assert len(list(train_reader_creator()())) == 60000 + size = 12045 + mnist = MNIST(test_size=size) + assert len(list(mnist.test_creator()())) == size if __name__ == '__main__': From a6028d79dcaba69f6f95c7ebf9c12c33ad42b82e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 27 Feb 2017 10:39:17 +0800 Subject: [PATCH 5/5] Clean mnist reader --- python/paddle/v2/data_set/mnist.py | 35 +++++++++--------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/python/paddle/v2/data_set/mnist.py b/python/paddle/v2/data_set/mnist.py index 6f35acf683..4b392af400 100644 --- a/python/paddle/v2/data_set/mnist.py +++ b/python/paddle/v2/data_set/mnist.py @@ -15,39 +15,24 @@ def __mnist_reader_creator__(data, target): return reader -class MNIST(object): - """ - mnist dataset reader. The `train_reader` and `test_reader` method returns - a iterator of each sample. Each sample is combined by 784-dim float and a - one-dim label - """ +TEST_SIZE = 10000 - def __init__(self, random_state=0, test_size=10000, **options): - data = sklearn.datasets.mldata.fetch_mldata( - "MNIST original", data_home=DATA_HOME) - self.X_train, self.X_test, self.y_train, self.y_test = sklearn.model_selection.train_test_split( - data.data, - data.target, - test_size=test_size, - random_state=random_state, - **options) +data = sklearn.datasets.mldata.fetch_mldata( + "MNIST original", data_home=DATA_HOME) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + data.data, data.target, test_size=TEST_SIZE, random_state=0) - def train_creator(self): - return __mnist_reader_creator__(self.X_train, self.y_train) - def test_creator(self): - return __mnist_reader_creator__(self.X_test, self.y_test) +def train_creator(): + return __mnist_reader_creator__(X_train, y_train) -__default_instance__ = MNIST() -train_creator = __default_instance__.train_creator -test_creator = __default_instance__.test_creator +def test_creator(): + return __mnist_reader_creator__(X_test, y_test) def unittest(): - size = 12045 - mnist = MNIST(test_size=size) - assert len(list(mnist.test_creator()())) == size + assert len(list(test_creator()())) == TEST_SIZE if __name__ == '__main__':