From d2dfa70deb08c3c8b13e2154afed6a3e4ce535d7 Mon Sep 17 00:00:00 2001
From: dangqingqing
Date: Fri, 17 Feb 2017 19:34:11 +0800
Subject: [PATCH 01/11] data converter

---
 python/paddle/v2/data_converter.py | 240 +++++++++++++++++++++++++++++
 1 file changed, 240 insertions(+)
 create mode 100644 python/paddle/v2/data_converter.py

diff --git a/python/paddle/v2/data_converter.py b/python/paddle/v2/data_converter.py
new file mode 100644
index 0000000000..5d7b8a736b
--- /dev/null
+++ b/python/paddle/v2/data_converter.py
@@ -0,0 +1,240 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import py_paddle.swig_paddle
+import numpy
+
+__all__ = ['DataConverter']
+
+
+class IDataConverter(object):
+    def __init__(self, input_type, pos):
+        """
+        :param input_type: data type
+        :type input_type: dp2.InputType
+        :param pos: which input, starting from 0
+        :type pos: int
+        """
+        self.input_type = input_type
+        assert isinstance(self.input_type, dp2.InputType)
+        self.pos = pos
+
+    def convert(self, data, argument):
+        """
+        Convert data to paddle format.
+        :param data: input data
+        :param argument: output in paddle format
+        """
+        pass
+
+
+class DenseConvert(IDataConverter):
+    def __init__(self, input_type, pos):
+        IDataConverter.__init__(self, input_type, pos)
+
+    def convert(self, data, argument):
+        """
+        :param data: input data
+        :type data: list | numpy array
+        :param argument: the type which paddle accepts
+        :type argument: swig_paddle.Arguments
+        """
+        assert isinstance(argument, swig_paddle.Arguments)
+        if data.dtype != numpy.float32:
+            data = data.astype(numpy.float32)
+        m = swig_paddle.Matrix.createDenseFromNumpy(data, True, False)
+        argument.setSlotValue(self.pos, m)
+
+
+class SparseBinaryConvert(IDataConverter):
+    def __init__(self, input_type, pos):
+        IDataConverter.__init__(self, input_type, pos)
+        self.__rows__ = [0]
+        self.__cols__ = []
+        self.__height__ = 0
+        self.__nnz__ = 0
+        self.__value__ = []
+
+    def fill_csr(self, data):
+        self.__height__ = len(data)
+        for x in data:
+            self.__rows__.append(self.__rows__[-1] + len(x))
+        self__cols__ = data.flatten()
+
+    def convert(self, data, argument):
+        assert isinstance(argument, swig_paddle.Arguments)
+
+        fill_csr(data)
+        m = swig_paddle.Matrix.createSparse(self.__height__,
+                                            self.input_type.dim,
+                                            len(self.__cols__),
+                                            len(self.__value__) == 0)
+        assert isinstance(m, swig_paddle.Matrix)
+        m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__)
+        argument.setSlotValue(self.pos, m)
+
+
+class SparseFloatConvert(SparseBinaryConvert):
+    def __init__(self, input_type, pos):
+        SparseBinaryConvert.__init__(self, input_type, pos)
+
+    def fill_csr(self, data):
+        self.__height__ = len(data)
+        for x in data:
+            self.__rows__.append(self.__rows__[-1] + len(x))
+        self.__cols__.extend((x[0] for x in data))
+        self.__value__.extend((x[1] for x in data))
+
+
+class IndexConvert(IDataConverter):
+    def __init__(self, input_type, pos):
+        IDataConverter.__init__(self, input_type, pos)
+        self.__ids__ = []
+
+    def convert(self, data, argument):
+        assert isinstance(argument, swig_paddle.Arguments)
+        self.__ids__ = data.flatten()
+        ids = swig_paddle.IVector.create(self.__ids__)
+        argument.setSlotIds(self.pos, ids)
+
+
+class SequenceConvert(IDataConverter):
+    def __init__(self, input_type, pos, inner_convert, setter):
+        """
+        :param input_type: the type of input data
+        :type input_type: dp2.InputType
+        :param pos: the position of this input
+        :type pos: int
+        :param inner_convert: the converter for the data inside the sequence
+        :type inner_convert: DenseConvert|SparseBinaryConvert|
+                             SparseFloatConvert|IndexConvert
+        :param setter: a function which sets sequence start positions
+        :type setter: callable
+        """
+        IDataConverter.__init__(self, input_type, pos)
+        self.__seq__ = [0]
+        self.__inner_convert__ = inner_convert
+        self.__setter__ = setter
+
+    def fill_seq(self, data):
+        for each in data:
+            self.__seq__.append(self.__seq__[-1] + self.get_size(each))
+
+    def convert(self, data, argument):
+        self.fill_seq(data)
+        seq = swig_paddle.IVector.create(self.__seq__, False)
+        self.__setter__(argument, self.pos, seq)
+
+        dat = []
+        for each in data:
+            dat.append(each)
+        self.__inner_convert__.convert(dat, argument)
+
+    def get_size(self, data):
+        if isinstance(self.__inner_convert__, SequenceConvert):
+            return sum(self.__inner_convert__.get_size(item) for item in data)
+        else:
+            return len(data)
+
+
+class DataConverter(object):
+    def __init__(self, input_mapper):
+        """
+        Usage:
+
+        .. code-block:: python
+            inputs = [('image', dense_vector), ('label', integer_value)]
+            cvt = DataConverter(inputs)
+            arg = cvt.convert(minibatch_data, {'image':0, 'label':1})
+
+        :param input_mapper: list of (input_name, input_type)
+        :type input_mapper: list
+        """
+        assert isinstance(self.input_types, collections.Sequence)
+        self.input_names = []
+        self.input_types = []
+        for each in self.input_types:
+            self.input_names.append(each[0])
+            self.input_types.append(each[1])
+            assert isinstance(each[1], dp2.InputType)
+
+    def convert(self, data, input_dict=None, argument=None):
+        """
+        Convert minibatch data to Paddle's argument. The data is a numpy array
+        or a list.
+
+        :param data: input samples, for example, [column0, column1, ...] or
+                     (column0, column1, ...), where each column is one
+                     minibatch feature. Even with only one feature column,
+                     data should be a list or tuple: [column0] or (column0).
+        :type data: list|tuple
+        :param input_dict: a dictionary to specify the correspondence
+                           of data_layer and input data. If None,
+                           the feature order in argument and data is the same.
+        :type input_dict: dict, like {string: integer, ...}|None
+        :param argument: converted data will be saved in this argument. If None,
+                         it will create a swig_paddle.Arguments firstly.
+ :param type: swig_paddle.Arguments|None + """ + if argument is None: + argument = swig_paddle.Arguments.createArguments(0) + assert isinstance(argument, swig_paddle.Arguments) + argument.resize(len(self.input_types)) + + converts = [ + DataConverter.create_scanner(i, each_type) + for i, each_type in enumerate(self.input_types) + ] + + for i, cvt in enumerate(converts): + if input_dict is not None: + dat = data[input_dict[self.input_names[i]]] + else: + dat = data[i] + cvt.convert(dat, argument) + + return argument + + def __call__(self, dat, argument=None): + return self.convert(dat, argument) + + @staticmethod + def create_scanner(pos, each): + assert isinstance(each, dp2.InputType) + retv = None + if each.type == dp2.DataType.Dense: + retv = DenseConvert(each, pos) + elif each.type == dp2.DataType.Index: + retv = IndexConvert(each, pos) + elif each.type == dp2.DataType.SparseNonValue: + retv = SparseBinaryConvert(each, pos) + elif each.type == dp2.DataType.SparseValue: + retv = SparseFloatConvert(each, pos) + assert retv is not None + + if each.seq_type == dp2.SequenceType.SUB_SEQUENCE: + retv = SequenceConvert( + each, pos, retv, + lambda arg, pos, seq: arg.setSlotSubSequenceStartPositions(pos, seq) + ) + + if each.seq_type in [ + dp2.SequenceType.SUB_SEQUENCE, dp2.SequenceType.SEQUENCE + ]: + retv = SequenceConvert( + each, pos, retv, + lambda arg, pos, seq: arg.setSlotSequenceStartPositions(pos, seq) + ) + return retv From 733da9b9e62fb20a5adfe12f23834b7fa184dd63 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Fri, 17 Feb 2017 19:45:52 +0800 Subject: [PATCH 02/11] data converter --- python/paddle/v2/data_converter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/v2/data_converter.py b/python/paddle/v2/data_converter.py index 5d7b8a736b..45114b407d 100644 --- a/python/paddle/v2/data_converter.py +++ b/python/paddle/v2/data_converter.py @@ -15,6 +15,7 @@ import collections import py_paddle.swig_paddle import numpy +import paddle.trainer.PyDataProvider2 as dp2 __all__ = ['DataConverter'] From e6232d82e1650dc2186fa39c93a06a7ef276fc52 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Sun, 19 Feb 2017 18:22:33 +0800 Subject: [PATCH 03/11] testing in mnist --- python/paddle/v2/__init__.py | 3 +- python/paddle/v2/data_converter.py | 50 ++++++++++++++---------------- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index bc064a21ae..4ecd0dafd6 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -20,7 +20,8 @@ import event import py_paddle.swig_paddle as api __all__ = [ - 'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer', 'event' + 'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer', + 'event', 'data_converter' ] diff --git a/python/paddle/v2/data_converter.py b/python/paddle/v2/data_converter.py index 45114b407d..afb98a77c5 100644 --- a/python/paddle/v2/data_converter.py +++ b/python/paddle/v2/data_converter.py @@ -13,8 +13,8 @@ # limitations under the License. 
 import collections
-import py_paddle.swig_paddle
-import numpy
+import py_paddle.swig_paddle as api
+import numpy as np
 import paddle.trainer.PyDataProvider2 as dp2
 
 __all__ = ['DataConverter']
 
@@ -50,12 +50,12 @@ class DenseConvert(IDataConverter):
         :param data: input data
         :type data: list | numpy array
         :param argument: the type which paddle accepts
-        :type argument: swig_paddle.Arguments
+        :type argument: Paddle's Arguments
         """
-        assert isinstance(argument, swig_paddle.Arguments)
-        if data.dtype != numpy.float32:
-            data = data.astype(numpy.float32)
-        m = swig_paddle.Matrix.createDenseFromNumpy(data, True, False)
+        assert isinstance(argument, api.Arguments)
+        if data.dtype != np.float32:
+            data = data.astype(np.float32)
+        m = api.Matrix.createDenseFromNumpy(data, True, False)
         argument.setSlotValue(self.pos, m)
 
@@ -72,17 +72,16 @@ class SparseBinaryConvert(IDataConverter):
         self.__height__ = len(data)
         for x in data:
             self.__rows__.append(self.__rows__[-1] + len(x))
-        self__cols__ = data.flatten()
+        self.__cols__ = data.flatten()
 
     def convert(self, data, argument):
-        assert isinstance(argument, swig_paddle.Arguments)
+        assert isinstance(argument, api.Arguments)
 
         fill_csr(data)
-        m = swig_paddle.Matrix.createSparse(self.__height__,
-                                            self.input_type.dim,
-                                            len(self.__cols__),
-                                            len(self.__value__) == 0)
-        assert isinstance(m, swig_paddle.Matrix)
+        m = api.Matrix.createSparse(self.__height__, self.input_type.dim,
+                                    len(self.__cols__),
+                                    len(self.__value__) == 0)
+        assert isinstance(m, api.Matrix)
         m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__)
         argument.setSlotValue(self.pos, m)
 
@@ -105,9 +104,9 @@ class IndexConvert(IDataConverter):
         self.__ids__ = []
 
     def convert(self, data, argument):
-        assert isinstance(argument, swig_paddle.Arguments)
+        assert isinstance(argument, api.Arguments)
         self.__ids__ = data.flatten()
-        ids = swig_paddle.IVector.create(self.__ids__)
+        ids = api.IVector.create(self.__ids__)
         argument.setSlotIds(self.pos, ids)
 
@@ -135,7 +134,7 @@ class SequenceConvert(IDataConverter):
 
     def convert(self, data, argument):
         self.fill_seq(data)
-        seq = swig_paddle.IVector.create(self.__seq__, False)
+        seq = api.IVector.create(self.__seq__, False)
         self.__setter__(argument, self.pos, seq)
 
         dat = []
@@ -151,22 +150,21 @@ class SequenceConvert(IDataConverter):
 
 
 class DataConverter(object):
-    def __init__(self, input_mapper):
+    def __init__(self, input):
         """
         Usage:
 
         .. code-block:: python
             inputs = [('image', dense_vector), ('label', integer_value)]
             cvt = DataConverter(inputs)
-            arg = cvt.convert(minibatch_data, {'image':0, 'label':1})
+            arg = cvt(minibatch_data, {'image':0, 'label':1})
 
-        :param input_mapper: list of (input_name, input_type)
-        :type input_mapper: list
+        :param input: list of (input_name, input_type)
+        :type input: list
         """
-        assert isinstance(self.input_types, collections.Sequence)
         self.input_names = []
         self.input_types = []
-        for each in self.input_types:
+        for each in input:
             self.input_names.append(each[0])
             self.input_types.append(each[1])
             assert isinstance(each[1], dp2.InputType)
 
@@ -186,16 +184,16 @@ class DataConverter(object):
                            the feature order in argument and data is the same.
         :type input_dict: dict, like {string: integer, ...}|None
         :param argument: converted data will be saved in this argument. If None,
-                         it will create a swig_paddle.Arguments firstly.
+                         it will create a Paddle's Arguments firstly.
:param type: swig_paddle.Arguments|None """ if argument is None: - argument = swig_paddle.Arguments.createArguments(0) - assert isinstance(argument, swig_paddle.Arguments) + argument = api.Arguments.createArguments(0) + assert isinstance(argument, api.Arguments) argument.resize(len(self.input_types)) converts = [ - DataConverter.create_scanner(i, each_type) + DataConverter.create_converter(i, each_type) for i, each_type in enumerate(self.input_types) ] @@ -212,7 +210,7 @@ class DataConverter(object): return self.convert(dat, argument) @staticmethod - def create_scanner(pos, each): + def create_converter(pos, each): assert isinstance(each, dp2.InputType) retv = None if each.type == dp2.DataType.Dense: From 67b8150ff4d04552a5a52cb099bf7e935765e69f Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 21 Feb 2017 13:27:21 +0800 Subject: [PATCH 04/11] data converter test --- paddle/data_converter_test.py | 92 +++++++++++++++++++++++++ python/paddle/v2/data_converter.py | 19 ++--- python/paddle/v2/data_converter_test.py | 92 +++++++++++++++++++++++++ 3 files changed, 195 insertions(+), 8 deletions(-) create mode 100644 paddle/data_converter_test.py create mode 100644 python/paddle/v2/data_converter_test.py diff --git a/paddle/data_converter_test.py b/paddle/data_converter_test.py new file mode 100644 index 0000000000..d84ee51727 --- /dev/null +++ b/paddle/data_converter_test.py @@ -0,0 +1,92 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import py_paddle.swig_paddle as api +import numpy as np +import paddle.trainer.PyDataProvider2 as dp2 + +from paddle.v2.data_converter import DataConverter + + +class DataConverterTest(unittest.TestCase): + def dense_reader(self, shape): + data = np.random.random(shape) + return data + + def sparse_binary_reader(self, + high, + size_limit, + batch_size, + non_empty=False): + data = [] + for i in xrange(batch_size): + num = np.random.randint(size_limit) # num could be 0 + while non_empty and num == 0: + num = np.random.randint(size_limit) + data.append(np.random.randint(high, size=num).tolist()) + + return data + + def test_dense_vector(self): + def compare(input): + converter = DataConverter([('image', dp2.dense_vector(784))]) + arg = converter([input], {'image': 0}) + output = arg.getSlotValue(0).copyToNumpyMat() + input = np.array(input, dtype='float32') + self.assertAlmostEqual(input.all(), output.all()) + + # test numpy array + data = self.dense_reader(shape=[32, 784]) + compare(data) + + # test list + compare(data.tolist()) + + #def test_sparse_binary(self): + # dim = 100000 + # data = self.sparse_binary_reader(dim, 5, 2) + # converter = DataConverter([('input', dp2.sparse_binary_vector(dim))]) + # arg = converter([data], {'input':0}) + # output = arg.getSlotValue(0) + + #def test_sparse(self): + # dim = 100000 + # v = self.sparse_binary_reader(dim, 5, 2) + # w = [] + # for dat in data: + # x = self.dense_reader(shape=[1, len(dat)]) + # w.append(x.tolist()) + # data = [] + # for each in zip(v, w): + # data.append(zip(each[0], each[1])) + # + # converter = DataConverter([('input', dp2.sparse_binary_vector(dim))]) + # arg = converter([data], {'input':0}) + # output = arg.getSlotValue(0) + + def test_integer(self): + dim = 100 + index = np.random.randint(dim, size=32) + print index + converter = DataConverter([('input', dp2.integer_value(dim))]) + arg = converter([index], {'input': 0}) + print arg.getSlotValue(0) + output = arg.getSlotValue(0).copyToNumpyArray() + print 'output=', output + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/data_converter.py b/python/paddle/v2/data_converter.py index afb98a77c5..fcba43e4ba 100644 --- a/python/paddle/v2/data_converter.py +++ b/python/paddle/v2/data_converter.py @@ -53,9 +53,9 @@ class DenseConvert(IDataConverter): :type argument: Paddle's Arguments """ assert isinstance(argument, api.Arguments) - if data.dtype != np.float32: - data = data.astype(np.float32) - m = api.Matrix.createDenseFromNumpy(data, True, False) + # TODO: handle data type (float, double, ...) 
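+        # np.array below copies `data` and casts it to float32, so a nested
+        # list or a numpy array of another dtype is accepted here.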
+        data = np.array(data, np.float32)
+        m = api.Matrix.createDenseFromNumpy(data)
         argument.setSlotValue(self.pos, m)
 
@@ -72,12 +72,12 @@ class SparseBinaryConvert(IDataConverter):
         self.__height__ = len(data)
         for x in data:
             self.__rows__.append(self.__rows__[-1] + len(x))
-        self.__cols__ = data.flatten()
+            self.__cols__.extend(x)
 
     def convert(self, data, argument):
         assert isinstance(argument, api.Arguments)
 
-        fill_csr(data)
+        self.fill_csr(data)
         m = api.Matrix.createSparse(self.__height__, self.input_type.dim,
                                     len(self.__cols__),
                                     len(self.__value__) == 0)
@@ -94,8 +94,8 @@ class SparseFloatConvert(SparseBinaryConvert):
         self.__height__ = len(data)
         for x in data:
             self.__rows__.append(self.__rows__[-1] + len(x))
-        self.__cols__.extend((x[0] for x in data))
-        self.__value__.extend((x[1] for x in data))
+            self.__cols__.extend(x[0])
+            self.__value__.extend(x[1])
 
@@ -105,7 +105,10 @@ class IndexConvert(IDataConverter):
 
     def convert(self, data, argument):
         assert isinstance(argument, api.Arguments)
-        self.__ids__ = data.flatten()
+        #for x in data:
+        #    self.__ids__.append(x)
+        self.__ids__.extend(data)
+
         ids = api.IVector.create(self.__ids__)
         argument.setSlotIds(self.pos, ids)
 
diff --git a/python/paddle/v2/data_converter_test.py b/python/paddle/v2/data_converter_test.py
new file mode 100644
index 0000000000..d84ee51727
--- /dev/null
+++ b/python/paddle/v2/data_converter_test.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import unittest + +import py_paddle.swig_paddle as api +import numpy as np +import paddle.trainer.PyDataProvider2 as dp2 + +from paddle.v2.data_converter import DataConverter + + +class DataConverterTest(unittest.TestCase): + def dense_reader(self, shape): + data = np.random.random(shape) + return data + + def sparse_binary_reader(self, + high, + size_limit, + batch_size, + non_empty=False): + data = [] + for i in xrange(batch_size): + num = np.random.randint(size_limit) # num could be 0 + while non_empty and num == 0: + num = np.random.randint(size_limit) + data.append(np.random.randint(high, size=num).tolist()) + + return data + + def test_dense_vector(self): + def compare(input): + converter = DataConverter([('image', dp2.dense_vector(784))]) + arg = converter([input], {'image': 0}) + output = arg.getSlotValue(0).copyToNumpyMat() + input = np.array(input, dtype='float32') + self.assertAlmostEqual(input.all(), output.all()) + + # test numpy array + data = self.dense_reader(shape=[32, 784]) + compare(data) + + # test list + compare(data.tolist()) + + #def test_sparse_binary(self): + # dim = 100000 + # data = self.sparse_binary_reader(dim, 5, 2) + # converter = DataConverter([('input', dp2.sparse_binary_vector(dim))]) + # arg = converter([data], {'input':0}) + # output = arg.getSlotValue(0) + + #def test_sparse(self): + # dim = 100000 + # v = self.sparse_binary_reader(dim, 5, 2) + # w = [] + # for dat in data: + # x = self.dense_reader(shape=[1, len(dat)]) + # w.append(x.tolist()) + # data = [] + # for each in zip(v, w): + # data.append(zip(each[0], each[1])) + # + # converter = DataConverter([('input', dp2.sparse_binary_vector(dim))]) + # arg = converter([data], {'input':0}) + # output = arg.getSlotValue(0) + + def test_integer(self): + dim = 100 + index = np.random.randint(dim, size=32) + print index + converter = DataConverter([('input', dp2.integer_value(dim))]) + arg = converter([index], {'input': 0}) + print arg.getSlotValue(0) + output = arg.getSlotValue(0).copyToNumpyArray() + print 'output=', output + + +if __name__ == '__main__': + unittest.main() From 15180e85acaa400c629a37fadcf4589b7c086c7d Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 21 Feb 2017 15:50:01 +0800 Subject: [PATCH 05/11] remove some code --- paddle/data_converter_test.py | 92 ----------------------------------- 1 file changed, 92 deletions(-) delete mode 100644 paddle/data_converter_test.py diff --git a/paddle/data_converter_test.py b/paddle/data_converter_test.py deleted file mode 100644 index d84ee51727..0000000000 --- a/paddle/data_converter_test.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import py_paddle.swig_paddle as api -import numpy as np -import paddle.trainer.PyDataProvider2 as dp2 - -from paddle.v2.data_converter import DataConverter - - -class DataConverterTest(unittest.TestCase): - def dense_reader(self, shape): - data = np.random.random(shape) - return data - - def sparse_binary_reader(self, - high, - size_limit, - batch_size, - non_empty=False): - data = [] - for i in xrange(batch_size): - num = np.random.randint(size_limit) # num could be 0 - while non_empty and num == 0: - num = np.random.randint(size_limit) - data.append(np.random.randint(high, size=num).tolist()) - - return data - - def test_dense_vector(self): - def compare(input): - converter = DataConverter([('image', dp2.dense_vector(784))]) - arg = converter([input], {'image': 0}) - output = arg.getSlotValue(0).copyToNumpyMat() - input = np.array(input, dtype='float32') - self.assertAlmostEqual(input.all(), output.all()) - - # test numpy array - data = self.dense_reader(shape=[32, 784]) - compare(data) - - # test list - compare(data.tolist()) - - #def test_sparse_binary(self): - # dim = 100000 - # data = self.sparse_binary_reader(dim, 5, 2) - # converter = DataConverter([('input', dp2.sparse_binary_vector(dim))]) - # arg = converter([data], {'input':0}) - # output = arg.getSlotValue(0) - - #def test_sparse(self): - # dim = 100000 - # v = self.sparse_binary_reader(dim, 5, 2) - # w = [] - # for dat in data: - # x = self.dense_reader(shape=[1, len(dat)]) - # w.append(x.tolist()) - # data = [] - # for each in zip(v, w): - # data.append(zip(each[0], each[1])) - # - # converter = DataConverter([('input', dp2.sparse_binary_vector(dim))]) - # arg = converter([data], {'input':0}) - # output = arg.getSlotValue(0) - - def test_integer(self): - dim = 100 - index = np.random.randint(dim, size=32) - print index - converter = DataConverter([('input', dp2.integer_value(dim))]) - arg = converter([index], {'input': 0}) - print arg.getSlotValue(0) - output = arg.getSlotValue(0).copyToNumpyArray() - print 'output=', output - - -if __name__ == '__main__': - unittest.main() From bb625337e37c06cec40954abb83fde8b2716d44a Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Wed, 22 Feb 2017 15:28:02 +0800 Subject: [PATCH 06/11] add some comments --- python/paddle/v2/data_feeder.py | 41 +++++++++++++++++++++++++++++++++ python/paddle/v2/trainer.py | 1 - 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py index 2a0b6bbeb5..83a4efef9e 100644 --- a/python/paddle/v2/data_feeder.py +++ b/python/paddle/v2/data_feeder.py @@ -15,5 +15,46 @@ from py_paddle import DataProviderConverter __all__ = ['DataFeeder'] +""" +DataFeeder converts the data returned by paddle.reader into a data structure +of Arguments which is defined in the API. The paddle.reader usually returns +a list of mini-batch data. Each item in the list is a tuple or list, which is +one sample with multiple features. DataFeeder converts this mini-batch data +into Arguments in order to feed it to C++ interface. 
+
+The example usage:
+
+    data_types = [paddle.data_type.dense_vector(784),
+                  paddle.data_type.integer_value(10)]
+    feeder = DataFeeder(input_types=data_types)
+    minibatch_data = [
+        ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ),  # first sample
+        ( [1.0,2.0,3.0,4.0], 5, [6,7,8] )   # second sample
+    ]
+
+    # or
+    # minibatch_data = [
+    #     [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ],  # first sample
+    #     [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ]   # second sample
+    # ]
+    arg = feeder(minibatch_data)
+
+
+Args:
+    input_types: A list of input data types. Its length is equal to the length
+                 of data returned by paddle.reader. Each item specifies the
+                 type of each feature.
+    minibatch_data: A list of mini-batch data. Each item is a list or tuple,
+                    for example:
+                    [
+                        (feature_0, feature_1, feature_2, ...),  # first sample
+                        (feature_0, feature_1, feature_2, ...),  # second sample
+                        ...
+                    ]
+
+Returns:
+    An Arguments object containing this mini-batch data with multiple features.
+    The Arguments definition is in the API.
+"""
 
 DataFeeder = DataProviderConverter
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 7480a3fb84..5709c7e886 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -89,7 +89,6 @@ class SGD(ITrainer):
             event_handler = default_event_handler
 
         topology = v2_layer.parse_network(topology)
-        print topology
 
         __check_train_args__(**locals())
 

From 84b423a89a2a6ece21910c83277f6282b80f6be7 Mon Sep 17 00:00:00 2001
From: dangqingqing
Date: Thu, 23 Feb 2017 17:35:06 +0800
Subject: [PATCH 07/11] refine data feeder and add unit test

---
 python/CMakeLists.txt                      |   1 +
 python/paddle/v2/data_feeder.py            | 110 ++++++++++-----
 python/paddle/v2/tests/CMakeLists.txt      |   2 +
 python/paddle/v2/tests/run_tests.sh        |  36 +++++
 python/paddle/v2/tests/test_data_feeder.py | 150 +++++++++++++++++++++
 5 files changed, 264 insertions(+), 35 deletions(-)
 create mode 100644 python/paddle/v2/tests/CMakeLists.txt
 create mode 100755 python/paddle/v2/tests/run_tests.sh
 create mode 100644 python/paddle/v2/tests/test_data_feeder.py

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 357637e203..71af50a9a4 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -25,6 +25,7 @@ add_custom_target(paddle_python ALL DEPENDS
 
 add_subdirectory(paddle/trainer_config_helpers/tests)
 add_subdirectory(paddle/reader/tests)
+add_subdirectory(paddle/v2/tests)
 
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
     DESTINATION opt/paddle/share/wheels
diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py
index 83a4efef9e..b594643dda 100644
--- a/python/paddle/v2/data_feeder.py
+++ b/python/paddle/v2/data_feeder.py
@@ -12,49 +12,89 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from py_paddle import swig_paddle
 from py_paddle import DataProviderConverter
+import data_type
 
 __all__ = ['DataFeeder']
-
+
+
+class DataFeeder(DataProviderConverter):
+    """
+    DataFeeder converts the data returned by paddle.reader into a data structure
+    of Arguments which is defined in the API. The paddle.reader usually returns
+    a list of mini-batch data. Each item in the list is a list or a tuple,
+    which is one sample with one or multiple features. DataFeeder converts this
+    mini-batch data into Arguments in order to feed it to C++ interface.
- -The example usage: - - data_types = [paddle.data_type.dense_vector(784), - paddle.data_type.integer_value(10)] - feeder = DataFeeder(input_types=data_types) - minibatch_data = [ - ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ), # first sample - ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ) # second sample - ] - - # or - # minibatch_data = [ - # [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ], # first sample - # [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ] # second sample - # ] - arg = feeder(minibatch_data) - - -Args: - input_types: A list of input data types. It's length is equal to the length - of data returned by paddle.reader. Each item specifies the type - of each feature. - mintbatch_data: A list of mini-batch data. Each item is a list or tuple, + + +class DataFeeder(DataProviderConverter): + """ + DataFeeder converts the data returned by paddle.reader into a data structure + of Arguments which is defined in the API. The paddle.reader usually returns + a list of mini-batch data. Each item in the list is a list or a tuple, + which is one sample with one or multiple features. DataFeeder converts this + mini-batch data into Arguments in order to feed it to C++ interface. + + The example usage: + + data_types = [('image', paddle.data_type.dense_vector(784)), + ('label', paddle.data_type.integer_value(10))] + reader_dict = {'image':0, 'label':1} + feeder = DataFeeder(data_types=data_types, reader_dict=reader_dict) + minibatch_data = [ + ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ), # first sample + ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ) # second sample + ] + arg = feeder(minibatch_data) + """ + + def __init__(self, data_types, reader_dict): + """ + :param data_types: A list to specify data name and type. Each item is + a tuple of (data_name, data_type). For example: + [('image', paddle.data_type.dense_vector(784)), + ('label', paddle.data_type.integer_value(10))] + + :type data_types: A list of tuple + :param reader_dict: A dictionary to specify the position of each data + in the input data. + :type reader_dict: dict() + """ + self.input_names = [] + self.input_types = [] + self.reader_dict = reader_dict + for each in data_types: + self.input_names.append(each[0]) + self.input_types.append(each[1]) + assert isinstance(each[1], data_type.InputType) + DataProviderConverter.__init__(self, self.input_types) + + def convert(self, dat, argument=None): + """ + :param dat: A list of mini-batch data. Each item is a list or tuple, for example: [ (feature_0, feature_1, feature_2, ...), # first sample (feature_0, feature_1, feature_2, ...), # second sample ... ] + :type dat: List + :param argument: An Arguments object contains this mini-batch data with + one or multiple features. The Arguments definition is + in the API. + :type argument: swig_paddle.Arguments + """ + + if argument is None: + argument = swig_paddle.Arguments.createArguments(0) + assert isinstance(argument, swig_paddle.Arguments) + argument.resize(len(self.input_types)) + + scanners = [ + DataProviderConverter.create_scanner(i, each_type) + for i, each_type in enumerate(self.input_types) + ] + + for each_sample in dat: + for name, scanner in zip(self.input_names, scanners): + scanner.scan(each_sample[self.reader_dict[name]]) + + for scanner in scanners: + scanner.finish_scan(argument) -Returns: - An Arguments object contains this mini-batch data with multiple features. - The Arguments definition is in the API. 
-""" + return argument -DataFeeder = DataProviderConverter + def __call__(self, dat, argument=None): + return self.convert(dat, argument) diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt new file mode 100644 index 0000000000..5842a716ca --- /dev/null +++ b/python/paddle/v2/tests/CMakeLists.txt @@ -0,0 +1,2 @@ +add_test(NAME test_v2_api + COMMAND bash ${PROJ_ROOT}/python/paddle/v2/tests/run_tests.sh ${PYTHON_EXECUTABLE}) diff --git a/python/paddle/v2/tests/run_tests.sh b/python/paddle/v2/tests/run_tests.sh new file mode 100755 index 0000000000..b96f54fe9c --- /dev/null +++ b/python/paddle/v2/tests/run_tests.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +pushd `dirname $0` > /dev/null +SCRIPTPATH=$PWD +popd > /dev/null + +cd $SCRIPTPATH + +$1 -m pip install ../../../../paddle/dist/*.whl + +test_list="test_data_feeder.py" + +export PYTHONPATH=$PWD/../../../../python/ + +for fn in $test_list +do + echo "test $fn" + $1 $fn + if [ $? -ne 0 ]; then + exit 1 + fi +done diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py new file mode 100644 index 0000000000..dcf433d7d8 --- /dev/null +++ b/python/paddle/v2/tests/test_data_feeder.py @@ -0,0 +1,150 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import py_paddle.swig_paddle as api +import numpy as np + +from paddle.v2 import data_type +from paddle.v2.data_feeder import DataFeeder + + +class DataFeederTest(unittest.TestCase): + def dense_reader(self, size): + data = np.random.random(size) + return data + + def sparse_binary_reader(self, high, size_limit, non_empty=False): + num = np.random.randint(size_limit) # num could be 0 + while non_empty and num == 0: + num = np.random.randint(size_limit) + return np.random.randint(high, size=num).tolist() + + def test_dense_vector(self): + def compare(input): + feeder = DataFeeder([('image', data_type.dense_vector(784))], + {'image': 0}) + arg = feeder([input]) + output = arg.getSlotValue(0).copyToNumpyMat() + input = np.array(input, dtype='float32') + self.assertAlmostEqual(input.all(), output.all()) + + # test numpy array + batch_size = 32 + dim = 784 + data = [] + for i in xrange(batch_size): + data.append(self.dense_reader(784)) + compare(data) + + # test list + data = [] + for i in xrange(batch_size): + data.append(self.dense_reader(784).tolist()) + compare(data) + + def test_sparse_binary(self): + dim = 10000 + batch_size = 32 + data = [] + for i in xrange(batch_size): + data.append([self.sparse_binary_reader(dim, 50)]) + feeder = DataFeeder([('input', data_type.sparse_binary_vector(dim))], + {'input': 0}) + arg = feeder(data) + output = arg.getSlotValue(0) + assert isinstance(output, api.Matrix) + for i in xrange(batch_size): + self.assertEqual(output.getSparseRowCols(i), data[i][0]) + + def test_sparse(self): + dim = 10000 + batch_size = 32 + v = [] + w = [] + data = [] + for dat in xrange(batch_size): + a = self.sparse_binary_reader(dim, 40, non_empty=True) + b = self.dense_reader(len(a)).tolist() + v.append(a) + w.append(b[0]) + data.append([zip(a, b)]) + + feeder = DataFeeder([('input', data_type.sparse_vector(dim))], + {'input': 0}) + arg = feeder(data) + output = arg.getSlotValue(0) + assert isinstance(output, api.Matrix) + for i in xrange(batch_size): + self.assertEqual(output.getSparseRowCols(i), v[i]) + + def test_integer(self): + dim = 100 + batch_size = 32 + index = [] + for i in xrange(batch_size): + index.append([np.random.randint(dim)]) + feeder = DataFeeder([('input', data_type.integer_value(dim))], + {'input': 0}) + arg = feeder(index) + output = arg.getSlotIds(0).copyToNumpyArray() + index = np.array(index, dtype='int') + self.assertEqual(output.all(), index.flatten().all()) + + def test_multiple_slots(self): + batch_size = 2 + data = [] + for i in xrange(batch_size): + each_sample = [] + each_sample.append(np.random.randint(10)) # size of feature 2: 10 + each_sample.append( + self.sparse_binary_reader( + 20000, 40, non_empty=True)) # size of feature 1: 20000 + each_sample.append(self.dense_reader(100)) # size of feature 0: 100 + data.append(each_sample) + + # test multiple features + data_types = [('fea0', data_type.dense_vector(100)), + ('fea1', data_type.sparse_binary_vector(20000)), + ('fea2', data_type.integer_value(10))] + feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0}) + arg = feeder(data) + output_dense = arg.getSlotValue(0).copyToNumpyMat() + output_sparse = arg.getSlotValue(1) + output_index = arg.getSlotIds(2).copyToNumpyArray() + for i in xrange(batch_size): + self.assertEqual(output_dense[i].all(), data[i][2].all()) + self.assertEqual(output_sparse.getSparseRowCols(i), data[i][1]) + self.assertEqual(output_index[i], data[i][0]) + + # reader returns 3 featreus, but only use 2 features + data_types = [('fea0', 
data_type.dense_vector(100)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea2': 0})
+        arg = feeder(data)
+        output_dense = arg.getSlotValue(0).copyToNumpyMat()
+        output_index = arg.getSlotIds(1).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertEqual(output_dense[i].all(), data[i][2].all())
+            self.assertEqual(output_index[i], data[i][0])
+
+
+if __name__ == '__main__':
+    api.initPaddle("--use_gpu=0")
+    unittest.main()

From f3c7fbeec4e256585dcf36e08fc2c06da243a045 Mon Sep 17 00:00:00 2001
From: dangqingqing
Date: Thu, 23 Feb 2017 22:05:59 +0800
Subject: [PATCH 08/11] make mnist run

---
 demo/mnist/api_train_v2.py                 |  9 +++++----
 python/paddle/v2/__init__.py               |  1 +
 python/paddle/v2/tests/test_data_feeder.py | 22 ++++++++++++++++------
 python/paddle/v2/trainer.py                | 11 +++--------
 4 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py
index 6fc01ce58b..650bf392bb 100644
--- a/demo/mnist/api_train_v2.py
+++ b/demo/mnist/api_train_v2.py
@@ -50,11 +50,12 @@ def main():
         parameters=parameters,
         event_handler=event_handler,
         batch_size=32,  # batch size should be refactored in the Data reader
-        data_types={  # data_types will be removed, It should be in
+        data_types=[  # data_types will be removed, It should be in
             # network topology
-            'pixel': images.type,
-            'label': label.type
-        })
+            ('pixel', images.type),
+            ('label', label.type)],
+        reader_dict={'pixel':0, 'label':1}
+    )
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index be752731ba..bf06b5a7e3 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -18,6 +18,7 @@ import parameters
 import trainer
 import event
 import data_type
+import data_feeder
 import py_paddle.swig_paddle as api
 
 __all__ = [
diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py
index dcf433d7d8..95a59a5d97 100644
--- a/python/paddle/v2/tests/test_data_feeder.py
+++ b/python/paddle/v2/tests/test_data_feeder.py
@@ -36,7 +36,7 @@ class DataFeederTest(unittest.TestCase):
         def compare(input):
             feeder = DataFeeder([('image', data_type.dense_vector(784))],
                                 {'image': 0})
-            arg = feeder([input])
+            arg = feeder(input)
             output = arg.getSlotValue(0).copyToNumpyMat()
             input = np.array(input, dtype='float32')
             self.assertAlmostEqual(input.all(), output.all())
@@ -46,13 +46,17 @@ class DataFeederTest(unittest.TestCase):
         dim = 784
         data = []
         for i in xrange(batch_size):
-            data.append(self.dense_reader(784))
+            each_sample = []
+            each_sample.append(self.dense_reader(dim))
+            data.append(each_sample)
         compare(data)
 
         # test list
         data = []
         for i in xrange(batch_size):
-            data.append(self.dense_reader(784).tolist())
+            each_sample = []
+            each_sample.append(self.dense_reader(dim).tolist())
+            data.append(each_sample)
         compare(data)
 
     def test_sparse_binary(self):
@@ -60,7 +64,9 @@ class DataFeederTest(unittest.TestCase):
         batch_size = 32
         data = []
         for i in xrange(batch_size):
-            data.append([self.sparse_binary_reader(dim, 50)])
+            each_sample = []
+            each_sample.append(self.sparse_binary_reader(dim, 50))
+            data.append(each_sample)
         feeder = DataFeeder([('input', data_type.sparse_binary_vector(dim))],
                             {'input': 0})
         arg = feeder(data)
@@ -76,11 +82,13 @@ class DataFeederTest(unittest.TestCase):
         w = []
         data = []
         for dat in xrange(batch_size):
+            each_sample = []
             a = self.sparse_binary_reader(dim, 40, non_empty=True)
b = self.dense_reader(len(a)).tolist() v.append(a) w.append(b[0]) - data.append([zip(a, b)]) + each_sample.append(zip(a, b)) + data.append(each_sample) feeder = DataFeeder([('input', data_type.sparse_vector(dim))], {'input': 0}) @@ -95,7 +103,9 @@ class DataFeederTest(unittest.TestCase): batch_size = 32 index = [] for i in xrange(batch_size): - index.append([np.random.randint(dim)]) + each_sample = [] + each_sample.append(np.random.randint(dim)) + index.append(each_sample) feeder = DataFeeder([('input', data_type.integer_value(dim))], {'input': 0}) arg = feeder(index) diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 5709c7e886..023ab5e42d 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -69,7 +69,8 @@ class SGD(ITrainer): test_data_reader=None, event_handler=None, batch_size=32, - data_types=None): + data_types=None, + reader_dict=None): """ Training method. Will train num_passes of input data. @@ -103,13 +104,7 @@ class SGD(ITrainer): gm.start() out_args = api.Arguments.createArguments(0) - data_types_lists = [] - for each in topology.input_layer_names: - if each not in data_types: - raise ValueError() - data_types_lists.append(data_types[each]) - - feeder = DataFeeder(input_types=data_types_lists) + feeder = DataFeeder(data_types, reader_dict) for pass_id in xrange(num_passes): updater.startPass() From bb7db754208a7484ced25eb879bd77e7f6fae6c9 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Sat, 25 Feb 2017 10:07:15 +0800 Subject: [PATCH 09/11] add testing for duplicate item --- python/paddle/v2/tests/test_data_feeder.py | 23 +++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py index 4d5df6e893..5f67da6a5b 100644 --- a/python/paddle/v2/tests/test_data_feeder.py +++ b/python/paddle/v2/tests/test_data_feeder.py @@ -176,7 +176,7 @@ class DataFeederTest(unittest.TestCase): self.assertEqual(output_sparse.getSparseRowCols(i), data[i][1]) self.assertEqual(output_index[i], data[i][0]) - # reader returns 3 featreus, but only use 2 features + # reader returns 3 features, but only use 2 features data_types = [('fea0', data_type.dense_vector(100)), ('fea2', data_type.integer_value(10))] feeder = DataFeeder(data_types, {'fea0': 2, 'fea2': 0}) @@ -187,6 +187,27 @@ class DataFeederTest(unittest.TestCase): self.assertEqual(output_dense[i].all(), data[i][2].all()) self.assertEqual(output_index[i], data[i][0]) + # reader returns 3 featreus, one is duplicate data + data_types = [('fea0', data_type.dense_vector(100)), + ('fea1', data_type.sparse_binary_vector(20000)), + ('fea2', data_type.integer_value(10)), + ('fea3', data_type.dense_vector(100))] + feeder = DataFeeder(data_types, + {'fea0': 2, + 'fea1': 1, + 'fea2': 0, + 'fea3': 2}) + arg = feeder(data) + fea0 = arg.getSlotValue(0).copyToNumpyMat() + fea1 = arg.getSlotValue(1) + fea2 = arg.getSlotIds(2).copyToNumpyArray() + fea3 = arg.getSlotValue(3).copyToNumpyMat() + for i in xrange(batch_size): + self.assertEqual(fea0[i].all(), data[i][2].all()) + self.assertEqual(fea1.getSparseRowCols(i), data[i][1]) + self.assertEqual(fea2[i], data[i][0]) + self.assertEqual(fea3[i].all(), data[i][2].all()) + def test_multiple_features_tuple(self): batch_size = 2 data = [] From 72c1327832b1b75057a19d98387546bb2f765ff4 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 27 Feb 2017 13:28:36 +0800 Subject: [PATCH 10/11] follow comments --- python/paddle/v2/data_feeder.py | 28 
++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py index 74ee112f46..632f1760ab 100644 --- a/python/paddle/v2/data_feeder.py +++ b/python/paddle/v2/data_feeder.py @@ -62,8 +62,8 @@ class DataFeeder(DataProviderConverter): self.reader_dict = reader_dict for each in data_types: self.input_names.append(each[0]) - self.input_types.append(each[1]) assert isinstance(each[1], data_type.InputType) + self.input_types.append(each[1]) DataProviderConverter.__init__(self, self.input_types) def convert(self, dat, argument=None): @@ -88,24 +88,16 @@ class DataFeeder(DataProviderConverter): :type argument: swig_paddle.Arguments """ - if argument is None: - argument = swig_paddle.Arguments.createArguments(0) - assert isinstance(argument, swig_paddle.Arguments) - argument.resize(len(self.input_types)) - - scanners = [ - DataProviderConverter.create_scanner(i, each_type) - for i, each_type in enumerate(self.input_types) - ] - - for each_sample in dat: - for name, scanner in zip(self.input_names, scanners): - scanner.scan(each_sample[self.reader_dict[name]]) - - for scanner in scanners: - scanner.finish_scan(argument) + def reorder_data(data): + retv = [] + for each in data: + reorder = [] + for name in self.input_names: + reorder.append(each[self.reader_dict[name]]) + retv.append(reorder) + return retv - return argument + return DataProviderConverter.convert(self, reorder_data(dat), argument) def __call__(self, dat, argument=None): return self.convert(dat, argument) From bc074d0e581bb0804d11246c3d87ccaa1a5abc50 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 27 Feb 2017 13:43:41 +0800 Subject: [PATCH 11/11] minor change --- python/paddle/v2/data_feeder.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py index 632f1760ab..2a16d46dda 100644 --- a/python/paddle/v2/data_feeder.py +++ b/python/paddle/v2/data_feeder.py @@ -58,13 +58,13 @@ class DataFeeder(DataProviderConverter): :type reader_dict: dict() """ self.input_names = [] - self.input_types = [] + input_types = [] self.reader_dict = reader_dict for each in data_types: self.input_names.append(each[0]) assert isinstance(each[1], data_type.InputType) - self.input_types.append(each[1]) - DataProviderConverter.__init__(self, self.input_types) + input_types.append(each[1]) + DataProviderConverter.__init__(self, input_types) def convert(self, dat, argument=None): """ @@ -98,6 +98,3 @@ class DataFeeder(DataProviderConverter): return retv return DataProviderConverter.convert(self, reorder_data(dat), argument) - - def __call__(self, dat, argument=None): - return self.convert(dat, argument)
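
For reference, here is a minimal usage sketch of the DataFeeder this series
ends with. It is assembled from the docstrings and unit tests above, not from
any code outside the series; the input names, dimensions, and sample values
are illustrative only:

.. code-block:: python

    import py_paddle.swig_paddle as api
    from paddle.v2 import data_type
    from paddle.v2.data_feeder import DataFeeder

    api.initPaddle("--use_gpu=0")

    # One (name, type) pair per input slot. reader_dict maps each input
    # name to the position that feature occupies inside every sample.
    data_types = [('image', data_type.dense_vector(784)),
                  ('label', data_type.integer_value(10))]
    feeder = DataFeeder(data_types, reader_dict={'image': 0, 'label': 1})

    # A mini-batch is a list of samples; each sample holds its features
    # at the positions given in reader_dict.
    minibatch = [([0.1] * 784, 3),
                 ([0.2] * 784, 7)]
    arg = feeder(minibatch)  # a swig_paddle.Arguments instance

    image = arg.getSlotValue(0).copyToNumpyMat()  # 2 x 784 float32 matrix
    label = arg.getSlotIds(1).copyToNumpyArray()  # integer ids [3, 7]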