commit 297c6a018b
@@ -0,0 +1,23 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# It would be too lengthy to require our users to prefix decorators with `decorator`.
# For example, we want the following line
#
# r = paddle.reader.decorator.buffered(paddle.reader.creator.text("hello.txt"))
#
# to be a shorter version:
#
# r = paddle.reader.buffered(paddle.reader.creator.text("hello.txt"))
from decorator import *
@@ -0,0 +1,60 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = ['buffered']

from Queue import Queue
from threading import Thread


def buffered(reader, size):
    """Creates a buffered data reader.

    The buffered data reader will read and save data entries into a buffer.
    Reading from the buffered data reader will not block as long as the
    buffer is not empty.

    Args:
        reader: the data reader to read from.
        size: max buffer size.

    Returns:
        The buffered data reader.
    """

    class EndSignal():
        pass

    end = EndSignal()

    def read_worker(r, q):
        for d in r:
            q.put(d)
        q.put(end)

    def create_reader():
        r = reader()
        q = Queue(maxsize=size)
        t = Thread(
            target=read_worker, args=(
                r,
                q, ))
        t.daemon = True
        t.start()
        e = q.get()
        while e != end:
            yield e
            e = q.get()

    return create_reader
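
For orientation (not part of the diff): `buffered` takes a reader creator, a zero-argument callable that returns an iterable, and returns a new creator whose iterator is fed by a daemon thread through a bounded Queue. A minimal usage sketch, with a hypothetical `count_to` generator:

    # Minimal sketch, assuming only the `buffered` decorator above;
    # `count_to` is a hypothetical example generator, not part of the commit.
    def count_to(n):
        for i in xrange(n):
            yield i

    b = buffered(lambda: count_to(5), size=2)  # wrap a reader creator
    for entry in b():  # a daemon thread prefetches up to 2 entries ahead
        print entry    # Python 2 print, matching the surrounding code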
@@ -0,0 +1,4 @@
add_test(NAME reader_decorator_test
  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
    ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/reader/tests/decorator_test.py
  WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
@@ -0,0 +1,50 @@
# Copyright PaddlePaddle contributors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.reader
import time


def reader_10(dur):
    for i in range(10):
        time.sleep(dur)
        yield i


class TestBuffered(unittest.TestCase):
    def test_read(self):
        for size in range(20):
            b = paddle.reader.buffered(lambda: reader_10(0), size)
            c = 0
            for i in b():
                self.assertEqual(i, c)
                c += 1
            self.assertEqual(c, 10)

    def test_buffering(self):
        # each read has a 30ms delay.
        b = paddle.reader.buffered(lambda: reader_10(0.03), 10)
        last_time = time.time()
        for idx, i in enumerate(b()):
            elapsed_time = time.time() - last_time
            if i == 0:
                time.sleep(0.3)
            else:
                # reads should be fast, meaning the entries were already buffered.
                self.assertLess(elapsed_time, 0.01)
            last_time = time.time()


if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,39 @@
type: "nn"
layers {
  name: "data1"
  type: "data"
  size: 30
  active_type: ""
}
layers {
  name: "data2"
  type: "data"
  size: 30
  active_type: ""
}
layers {
  name: "__seqconcat_0__"
  type: "seqconcat"
  size: 30
  active_type: ""
  inputs {
    input_layer_name: "data1"
  }
  inputs {
    input_layer_name: "data2"
  }
}
input_layer_names: "data1"
input_layer_names: "data2"
output_layer_names: "__seqconcat_0__"
sub_models {
  name: "root"
  layer_names: "data1"
  layer_names: "data2"
  layer_names: "__seqconcat_0__"
  input_layer_names: "data1"
  input_layer_names: "data2"
  output_layer_names: "__seqconcat_0__"
  is_recurrent_layer_group: false
}

@@ -0,0 +1,9 @@
from paddle.trainer_config_helpers import *

settings(batch_size=1000, learning_rate=1e-5)

din1 = data_layer(name='data1', size=30)

din2 = data_layer(name='data2', size=30)

outputs(seq_concat_layer(a=din1, b=din2))
@@ -1,242 +0,0 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import py_paddle.swig_paddle as api
import numpy as np
import paddle.trainer.PyDataProvider2 as dp2

__all__ = ['DataConverter']


class IDataConverter(object):
    def __init__(self, input_type, pos):
        """
        :param input_type: data type
        :type input_type: dp2.InputType
        :param pos: which input, start from 0
        :type pos: int
        """
        self.input_type = input_type
        assert isinstance(self.input_type, dp2.InputType)
        self.pos = pos

    def convert(self, data, argument):
        """
        Convert data to Paddle's format.
        :param data: input data
        :param argument: Paddle's Arguments to be filled
        """
        pass


class DenseConvert(IDataConverter):
    def __init__(self, input_type, pos):
        IDataConverter.__init__(self, input_type, pos)

    def convert(self, data, argument):
        """
        :param data: input data
        :type data: list | numpy array
        :param argument: Paddle's Arguments that receives the converted data
        :type argument: api.Arguments
        """
        assert isinstance(argument, api.Arguments)
        # TODO: handle data type (float, double, ...)
        data = np.array(data, np.float32)
        m = api.Matrix.createDenseFromNumpy(data)
        argument.setSlotValue(self.pos, m)


class SparseBinaryConvert(IDataConverter):
    def __init__(self, input_type, pos):
        IDataConverter.__init__(self, input_type, pos)
        self.__rows__ = [0]
        self.__cols__ = []
        self.__height__ = 0
        self.__nnz__ = 0
        self.__value__ = []

    def fill_csr(self, data):
        self.__height__ = len(data)
        for x in data:
            self.__rows__.append(self.__rows__[-1] + len(x))
            self.__cols__.extend(x)

    def convert(self, data, argument):
        assert isinstance(argument, api.Arguments)

        self.fill_csr(data)
        m = api.Matrix.createSparse(self.__height__, self.input_type.dim,
                                    len(self.__cols__),
                                    len(self.__value__) == 0)
        assert isinstance(m, api.Matrix)
        m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__)
        argument.setSlotValue(self.pos, m)


class SparseFloatConvert(SparseBinaryConvert):
    def __init__(self, input_type, pos):
        SparseBinaryConvert.__init__(self, input_type, pos)

    def fill_csr(self, data):
        self.__height__ = len(data)
        for x in data:
            # each sample is a list of (column, value) pairs
            self.__rows__.append(self.__rows__[-1] + len(x))
            cols, vals = zip(*x) if x else ((), ())
            self.__cols__.extend(cols)
            self.__value__.extend(vals)


class IndexConvert(IDataConverter):
    def __init__(self, input_type, pos):
        IDataConverter.__init__(self, input_type, pos)
        self.__ids__ = []

    def convert(self, data, argument):
        assert isinstance(argument, api.Arguments)
        self.__ids__.extend(data)

        ids = api.IVector.create(self.__ids__)
        argument.setSlotIds(self.pos, ids)


class SequenceConvert(IDataConverter):
    def __init__(self, input_type, pos, inner_convert, setter):
        """
        :param input_type: the type of input data
        :type input_type: dp2.InputType
        :param pos: the position of this input
        :type pos: int
        :param inner_convert: converter for the entries inside a sequence
        :type inner_convert: DenseConvert|SparseBinaryConvert|
                             SparseFloatConvert|IndexConvert
        :param setter: sets the sequence start positions on the argument
        :type setter: callable
        """
        IDataConverter.__init__(self, input_type, pos)
        self.__seq__ = [0]
        self.__inner_convert__ = inner_convert
        self.__setter__ = setter

    def fill_seq(self, data):
        for each in data:
            self.__seq__.append(self.__seq__[-1] + self.get_size(each))

    def convert(self, data, argument):
        self.fill_seq(data)
        seq = api.IVector.create(self.__seq__, False)
        self.__setter__(argument, self.pos, seq)

        # flatten one sequence level so the inner converter sees entries
        dat = []
        for each in data:
            dat.extend(each)
        self.__inner_convert__.convert(dat, argument)

    def get_size(self, data):
        if isinstance(self.__inner_convert__, SequenceConvert):
            return sum(self.__inner_convert__.get_size(item) for item in data)
        else:
            return len(data)


class DataConverter(object):
    def __init__(self, input):
        """
        Usage:

        .. code-block:: python

            inputs = [('image', dense_vector), ('label', integer_value)]
            cvt = DataConverter(inputs)
            arg = cvt(minibatch_data, {'image': 0, 'label': 1})

        :param input: list of (input_name, input_type)
        :type input: list
        """
        self.input_names = []
        self.input_types = []
        for each in input:
            self.input_names.append(each[0])
            self.input_types.append(each[1])
            assert isinstance(each[1], dp2.InputType)

    def convert(self, data, input_dict=None, argument=None):
        """
        Convert minibatch data to Paddle's argument. The data is a numpy
        array or a list.

        :param data: input samples, for example, [column0, column1, ...] or
                     (column0, column1, ...), where each column is one
                     minibatch feature. Note that even with only one column
                     feature, data should still be a list or a tuple:
                     [column0] or (column0, ).
        :type data: list|tuple
        :param input_dict: a dictionary to specify the correspondence
                           between data_layers and the input data. If None,
                           the feature order in argument and data is the same.
        :type input_dict: dict, like {string: integer, ...}|None
        :param argument: converted data will be saved in this argument. If
                         None, a new Paddle's Arguments is created first.
        :type argument: swig_paddle.Arguments|None
        """
        if argument is None:
            argument = api.Arguments.createArguments(0)
        assert isinstance(argument, api.Arguments)
        argument.resize(len(self.input_types))

        converts = [
            DataConverter.create_converter(i, each_type)
            for i, each_type in enumerate(self.input_types)
        ]

        for i, cvt in enumerate(converts):
            if input_dict is not None:
                dat = data[input_dict[self.input_names[i]]]
            else:
                dat = data[i]
            cvt.convert(dat, argument)

        return argument

    def __call__(self, dat, input_dict=None, argument=None):
        return self.convert(dat, input_dict, argument)

    @staticmethod
    def create_converter(pos, each):
        assert isinstance(each, dp2.InputType)
        retv = None
        if each.type == dp2.DataType.Dense:
            retv = DenseConvert(each, pos)
        elif each.type == dp2.DataType.Index:
            retv = IndexConvert(each, pos)
        elif each.type == dp2.DataType.SparseNonValue:
            retv = SparseBinaryConvert(each, pos)
        elif each.type == dp2.DataType.SparseValue:
            retv = SparseFloatConvert(each, pos)
        assert retv is not None

        if each.seq_type == dp2.SequenceType.SUB_SEQUENCE:
            retv = SequenceConvert(
                each, pos, retv,
                lambda arg, pos, seq: arg.setSlotSubSequenceStartPositions(pos, seq)
            )

        if each.seq_type in [
                dp2.SequenceType.SUB_SEQUENCE, dp2.SequenceType.SEQUENCE
        ]:
            retv = SequenceConvert(
                each, pos, retv,
                lambda arg, pos, seq: arg.setSlotSequenceStartPositions(pos, seq)
            )
        return retv
@@ -1,92 +0,0 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import py_paddle.swig_paddle as api
import numpy as np
import paddle.trainer.PyDataProvider2 as dp2

from paddle.v2.data_converter import DataConverter


class DataConverterTest(unittest.TestCase):
    def dense_reader(self, shape):
        data = np.random.random(shape)
        return data

    def sparse_binary_reader(self,
                             high,
                             size_limit,
                             batch_size,
                             non_empty=False):
        data = []
        for i in xrange(batch_size):
            num = np.random.randint(size_limit)  # num could be 0
            while non_empty and num == 0:
                num = np.random.randint(size_limit)
            data.append(np.random.randint(high, size=num).tolist())

        return data

    def test_dense_vector(self):
        def compare(input):
            converter = DataConverter([('image', dp2.dense_vector(784))])
            arg = converter([input], {'image': 0})
            output = arg.getSlotValue(0).copyToNumpyMat()
            input = np.array(input, dtype='float32')
            self.assertTrue(np.allclose(input, output))

        # test numpy array
        data = self.dense_reader(shape=[32, 784])
        compare(data)

        # test list
        compare(data.tolist())

    #def test_sparse_binary(self):
    #    dim = 100000
    #    data = self.sparse_binary_reader(dim, 5, 2)
    #    converter = DataConverter([('input', dp2.sparse_binary_vector(dim))])
    #    arg = converter([data], {'input': 0})
    #    output = arg.getSlotValue(0)

    #def test_sparse(self):
    #    dim = 100000
    #    v = self.sparse_binary_reader(dim, 5, 2)
    #    w = []
    #    for dat in v:
    #        x = self.dense_reader(shape=[1, len(dat)])
    #        w.append(x.tolist())
    #    data = []
    #    for each in zip(v, w):
    #        data.append(zip(each[0], each[1]))
    #
    #    converter = DataConverter([('input', dp2.sparse_binary_vector(dim))])
    #    arg = converter([data], {'input': 0})
    #    output = arg.getSlotValue(0)

    def test_integer(self):
        dim = 100
        index = np.random.randint(dim, size=32)
        print index
        converter = DataConverter([('input', dp2.integer_value(dim))])
        arg = converter([index], {'input': 0})
        print arg.getSlotValue(0)
        output = arg.getSlotValue(0).copyToNumpyArray()
        print 'output=', output


if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,19 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from py_paddle import DataProviderConverter

__all__ = ['DataFeeder']

DataFeeder = DataProviderConverter
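
The deleted paddle.v2.data_converter module above duplicated functionality that py_paddle already provides, so DataFeeder is now simply an alias for py_paddle's DataProviderConverter. A rough usage sketch (not part of the diff); the import path and minibatch layout here are assumptions, so consult py_paddle's DataProviderConverter for the exact API:

    # Hypothetical usage; `paddle.v2.data_feeder` is an assumed module path
    # and the nesting of the minibatch is illustrative only.
    import paddle.trainer.PyDataProvider2 as dp2
    from paddle.v2.data_feeder import DataFeeder

    feeder = DataFeeder([dp2.dense_vector(784)])  # one input type per slot
    arg = feeder([[[0.0] * 784]])  # convert a minibatch into swig Arguments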
Some files were not shown because too many files have changed in this diff.