Unify DataLoader APIs (#19305)
* unify DataLoader APIs, test=develop * integrate iterable CPU Dataset, test=develop add GPU dataset supporting, test=develop * add unittests for dataset, test=develop * add more docs to dataloader apis, test=develop, test=document_preview * refine doc, test=develop * refine doc again, test=develop * increase coverage, test=develop (branch: expand_as_op_1)
parent
278dd00322
commit
0436efd6a3
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,221 @@
|
||||
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle.fluid as fluid
|
||||
import numpy as np
|
||||
import six
|
||||
import os
|
||||
import unittest
|
||||
from simple_nets import simple_fc_net_with_inputs
|
||||
|
||||
BATCH_SIZE = 32
|
||||
BATCH_NUM = 10
|
||||
EPOCH_NUM = 4
|
||||
|
||||
IMAGE_SHAPE = [2, 3]
|
||||
LABEL_SHAPE = [1]
|
||||
|
||||
ALL_WRITTEN_FILES = set()
|
||||
|
||||
|
||||
def get_place_string(p):
    """Return a short human-readable name for a fluid place object.

    Accepts either a user-facing place (``fluid.CPUPlace`` /
    ``fluid.CUDAPlace``) or a low-level ``fluid.core.Place``; the former is
    converted to the latter before its type is inspected.
    """
    # BUG FIX: the original wrote ``isinstance(p, (fluid.CPUPlace or
    # fluid.CUDAPlace))``; the ``or`` evaluates to just ``fluid.CPUPlace``,
    # so CUDAPlace instances were never converted. Use a tuple of types.
    if isinstance(p, (fluid.CPUPlace, fluid.CUDAPlace)):
        tmp = fluid.core.Place()
        tmp.set_place(p)
        p = tmp

    if p._type() == fluid.CPUPlace()._type():
        return 'CPUPlace()'
    else:
        return 'CUDAPlace()'
|
||||
|
||||
|
||||
def remove_all_written_files():
    """Delete every temporary data file recorded in ALL_WRITTEN_FILES."""
    for written_file in ALL_WRITTEN_FILES:
        os.remove(written_file)
|
||||
|
||||
|
||||
def write_reader_data_to_file(filename, reader):
    """Serialize every sample yielded by ``reader`` into a text file.

    One line per sample; each field is flattened to 1-D and written as
    ``<size> <v0> <v1> ...`` with fields separated by single spaces.  The
    file name is recorded in ALL_WRITTEN_FILES so that
    remove_all_written_files() can clean up afterwards.
    """
    ALL_WRITTEN_FILES.add(filename)
    with open(filename, 'w') as out:
        for sample in reader():
            for field in sample:
                flat = np.reshape(field, [field.size, ])
                out.write(str(flat.size) + ' ')
                out.write(' '.join(map(str, flat)))
                out.write(' ')

            out.write('\n')
|
||||
|
||||
|
||||
def fake_reader(batch_size=BATCH_SIZE, batch_num=BATCH_NUM):
    """Build a sample-level reader yielding random (image, label) pairs.

    The reader yields ``batch_size * batch_num + batch_size // 2`` samples,
    i.e. ``batch_num`` full batches plus a trailing half batch, so callers
    can exercise drop_last behavior.

    BUG FIX: the original computed the sample count from the module-level
    BATCH_SIZE/BATCH_NUM constants, silently ignoring both parameters, so
    check_batch_number's per-file ``batch_num`` overrides had no effect.
    """

    def __reader__():
        # int() handles a float batch_num (randomized deltas come from a
        # numpy array in check_batch_number).
        iteration = int(batch_size * batch_num + batch_size / 2)
        for _ in six.moves.range(iteration):
            image = np.random.random(size=IMAGE_SHAPE).astype('float32')
            # randint's high bound is exclusive, so labels lie in [0, 9] —
            # same range as the deprecated random_integers(low=0, high=9).
            label = np.random.randint(
                low=0, high=10, size=LABEL_SHAPE).astype('int64')
            yield image, label

    return __reader__
|
||||
|
||||
|
||||
class DatasetLoaderTestBase(unittest.TestCase):
    """Base case verifying DataLoader.from_dataset() batch counts/placement.

    Subclasses override setUp to select the dataset flavor
    (QueueDataset / InMemoryDataset) and the drop_last policy.
    """

    def setUp(self):
        self.dataset_name = "QueueDataset"
        self.drop_last = False

    def tearDown(self):
        # BUG FIX: the original had a bare ``return`` before this call
        # (debug leftover), so the temporary data files were never removed.
        remove_all_written_files()

    def build_network(self):
        """Build a simple fc net; return (main_prog, startup_prog, feeds)."""
        main_prog = fluid.Program()
        startup_prog = fluid.Program()
        with fluid.program_guard(main_prog, startup_prog):
            image = fluid.layers.data(
                name='image', shape=IMAGE_SHAPE, dtype='float32')
            label = fluid.layers.data(
                name='label', shape=LABEL_SHAPE, dtype='int64')

            simple_fc_net_with_inputs(image, label)

        return main_prog, startup_prog, [image, label]

    def check_batch_number(self, place, randomize_batch_num=False):
        """Run EPOCH_NUM epochs and verify batch shapes, sizes and places.

        When ``randomize_batch_num`` is set, each data file gets a randomly
        perturbed batch count; the perturbations are forced to sum to zero
        so the total sample count stays fixed.
        """
        main_prog, startup_prog, feeds = self.build_network()
        dataset = fluid.DatasetFactory().create_dataset(self.dataset_name)
        dataset.set_batch_size(BATCH_SIZE)

        if isinstance(place, fluid.CPUPlace):
            # One file per (virtual) CPU device.
            file_num = 10
            os.environ['CPU_NUM'] = str(file_num)
            places = fluid.cpu_places()
        else:
            file_num = fluid.core.get_cuda_device_count()
            places = fluid.cuda_places()

        filelist = []
        if file_num > 1 and randomize_batch_num:
            # random_integers is deprecated; randint's high bound is
            # exclusive, so this draws from [-BATCH_NUM//2, BATCH_NUM//2].
            random_delta_batch_size = np.random.randint(
                low=-BATCH_NUM // 2, high=BATCH_NUM // 2 + 1, size=[file_num])
            # Make the deltas cancel out so the total is unchanged.
            random_delta_batch_size[-1] = -int(
                np.sum(random_delta_batch_size[0:-1]))
        else:
            random_delta_batch_size = np.zeros(shape=[file_num])

        for i in six.moves.range(file_num):
            filename = 'dataset_test_{}.txt'.format(i)
            filelist.append(filename)
            write_reader_data_to_file(
                filename,
                fake_reader(batch_num=BATCH_NUM + random_delta_batch_size[i]))

        dataset.set_filelist(filelist)
        dataset.set_use_var(feeds)
        dataset.set_pipe_command("cat")
        if self.dataset_name == 'InMemoryDataset':
            dataset.load_into_memory()

        dataloader = fluid.io.DataLoader.from_dataset(
            dataset=dataset, places=places, drop_last=self.drop_last)
        prog = fluid.CompiledProgram(main_prog).with_data_parallel()
        exe = fluid.Executor(place)

        exe.run(startup_prog)

        for _ in six.moves.range(EPOCH_NUM):
            has_complete_batch = False
            for batch_id, data in enumerate(dataloader):
                # One feed dict per device.
                self.assertEqual(len(places), len(data))
                for idx, data_on_each_device in enumerate(data):
                    image = data_on_each_device["image"]
                    label = data_on_each_device["label"]

                    if self.drop_last:
                        batch_size = BATCH_SIZE
                    else:
                        # fake_reader appends a half batch per file.
                        if batch_id == BATCH_NUM:
                            batch_size = BATCH_SIZE // 2
                        else:
                            batch_size = BATCH_SIZE

                    self.assertEqual(image.shape()[1:], IMAGE_SHAPE)
                    self.assertTrue(
                        image._place()._equals(places[idx]),
                        msg=get_place_string(image._place()) + ' vs ' +
                        get_place_string(places[idx]))
                    if self.drop_last:
                        self.assertEqual(image.shape()[0], BATCH_SIZE)
                    else:
                        self.assertTrue(image.shape()[0] == BATCH_SIZE or
                                        image.shape()[0] == BATCH_SIZE // 2)

                    self.assertEqual(label.shape()[1:], LABEL_SHAPE)
                    self.assertTrue(label._place()._equals(places[idx]))
                    if self.drop_last:
                        self.assertEqual(label.shape()[0], BATCH_SIZE)
                    else:
                        self.assertTrue(label.shape()[0] == BATCH_SIZE or
                                        label.shape()[0] == BATCH_SIZE // 2)

                    self.assertEqual(image.shape()[0], label.shape()[0])

                    if image.shape()[0] == BATCH_SIZE:
                        has_complete_batch = True

                exe.run(prog, feed=data)

            # Every epoch must contain at least one full-size batch.
            self.assertTrue(has_complete_batch)

    def get_all_places(self):
        """CPU place, plus one CUDA place when compiled with CUDA."""
        p = [fluid.CPUPlace()]
        if fluid.is_compiled_with_cuda():
            p.append(fluid.CUDAPlace(0))
        return p

    def test_batch_number_with_same_length_files(self):
        for p in self.get_all_places():
            with fluid.scope_guard(fluid.Scope()):
                self.check_batch_number(place=p, randomize_batch_num=False)

    def test_batch_number_with_different_length_files(self):
        for p in self.get_all_places():
            with fluid.scope_guard(fluid.Scope()):
                self.check_batch_number(place=p, randomize_batch_num=True)
|
||||
|
||||
|
||||
class QueueDatasetTestWithDropLast(DatasetLoaderTestBase):
    """QueueDataset variant with drop_last enabled.

    BUG FIX: this class was named ``QueueDatasetTestWithoutDropLast`` while
    setting ``drop_last = True``. The base class already covers QueueDataset
    with drop_last=False, so the drop-last variant is what this case adds;
    the name now matches the InMemoryDataset sibling pair.
    """

    def setUp(self):
        self.dataset_name = "QueueDataset"
        self.drop_last = True
|
||||
|
||||
|
||||
class InMemoryDatasetTestWithoutDropLast(DatasetLoaderTestBase):
    """Run the base batch-number checks on InMemoryDataset, keeping the
    final partial batch (drop_last disabled)."""

    def setUp(self):
        self.dataset_name = "InMemoryDataset"
        self.drop_last = False
|
||||
|
||||
|
||||
class InMemoryDatasetTestWithDropLast(DatasetLoaderTestBase):
    """Run the base batch-number checks on InMemoryDataset, discarding the
    final partial batch (drop_last enabled)."""

    def setUp(self):
        self.dataset_name = "InMemoryDataset"
        self.drop_last = True
|
||||
|
||||
|
||||
# Script entry point: run all test cases in this module.
if __name__ == '__main__':
    unittest.main()
|
@@ -0,0 +1,196 @@
|
||||
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import paddle.fluid as fluid
|
||||
import numpy as np
|
||||
import time
|
||||
import six
|
||||
import unittest
|
||||
from paddle.fluid.reader import DataLoaderBase
|
||||
|
||||
EPOCH_NUM = 20
|
||||
BATCH_SIZE = 32
|
||||
BATCH_NUM = 20
|
||||
CLASS_NUM = 10
|
||||
|
||||
|
||||
def random_reader():
    """Sample-level reader yielding BATCH_SIZE * BATCH_NUM random pairs.

    Seeded with 1 so repeated runs (legacy vs. iterable loader) iterate the
    identical data, which test_main relies on when comparing loss curves.
    Yields (image, label): a float64 vector of length 784 and an int label
    in [0, CLASS_NUM - 1].
    """
    np.random.seed(1)
    for _ in range(BATCH_SIZE * BATCH_NUM):
        image = np.random.random([784])
        # np.random.random_integers is deprecated; randint's high bound is
        # exclusive, so this draws the same [0, CLASS_NUM - 1] labels from
        # the same RNG stream.
        label = np.random.randint(low=0, high=CLASS_NUM)
        yield image, label
|
||||
|
||||
|
||||
def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
    """Build a small fc classifier fed through DataLoader.from_generator.

    Both programs are seeded with 1 so repeated builds are comparable.
    Returns (startup_prog, main_prog, py_reader, loss).
    """
    startup_prog = fluid.Program()
    main_prog = fluid.Program()
    startup_prog.random_seed = 1
    main_prog.random_seed = 1

    with fluid.unique_name.guard():
        with fluid.program_guard(main_prog, startup_prog):
            image = fluid.layers.data(
                name='image', shape=[784], dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            # Iterable mode is the new-style loader; legacy mode drives the
            # program through start()/reset() instead of Python iteration.
            py_reader = fluid.io.DataLoader.from_generator(
                feed_list=[image, label],
                capacity=4,
                iterable=not use_legacy_py_reader,
                use_double_buffer=use_double_buffer)

            feat = image
            for width in (10, 20, 30):
                # Constant bias keeps initialization deterministic.
                const_bias = fluid.ParamAttr(
                    initializer=fluid.initializer.Constant(value=1.0))
                feat = fluid.layers.fc(
                    feat, size=width, act='tanh', bias_attr=const_bias)

            predict_label = fluid.layers.fc(
                feat, size=CLASS_NUM, act='softmax')
            cross_entropy = fluid.layers.cross_entropy(
                input=predict_label, label=label)
            loss = fluid.layers.mean(cross_entropy)

            optimizer = fluid.optimizer.Adam()
            optimizer.minimize(loss)
    return startup_prog, main_prog, py_reader, loss
|
||||
|
||||
|
||||
class TestBase(unittest.TestCase):
    """Compare the iterable DataLoader against the legacy py_reader path.

    Both configurations train the same seeded network on the same seeded
    data; with double buffering disabled their loss curves must agree to
    within 1e-3.
    """

    def run_main(self, use_legacy_py_reader, with_data_parallel, places,
                 use_double_buffer):
        """Train EPOCH_NUM epochs; return wall time, step counts, losses."""
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            startup_prog, main_prog, py_reader, loss = simple_fc_net(
                places, use_legacy_py_reader, use_double_buffer)

            reader = paddle.batch(random_reader, batch_size=BATCH_SIZE)

            # Without double buffering the loader yields CPU tensors, so
            # the expected tensor places are CPU places in that case.
            ps = places if use_double_buffer else fluid.cpu_places(len(places))

            py_reader.set_sample_list_generator(
                reader, places=ps if py_reader.iterable else None)

            exe = fluid.Executor(place=places[0])
            exe.run(startup_prog)

            prog = fluid.CompiledProgram(main_prog)
            if with_data_parallel:
                prog = prog.with_data_parallel(
                    loss_name=loss.name, places=places)

            step = 0
            step_list = []
            loss_list = []
            start_t = time.time()
            if not py_reader.iterable:
                # Legacy path: start(), run until EOF, then reset(),
                # once per epoch.
                for _ in six.moves.range(EPOCH_NUM):
                    step = 0
                    py_reader.start()
                    while True:
                        try:
                            L, = exe.run(program=prog,
                                         fetch_list=[loss],
                                         use_program_cache=True)
                            loss_list.append(np.mean(L))
                            step += 1
                        except fluid.core.EOFException:
                            py_reader.reset()
                            break
                    step_list.append(step)
            else:
                # Iterable path: the loader yields one feed dict per device.
                # CLEANUP: removed the per-batch ``print(d)`` debug leftover
                # that flooded the test log.
                for _ in six.moves.range(EPOCH_NUM):
                    step = 0
                    for d in py_reader():
                        assert len(d) == len(places), "{} != {}".format(
                            len(d), len(places))
                        for i, item in enumerate(d):
                            image = item['image']
                            label = item['label']
                            assert image.shape() == [BATCH_SIZE, 784]
                            assert label.shape() == [BATCH_SIZE, 1]
                            assert image._place()._equals(ps[i])
                            assert label._place()._equals(ps[i])
                        L, = exe.run(program=prog,
                                     feed=d,
                                     fetch_list=[loss],
                                     use_program_cache=True)
                        loss_list.append(np.mean(L))
                        step += 1
                    step_list.append(step)
            end_t = time.time()
            ret = {
                "time": end_t - start_t,
                "step": step_list,
                "loss": np.array(loss_list)
            }
            return ret

    def prepare_places(self, with_data_parallel, with_cpu=True, with_gpu=True):
        """Enumerate the device-list configurations to exercise."""
        places = []
        if with_cpu:
            places.append([fluid.CPUPlace()])
            if with_data_parallel:
                places.append([fluid.CPUPlace()] * 2)

        if with_gpu and fluid.core.is_compiled_with_cuda():
            tmp = fluid.cuda_places()
            assert len(tmp) > 0, "no gpu detected"
            if with_data_parallel:
                places.append(tmp)
            places.append([tmp[0]])
        return places

    def test_main(self):
        # CLEANUP: removed the per-configuration debug ``print``.
        for with_data_parallel in [True, False]:
            for p in self.prepare_places(with_data_parallel):
                for use_double_buffer in [False, True]:
                    results = []
                    for use_legacy_py_reader in [False, True]:
                        ret = self.run_main(
                            use_legacy_py_reader=use_legacy_py_reader,
                            with_data_parallel=with_data_parallel,
                            places=p,
                            use_double_buffer=use_double_buffer)
                        results.append(ret)
                    if not use_double_buffer:
                        diff = np.max(
                            np.abs(results[0]['loss'] - results[1]['loss']))
                        self.assertLess(diff, 1e-3)
|
||||
|
||||
|
||||
class TestDataLoaderBaseAbstract(unittest.TestCase):
    """DataLoaderBase is abstract: its iteration hooks must raise."""

    def test_main(self):
        loader = DataLoaderBase()
        # IDIOM: use assertRaises instead of the try/assertTrue(False)
        # pattern, which on failure reports a confusing "False is not true"
        # rather than the missing exception.
        with self.assertRaises(NotImplementedError):
            loader.__iter__()

        with self.assertRaises(NotImplementedError):
            loader.__next__()
|
||||
|
||||
|
||||
# Script entry point: run all test cases in this module.
if __name__ == '__main__':
    unittest.main()
|
Loading…
Reference in new issue