From f5ca74ef1974bb11c5c948af783410203a0dd33d Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Fri, 2 Jun 2017 16:28:14 +0800 Subject: [PATCH 01/11] fix book link --- doc/getstarted/index_cn.rst | 2 +- doc/getstarted/index_en.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst index 0cb27f802c..aa418c657a 100644 --- a/doc/getstarted/index_cn.rst +++ b/doc/getstarted/index_cn.rst @@ -7,4 +7,4 @@ build_and_install/index_cn.rst concepts/use_concepts_cn.rst -- `深度学习入门课程 `_ +- `深度学习入门课程 `_ diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst index 9f771e93e8..be3253e3d4 100644 --- a/doc/getstarted/index_en.rst +++ b/doc/getstarted/index_en.rst @@ -6,4 +6,4 @@ GET STARTED build_and_install/index_en.rst -- `Deep Learning 101 `_ +- `Deep Learning 101 `_ From e73e5cd0231f7b0d7c1587b244fb6a25574a4ef0 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Fri, 2 Jun 2017 16:48:10 +0800 Subject: [PATCH 02/11] specify the sphinx version in travis-ci --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 865e21f046..7bffc00ef1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -47,7 +47,7 @@ before_install: - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. - - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker + - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: From 252c84dbe1e4659e5a3b6f10fe80a893e88ecb5c Mon Sep 17 00:00:00 2001 From: xuwei06 Date: Thu, 1 Jun 2017 16:20:38 -0700 Subject: [PATCH 03/11] Correctly handle print_layer in V2 API print_layer is used to print the values of a layer when evaluating the model for debugging purpose. No layer depends on print_layer. It should be added to the topology if it is used to print some layer already in the topology. --- python/paddle/trainer_config_helpers/layers.py | 10 +++++++++- python/paddle/v2/layer.py | 14 ++++++++++++++ python/paddle/v2/tests/test_layer.py | 1 + 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 81cce31fec..5762e1d159 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -111,6 +111,7 @@ __all__ = [ 'block_expand_layer', 'maxout_layer', 'out_prod_layer', + 'printer_layer', 'print_layer', 'priorbox_layer', 'cross_channel_norm_layer', @@ -969,7 +970,7 @@ def fc_layer(input, @wrap_name_default("print") -def print_layer(input, name=None): +def printer_layer(input, name=None): """ Print the output value of input layers. This layer is useful for debugging. @@ -991,6 +992,13 @@ def print_layer(input, name=None): inputs=[l.name for l in input], ) # this layer don't return anything, can not be input of other layer. +# Keep print_layer for compatibility with V1 API. +# 'print_layer' does not work for V2 API because it will be changed to +# 'print' for V2 API. But 'print' is a reserved key word in python. 
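+#
+# A minimal usage sketch (the layer names below are hypothetical, not part
+# of this patch): printer_layer consumes one or more layers and prints
+# their values during the forward pass, e.g.
+#
+#   out = fc_layer(input=data, size=10, act=SoftmaxActivation())
+#   printer_layer(input=[out])  # prints out's values when the model runs
+#
+# printer_layer returns nothing, so it cannot feed another layer.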
+
+
+print_layer = printer_layer
+
 
 @wrap_name_default("priorbox")
 def priorbox_layer(input,
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index da2abdd2d1..815635f5dd 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -149,6 +149,20 @@ def __get_used_layers__(output_layers, extra_layers=None):
     for layer in output_layers:
         dfs_travel(layer.full_name)
 
+    # The print layer needs to be specially handled because no other
+    # layer depends on it. It is used to print the result of some
+    # layers when running the model for debugging purposes. So we explicitly
+    # add a print layer to the topology if its input is in the topology.
+    for layer in cp.g_config.model_config.layers:
+        if layer.type == 'print':
+            used = True
+            for inp in layer.inputs:
+                if inp.input_layer_name not in layer_names:
+                    used = False
+                    break
+            if used:
+                layer_names.add(layer.name)
+
     return layer_names
 
 
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
index 2d25b1a9dc..f2097e195f 100644
--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py
@@ -164,6 +164,7 @@ class OtherLayerTest(unittest.TestCase):
         maxid = layer.max_id(input=inference)
         sampling_id = layer.sampling_id(input=inference)
         eos = layer.eos(input=maxid, eos_id=5)
+        layer.printer(maxid)
         print layer.parse_network([maxid, sampling_id, eos])
 
     def test_slicing_joining_layer(self):

From 2799b0ec50de669709d7e95ae82b7512426d5387 Mon Sep 17 00:00:00 2001
From: "wanghaoshuang@baidu.com"
Date: Wed, 24 May 2017 00:14:07 +0800
Subject: [PATCH 04/11] Add flowers dataset for image classification model

---
 python/paddle/v2/dataset/flowers.py           | 255 ++++++++++++++++++
 .../paddle/v2/dataset/tests/flowers_test.py   |  51 ++++
 python/paddle/v2/image.py                     |  36 ++-
 python/paddle/v2/reader/decorator.py          |  75 +++++-
 4 files changed, 409 insertions(+), 8 deletions(-)
 create mode 100644 python/paddle/v2/dataset/flowers.py
 create mode 100644 python/paddle/v2/dataset/tests/flowers_test.py

diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
new file mode 100644
index 0000000000..3d38b5dab9
--- /dev/null
+++ b/python/paddle/v2/dataset/flowers.py
@@ -0,0 +1,255 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CIFAR dataset.
+
+This module will download dataset from
+http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
+and parse train/test set into paddle reader creators.
+
+This set contains images of flowers belonging to 102 different categories.
+The images were acquired by searching the web and taking pictures. There are a
+minimum of 40 images for each category.
+
+The database was used in:
+
+Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
+number of classes. Proceedings of the Indian Conference on Computer Vision,
+Graphics and Image Processing (2008)
+http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
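+
+A minimal usage sketch (a hypothetical consumer of the readers defined
+below; each sample is a flattened float32 image plus an int label):
+
+.. code-block:: python
+
+    import paddle.v2.dataset.flowers as flowers
+
+    for pixels, label in flowers.train()():
+        pass  # pixels: 3*224*224 float32 values in [0, 1], label in [1, 102]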
+ +""" +import cPickle +import itertools +from common import download +import tarfile +import scipy.io as scio +from image import * +import os +from multiprocessing import Process +from multiprocessing import Pool +from multiprocessing import cpu_count +import numpy as np +import paddle.v2 as paddle +__all__ = ['train', 'test', 'valid'] + +DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' +LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat' +SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' +DATA_MD5 = '52808999861908f626f3c1f4e79d11fa' +LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' +SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' + + +def extract_file(tarFile): + ''' + Extract tar file to tmp dir. + + Example usage: + + .. code-block:: python + tmp = extract_file("/home/work/test.tar.gz") + + :param tarFile: target tar file + :type tarFile: string + :return: extracted dir. For example: + '/home/work/test/' while input is '/home/work/test.tar.gz' + :rtype: string + ''' + base_dir = os.path.dirname(tarFile) + base_name = os.path.basename(tarFile) + if '.' in base_name: + base_name = base_name.split('.', 1)[0] + out_path = '/'.join([base_dir, base_name]) + if not os.path.exists(out_path): + df = tarfile.open(tarFile, mode='r') + df.extractall(path=out_path) + df.close() + return out_path + + +def default_mapper(sample): + ''' + map image bytes data to type needed by model input layer + ''' + img, label = sample + img = paddle.image.load_image_bytes(img) + img = paddle.image.simple_transform(img, 256, 224, True) + return img.flatten().astype('float32'), label + + +def reader_creator(data_file, + label_file, + setid_file, + flag, + mapper=default_mapper): + ''' + 1. extract 102flowers.tgz to 102flowers/ + 2. merge images into batch files in 102flowers_batch/ + 3. 
get a reader to read sample from batch file + + :param data_file: downloaded data file + :type data_file: string + :param label_file: downloaded label file + :type label_file: string + :param setid_file: downloaded setid file containing information + about how to split dataset + :type setid_file: string + :param flag: data set name (tstid|trnid|valid) + :type flag: string + :param mapper: a function to map image bytes data to type + needed by model input layer + :type mapper: callable + :return: data reader + :rtype: callable + ''' + base_dir = os.path.dirname(data_file) + tmp_dir = extract_file(data_file) + file_list = create_batch(tmp_dir, label_file, setid_file, flag) + + def reader(): + for file in open(file_list): + file = file.strip() + batch = None + with open(file, 'r') as f: + batch = cPickle.load(f) + data = batch['data'] + labels = batch['label'] + for sample, label in itertools.izip(data, batch['label']): + yield sample, int(label) + + return paddle.reader.xmap(mapper, reader, cpu_count(), 1024 * 8) + + +def create_batch(data_dir, + label_file, + setid_file, + flag, + numPerBatch=1024, + nThread=16): + batch_dir = data_dir + "_batch" + labels = scio.loadmat(label_file)['labels'][0] + indexes = scio.loadmat(setid_file)[flag][0] + count = len(indexes) + out_path = "%s/%s" % (batch_dir, flag) + meta_file = "%s/%s.txt" % (batch_dir, flag) + + if os.path.exists(out_path): + return meta_file + else: + os.makedirs(out_path) + + def batch(file_out, start, end): + data = [] + labellist = [] + for index in indexes[start:end]: + img_name = "%s/jpg/image_%05d.jpg" % (data_dir, index) + with open(img_name, 'r') as f: + data.append(f.read()) + labellist.append(labels[index - 1]) + output = {} + output['label'] = labellist + output['data'] = data + cPickle.dump( + output, open(file_out, 'w'), protocol=cPickle.HIGHEST_PROTOCOL) + + cur_id = 0 + file_id = 0 + while cur_id < count: + thread = [] + for i in xrange(nThread): + end_id = min(cur_id + numPerBatch, count) + batch_file_name = "%s/batch_%05d" % (out_path, file_id) + w = Process(target=batch, args=(batch_file_name, cur_id, end_id)) + w.daemon = True + thread.append(w) + cur_id = end_id + file_id += 1 + if cur_id == count: + break + for t in thread: + t.start() + for t in thread: + t.join() + with open(meta_file, 'a') as meta: + for file in os.listdir(out_path): + meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n") + return meta_file + + +def train(mapper=default_mapper): + ''' + Create flowers training set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + :param mapper: a function to map sample. + :type mapper: callable + :return: train data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), 'trnid') + + +def test(mapper=default_mapper): + ''' + Create flowers test set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + :param mapper: a function to map sample. 
+ :type mapper: callable + :return: test data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), 'tstid') + + +def valid(): + ''' + Create flowers validation set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), 'valid') + + +def fetch(): + download(DATA_URL, 'flowers', DATA_MD5) + download(LABEL_URL, 'flowers', LABEL_MD5) + download(SETID_URL, 'flowers', SETID_MD5) + + +if __name__ == '__main__': + for i in test()(): + pass diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py new file mode 100644 index 0000000000..cc0626f4fe --- /dev/null +++ b/python/paddle/v2/dataset/tests/flowers_test.py @@ -0,0 +1,51 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.flowers +import unittest + + +class TestFlowers(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + size = 224 * 224 * 3 + for l in reader(): + self.assertEqual(l[0].size, size) + if l[1] > label: + label = l[1] + sum += 1 + return sum, label + + def test_train(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.train()) + self.assertEqual(instances, 1020) + self.assertEqual(max_label_value, 102) + + def test_test(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.test()) + self.assertEqual(instances, 6149) + self.assertEqual(max_label_value, 102) + + def test_valid(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.valid()) + self.assertEqual(instances, 1020) + self.assertEqual(max_label_value, 102) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py index 85ad6984ba..cb5725de68 100644 --- a/python/paddle/v2/image.py +++ b/python/paddle/v2/image.py @@ -1,14 +1,14 @@ import numpy as np try: import cv2 -except: - print( - "import cv2 error, please install opencv-python: pip install opencv-python" - ) +except ImportError: + cv2 = None + +from cv2 import resize __all__ = [ - "load_image", "resize_short", "to_chw", "center_crop", "random_crop", - "left_right_flip", "simple_transform", "load_and_transform" + "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop", + "random_crop", "left_right_flip", "simple_transform", "load_and_transform" ] """ This file contains some common interfaces for image preprocess. @@ -28,6 +28,28 @@ the image layout as follows. 
""" +def load_image_bytes(bytes, is_color=True): + """ + Load an color or gray image from bytes array. + + Example usage: + + .. code-block:: python + with open('cat.jpg') as f: + im = load_image(f.read()) + + :param bytes: the input image bytes array. + :type file: str + :param is_color: If set is_color True, it will load and + return a color image. Otherwise, it will + load and return a gray image. + """ + flag = 1 if is_color else 0 + file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8) + img = cv2.imdecode(file_bytes, flag) + return img + + def load_image(file, is_color=True): """ Load an color or gray image from the file path. @@ -76,7 +98,7 @@ def resize_short(im, size): h_new = size * h / w else: w_new = size * w / h - im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) + im = resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) return im diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index 104ce9a041..f06792314f 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -14,13 +14,15 @@ __all__ = [ 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', - 'ComposeNotAligned', 'firstn' + 'ComposeNotAligned', 'firstn', 'xmap' ] import itertools import random from Queue import Queue from threading import Thread +from multiprocessing import Queue as MQueue +from multiprocessing import Process def map_readers(func, *readers): @@ -224,3 +226,74 @@ def firstn(reader, n): yield item return firstn_reader + + +class XmapEndSignal(): + pass + + +def xmap(mapper, reader, process_num, buffer_size): + """ + Use multiprocess to map samples from reader by a mapper defined by user. + And this function contains a buffered decorator. + :param mapper: a function to map sample. + :type mapper: callable + :param reader: the data reader to read from + :type reader: callable + :param process_num: process number to handle original sample + :type process_num: int + :param buffer_size: max buffer size + :type buffer_size: int + :return: the decarated reader + :rtype: callable + """ + end = XmapEndSignal() + in_queue = MQueue(buffer_size) + out_queue = MQueue(buffer_size) + + # define a worker to read samples from reader to in_queue + def read_worker(reader, in_queue): + for i in reader(): + in_queue.put(i) + in_queue.put(end) + + # start a read worker in a thread + t = Thread(target=read_worker, args=(reader, in_queue)) + t.daemon = True + t.start() + + # define a worker to handle samples from in_queue by mapper + # and put mapped samples into out_queue + def handle_worker(in_queue, out_queue, mapper): + sample = in_queue.get() + while not isinstance(sample, XmapEndSignal): + r = mapper(sample) + out_queue.put(r) + sample = in_queue.get() + in_queue.put(end) + out_queue.put(end) + + # start several handle_workers + workers = [] + for i in xrange(process_num): + worker = Process( + target=handle_worker, args=(in_queue, out_queue, mapper)) + worker.daemon = True + workers.append(worker) + for w in workers: + w.start() + + def xreader(): + sample = out_queue.get() + while not isinstance(sample, XmapEndSignal): + yield sample + sample = out_queue.get() + finish = 1 + while finish < process_num: + sample = out_queue.get() + if isinstance(sample, XmapEndSignal): + finish += 1 + else: + yield sample + + return xreader From e62a4d7abe5287fd5fdc3464ef81a5c682a49589 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Fri, 2 Jun 2017 10:56:15 +0800 Subject: [PATCH 05/11] xmap: change multiprocess to multithread. 
---
 python/paddle/v2/dataset/flowers.py  | 150 +++++++--------------
 python/paddle/v2/image.py            |  70 ++++++++++++-
 python/paddle/v2/reader/decorator.py |   8 +-
 3 files changed, 110 insertions(+), 118 deletions(-)

diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index 3d38b5dab9..d9a39b11df 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-CIFAR dataset.
-
 This module will download dataset from
 http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
 and parse train/test set into paddle reader creators.
@@ -35,13 +33,11 @@
 import itertools
 from common import download
 import tarfile
 import scipy.io as scio
-from image import *
+from paddle.v2.image import *
 import os
-from multiprocessing import Process
-from multiprocessing import Pool
-from multiprocessing import cpu_count
 import numpy as np
 import paddle.v2 as paddle
+from multiprocessing import cpu_count
 __all__ = ['train', 'test', 'valid']
 
 DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
@@ -52,33 +48,6 @@ LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
 
 
-def extract_file(tarFile):
-    '''
-    Extract tar file to tmp dir.
-
-    Example usage:
-
-    .. code-block:: python
-        tmp = extract_file("/home/work/test.tar.gz")
-
-    :param tarFile: target tar file
-    :type tarFile: string
-    :return: extracted dir. For example:
-             '/home/work/test/' while input is '/home/work/test.tar.gz'
-    :rtype: string
-    '''
-    base_dir = os.path.dirname(tarFile)
-    base_name = os.path.basename(tarFile)
-    if '.' in base_name:
-        base_name = base_name.split('.', 1)[0]
-    out_path = '/'.join([base_dir, base_name])
-    if not os.path.exists(out_path):
-        df = tarfile.open(tarFile, mode='r')
-        df.extractall(path=out_path)
-        df.close()
-    return out_path
-
-
 def default_mapper(sample):
     '''
     map image bytes data to type needed by model input layer
@@ -92,12 +61,13 @@ def default_mapper(sample):
 def reader_creator(data_file,
                    label_file,
                    setid_file,
-                   flag,
-                   mapper=default_mapper):
+                   dataset_name,
+                   mapper=default_mapper,
+                   buffered_size=1024):
     '''
-    1. extract 102flowers.tgz to 102flowers/
-    2. merge images into batch files in 102flowers_batch/
-    3. get a reader to read sample from batch file
+    1. read images from tar file and
+    merge images into batch files in 102flowers.tgz_batch/
+    2. 
get a reader to read sample from batch file :param data_file: downloaded data file :type data_file: string @@ -106,17 +76,23 @@ def reader_creator(data_file, :param setid_file: downloaded setid file containing information about how to split dataset :type setid_file: string - :param flag: data set name (tstid|trnid|valid) - :type flag: string + :param dataset_name: data set name (tstid|trnid|valid) + :type dataset_name: string :param mapper: a function to map image bytes data to type needed by model input layer :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int :return: data reader :rtype: callable ''' - base_dir = os.path.dirname(data_file) - tmp_dir = extract_file(data_file) - file_list = create_batch(tmp_dir, label_file, setid_file, flag) + labels = scio.loadmat(label_file)['labels'][0] + indexes = scio.loadmat(setid_file)[dataset_name][0] + img2label = {} + for i in indexes: + img = "jpg/image_%05d.jpg" % i + img2label[img] = labels[i - 1] + file_list = batch_images_from_tar(data_file, dataset_name, img2label) def reader(): for file in open(file_list): @@ -129,66 +105,10 @@ def reader_creator(data_file, for sample, label in itertools.izip(data, batch['label']): yield sample, int(label) - return paddle.reader.xmap(mapper, reader, cpu_count(), 1024 * 8) + return paddle.reader.xmap(mapper, reader, cpu_count(), buffered_size) -def create_batch(data_dir, - label_file, - setid_file, - flag, - numPerBatch=1024, - nThread=16): - batch_dir = data_dir + "_batch" - labels = scio.loadmat(label_file)['labels'][0] - indexes = scio.loadmat(setid_file)[flag][0] - count = len(indexes) - out_path = "%s/%s" % (batch_dir, flag) - meta_file = "%s/%s.txt" % (batch_dir, flag) - - if os.path.exists(out_path): - return meta_file - else: - os.makedirs(out_path) - - def batch(file_out, start, end): - data = [] - labellist = [] - for index in indexes[start:end]: - img_name = "%s/jpg/image_%05d.jpg" % (data_dir, index) - with open(img_name, 'r') as f: - data.append(f.read()) - labellist.append(labels[index - 1]) - output = {} - output['label'] = labellist - output['data'] = data - cPickle.dump( - output, open(file_out, 'w'), protocol=cPickle.HIGHEST_PROTOCOL) - - cur_id = 0 - file_id = 0 - while cur_id < count: - thread = [] - for i in xrange(nThread): - end_id = min(cur_id + numPerBatch, count) - batch_file_name = "%s/batch_%05d" % (out_path, file_id) - w = Process(target=batch, args=(batch_file_name, cur_id, end_id)) - w.daemon = True - thread.append(w) - cur_id = end_id - file_id += 1 - if cur_id == count: - break - for t in thread: - t.start() - for t in thread: - t.join() - with open(meta_file, 'a') as meta: - for file in os.listdir(out_path): - meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n") - return meta_file - - -def train(mapper=default_mapper): +def train(mapper=default_mapper, buffered_size=1024): ''' Create flowers training set reader. It returns a reader, each sample in the reader is @@ -199,16 +119,19 @@ def train(mapper=default_mapper): 3. flatten :param mapper: a function to map sample. 
     :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
     :return: train data reader
     :rtype: callable
     '''
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'trnid')
+        download(SETID_URL, 'flowers', SETID_MD5), 'trnid', mapper,
+        buffered_size)
 
 
-def test(mapper=default_mapper):
+def test(mapper=default_mapper, buffered_size=1024):
     '''
     Create flowers test set reader.
     It returns a reader, each sample in the reader is
     image pixels in [0, 1] and label in [1, 102]
     translated from original color image by steps:
     1. resize to 256*256
     2. random crop to 224*224
     3. flatten
     :param mapper: a function to map sample.
     :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
     :return: test data reader
     :rtype: callable
     '''
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'tstid')
+        download(SETID_URL, 'flowers', SETID_MD5), 'tstid', mapper,
+        buffered_size)
 
 
-def valid():
+def valid(mapper=default_mapper, buffered_size=1024):
     '''
     Create flowers validation set reader.
     It returns a reader, each sample in the reader is
     image pixels in [0, 1] and label in [1, 102]
     translated from original color image by steps:
     1. resize to 256*256
     2. random crop to 224*224
     3. flatten
+    :param mapper: a function to map sample.
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: validation data reader
+    :rtype: callable
     '''
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'valid')
+        download(SETID_URL, 'flowers', SETID_MD5), 'valid', mapper,
+        buffered_size)
 
 
 def fetch():
     download(DATA_URL, 'flowers', DATA_MD5)
     download(LABEL_URL, 'flowers', LABEL_MD5)
     download(SETID_URL, 'flowers', SETID_MD5)
-
-
-if __name__ == '__main__':
-    for i in test()():
-        pass
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
index cb5725de68..56031e8734 100644
--- a/python/paddle/v2/image.py
+++ b/python/paddle/v2/image.py
@@ -5,10 +5,14 @@ except ImportError:
     cv2 = None
 
 from cv2 import resize
+import os
+import tarfile
+import cPickle
 
 __all__ = [
     "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
-    "random_crop", "left_right_flip", "simple_transform", "load_and_transform"
+    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
+    "batch_images_from_tar"
 ]
 """
 This file contains some common interfaces for image preprocess.
@@ -28,6 +32,68 @@ the image layout as follows.
 """
 
 
+def batch_images_from_tar(data_file,
+                          dataset_name,
+                          img2label,
+                          num_per_batch=1024):
+    """
+    Read images from tar file and batch them into batch file.
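+
+    Example usage (hypothetical paths; assumes tar member names like
+    'jpg/image_00001.jpg', as in the flowers dataset):
+
+    .. code-block:: python
+        img2label = {'jpg/image_00001.jpg': 77}
+        meta = batch_images_from_tar('102flowers.tgz', 'train', img2label)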
+
+    :param data_file: path of image tar file
+    :type data_file: string
+    :param dataset_name: 'train', 'test' or 'valid'
+    :type dataset_name: string
+    :param img2label: a dict with image file name as key
+                      and image's label as value
+    :type img2label: dict
+    :param num_per_batch: image number per batch file
+    :type num_per_batch: int
+    :return: path of list file containing paths of batch file
+    :rtype: string
+    """
+    batch_dir = data_file + "_batch"
+    out_path = "%s/%s" % (batch_dir, dataset_name)
+    meta_file = "%s/%s.txt" % (batch_dir, dataset_name)
+
+    if os.path.exists(out_path):
+        return meta_file
+    else:
+        os.makedirs(out_path)
+
+    tf = tarfile.open(data_file)
+    mems = tf.getmembers()
+    data = []
+    labels = []
+    file_id = 0
+    for mem in mems:
+        if mem.name in img2label:
+            data.append(tf.extractfile(mem).read())
+            labels.append(img2label[mem.name])
+            if len(data) == num_per_batch:
+                output = {}
+                output['label'] = labels
+                output['data'] = data
+                cPickle.dump(
+                    output,
+                    open('%s/batch_%d' % (out_path, file_id), 'w'),
+                    protocol=cPickle.HIGHEST_PROTOCOL)
+                file_id += 1
+                data = []
+                labels = []
+    if len(data) > 0:
+        output = {}
+        output['label'] = labels
+        output['data'] = data
+        cPickle.dump(
+            output,
+            open('%s/batch_%d' % (out_path, file_id), 'w'),
+            protocol=cPickle.HIGHEST_PROTOCOL)
+
+    with open(meta_file, 'a') as meta:
+        for file in os.listdir(out_path):
+            meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n")
+    return meta_file
+
+
 def load_image_bytes(bytes, is_color=True):
     """
     Load a color or gray image from bytes array.
@@ -36,7 +102,7 @@ def load_image_bytes(bytes, is_color=True):
 
     .. code-block:: python
         with open('cat.jpg') as f:
-            im = load_image(f.read())
+            im = load_image_bytes(f.read())
 
     :param bytes: the input image bytes array.
     :type bytes: str
     :param is_color: If set is_color True, it will load and
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
index f06792314f..1b5df21b3d 100644
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -21,8 +21,6 @@
 import itertools
 import random
 from Queue import Queue
 from threading import Thread
-from multiprocessing import Queue as MQueue
-from multiprocessing import Process
 
 
 def map_readers(func, *readers):
@@ -248,8 +246,8 @@ def xmap(mapper, reader, process_num, buffer_size):
     """
     end = XmapEndSignal()
-    in_queue = MQueue(buffer_size)
-    out_queue = MQueue(buffer_size)
+    in_queue = Queue(buffer_size)
+    out_queue = Queue(buffer_size)
 
     # define a worker to read samples from reader to in_queue
     def read_worker(reader, in_queue):
@@ -276,7 +274,7 @@ def xmap(mapper, reader, process_num, buffer_size):
     # start several handle_workers
     workers = []
     for i in xrange(process_num):
-        worker = Process(
+        worker = Thread(
             target=handle_worker, args=(in_queue, out_queue, mapper))
         worker.daemon = True
         workers.append(worker)

From 990b7d7bc9e7f4963d4555ab2dc1dce40ae28bb3 Mon Sep 17 00:00:00 2001
From: wanghaoshuang
Date: Mon, 5 Jun 2017 16:40:12 +0800
Subject: [PATCH 06/11] rename xmap to xmap_readers and remove 'from cv2
 import resize' in image.py

---
 python/paddle/v2/dataset/flowers.py  | 3 ++-
 python/paddle/v2/image.py            | 4 +---
 python/paddle/v2/reader/decorator.py | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index d9a39b11df..07c13cf719 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -105,7 +105,8 @@ def reader_creator(data_file,
             for sample, label in itertools.izip(data,
batch['label']): yield sample, int(label) - return paddle.reader.xmap(mapper, reader, cpu_count(), buffered_size) + return paddle.reader.xmap_readers(mapper, reader, + cpu_count(), buffered_size) def train(mapper=default_mapper, buffered_size=1024): diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py index 56031e8734..0d648e9ae6 100644 --- a/python/paddle/v2/image.py +++ b/python/paddle/v2/image.py @@ -3,8 +3,6 @@ try: import cv2 except ImportError: cv2 = None - -from cv2 import resize import os import tarfile import cPickle @@ -164,7 +162,7 @@ def resize_short(im, size): h_new = size * h / w else: w_new = size * w / h - im = resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) + im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) return im diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index 1b5df21b3d..c76faa596c 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -14,7 +14,7 @@ __all__ = [ 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', - 'ComposeNotAligned', 'firstn', 'xmap' + 'ComposeNotAligned', 'firstn', 'xmap_readers' ] import itertools @@ -230,7 +230,7 @@ class XmapEndSignal(): pass -def xmap(mapper, reader, process_num, buffer_size): +def xmap_readers(mapper, reader, process_num, buffer_size): """ Use multiprocess to map samples from reader by a mapper defined by user. And this function contains a buffered decorator. From 6cb6a548a9af17dcfe09d47b1177cc24a2cbdb7e Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 5 Jun 2017 17:55:53 +0800 Subject: [PATCH 07/11] rename CMAKE_CURRENT_LIST_DIR to CMAKE_CURRENT_SOURCE_DIR --- cmake/cpplint.cmake | 2 +- cmake/generic.cmake | 6 +++--- go/cmake/golang.cmake | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake index 02a5c0b2c9..48f705818b 100644 --- a/cmake/cpplint.cmake +++ b/cmake/cpplint.cmake @@ -59,7 +59,7 @@ macro(add_style_check_target TARGET_NAME) "--filter=${STYLE_FILTER}" "--write-success=${CUR_GEN}" ${filename} DEPENDS ${filename} - WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}) + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endforeach() endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 052530608e..43cd6b398b 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -182,7 +182,7 @@ function(go_library TARGET_NAME) COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}" ${go_library_SRCS} - WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}) + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) add_custom_target(${TARGET_NAME}_lib ALL DEPENDS ${TARGET_NAME}_timestamp ${go_library_DEPS}) add_library(${TARGET_NAME} STATIC IMPORTED) set_property(TARGET ${TARGET_NAME} PROPERTY @@ -199,7 +199,7 @@ function(go_binary TARGET_NAME) COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" ${go_library_SRCS} - WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}) + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_binary_DEPS}) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin) endfunction(go_binary) @@ -213,7 +213,7 @@ function(go_test TARGET_NAME) COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" ${go_test_SRCS} - WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}) + WORKING_DIRECTORY 
${CMAKE_CURRENT_SOURCE_DIR})
   add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})
   add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
 endfunction(go_test)
diff --git a/go/cmake/golang.cmake b/go/cmake/golang.cmake
index e73b0c865b..d38d06de23 100644
--- a/go/cmake/golang.cmake
+++ b/go/cmake/golang.cmake
@@ -39,7 +39,7 @@ function(GO_LIBRARY NAME BUILD_TYPE)
     COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
     -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
     ${CMAKE_GO_FLAGS} ${GO_SOURCE}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 
   add_custom_target(${NAME} ALL DEPENDS ${OUTPUT_DIR}/.timestamp ${ARGN})
   add_dependencies(${NAME} goGet)

From 35d03c847c4492d7c516b3c2f2b57c50edccb0fc Mon Sep 17 00:00:00 2001
From: liaogang
Date: Tue, 6 Jun 2017 11:19:22 +0800
Subject: [PATCH 08/11] remove VERBOSE in ctest

---
 paddle/scripts/docker/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 9f0f9f2d74..40e2b72330 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -58,7 +58,7 @@ EOF
 make -j `nproc`
 if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
     pip uninstall -y py-paddle paddle || true
-    ctest -V
+    ctest
 fi


From f202d4e036788378f4bd36b2d3dcd7bf50ed24d9 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Tue, 6 Jun 2017 11:21:55 +0800
Subject: [PATCH 09/11] ctest output on failure

---
 paddle/scripts/docker/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 40e2b72330..2b48e4dc0f 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -58,7 +58,7 @@ EOF
 make -j `nproc`
 if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
     pip uninstall -y py-paddle paddle || true
-    ctest
+    ctest --output-on-failure
 fi


From 366ea1d879d88c43aecc6c9eed14ca0e19e61fcb Mon Sep 17 00:00:00 2001
From: Liu Yiqun
Date: Mon, 5 Jun 2017 23:29:22 -0700
Subject: [PATCH 10/11] Remove DYNAMIC_ARCH when building openblas for
 arm-based architectures.
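DYNAMIC_ARCH compiles kernels for many x86 CPU models and selects one at
run time, so it is only meaningful on x86 hosts; the ARM cross builds pin
a fixed TARGET instead. Roughly, the two OpenBLAS invocations this change
selects between look like the following sketch (the cross compiler name
is a placeholder, not part of this patch):

    # native x86_64 build: runtime kernel dispatch
    make CC=gcc DYNAMIC_ARCH=1 NUM_THREADS=64 NO_SHARED=1 NO_LAPACK=1 libs
    # ARM cross build: single fixed kernel
    make CC=arm-linux-gnueabihf-gcc HOSTCC=gcc TARGET=ARMV7 USE_THREAD=0 \
         NO_SHARED=1 NO_LAPACK=1 libs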
--- cmake/cudnn.cmake | 17 +++++------------ cmake/external/openblas.cmake | 27 ++++++++++++++++----------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 92dce20c69..69f40df516 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -11,23 +11,16 @@ find_path(CUDNN_INCLUDE_DIR cudnn.h get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) -if(NOT ${CMAKE_HOST_SYSTEM_PROCESSOR}) - execute_process( - COMMAND uname -m COMMAND tr -d '\n' - OUTPUT_VARIABLE HOST_ARCH - RESULT_VARIABLE UNAME_RESULT) - if(${UNAME_RESULT}) - set(HOST_ARCH "x86_64") - endif(${UNAME_RESULT}) -else(NOT ${CMAKE_HOST_SYSTEM_PROCESSOR}) - set(HOST_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR}) -endif(NOT ${CMAKE_HOST_SYSTEM_PROCESSOR}) +set(TARGET_ARCH "x86_64") +if(NOT ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif() list(APPEND CUDNN_CHECK_LIBRARY_DIRS ${CUDNN_ROOT} ${CUDNN_ROOT}/lib64 ${CUDNN_ROOT}/lib - ${CUDNN_ROOT}/lib/${HOST_ARCH}-linux-gnu + ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index cb67793cf9..46a2dca442 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -24,20 +24,25 @@ IF(NOT ${CBLAS_FOUND}) SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${LIBRARY_PREFIX}openblas${STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) - SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1) + SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs) - IF(ANDROID) - # arm_soft_fp_abi branch of OpenBLAS to support softfp - # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi - SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0 libs) - ELSEIF(RPI) - # use hardfp - SET(OPENBLAS_COMMIT "v0.2.19") - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0 libs) + IF(CMAKE_CROSSCOMPILING) + IF(ANDROID) + # arm_soft_fp_abi branch of OpenBLAS to support softfp + # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi + SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") + SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) + ELSEIF(RPI) + # use hardfp + SET(OPENBLAS_COMMIT "v0.2.19") + SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0) + ENDIF() ELSE() SET(OPENBLAS_COMMIT "v0.2.19") - SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 libs NUM_THREADS=64) + SET(OPTIONAL_ARGS "") + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(-64)?$") + SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) + ENDIF() ENDIF() ExternalProject_Add( From e5d33e7760f56adb135de4c322a4420851da0639 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 6 Jun 2017 06:53:41 +0000 Subject: [PATCH 11/11] Fix typo. --- cmake/external/openblas.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 46a2dca442..2341e3785b 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -40,7 +40,7 @@ IF(NOT ${CBLAS_FOUND}) ELSE() SET(OPENBLAS_COMMIT "v0.2.19") SET(OPTIONAL_ARGS "") - IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(-64)?$") + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) ENDIF() ENDIF()
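A minimal end-to-end smoke test of the Python APIs touched by this series
(a hypothetical script, not part of any patch; it assumes opencv-python is
installed so the default mapper can decode images):

    import paddle.v2.dataset.flowers as flowers

    count = 0
    for pixels, label in flowers.valid()():
        # each sample: flattened 3*224*224 float32 image, label in [1, 102]
        assert pixels.size == 3 * 224 * 224
        count += 1
    print count  # flowers_test.py expects 1020 samples in this split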