Paddle/python/paddle/dataset/common.py


# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import requests
import hashlib
import os
import errno
import shutil
import six
import sys
import importlib
import paddle.dataset
import six.moves.cPickle as pickle
import glob

__all__ = [
    'DATA_HOME',
    'download',
    'md5file',
    'split',
    'cluster_files_reader',
]

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')


# When running unit tests, there could be multiple processes trying to
# create the DATA_HOME directory simultaneously, so we cannot use an if
# condition to check for the existence of the directory; instead, we use
# the filesystem as the synchronization mechanism by catching the
# returned errors.
def must_mkdirs(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise


must_mkdirs(DATA_HOME)


def md5file(fname):
    """Compute the MD5 hex digest of the file at ``fname``."""
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
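

# Example usage (illustrative sketch only; the filename and expected digest
# below are hypothetical placeholders):
#
#     expected = "0123456789abcdef0123456789abcdef"
#     if md5file("some_archive.tar.gz") != expected:
#         raise IOError("checksum mismatch")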


def download(url, module_name, md5sum, save_name=None):
    """Download ``url`` into DATA_HOME/``module_name`` and verify its md5.

    A cached file is reused if it already exists with a matching md5sum;
    otherwise the download is retried up to three times.
    """
    dirname = os.path.join(DATA_HOME, module_name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    filename = os.path.join(dirname,
                            url.split('/')[-1]
                            if save_name is None else save_name)

    if os.path.exists(filename) and md5file(filename) == md5sum:
        return filename

    retry = 0
    retry_limit = 3
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
        if os.path.exists(filename):
            sys.stderr.write("file md5 %s does not match expected %s\n" %
                             (md5file(filename), md5sum))
        if retry < retry_limit:
            retry += 1
        else:
            raise RuntimeError("Cannot download {0} within retry limit {1}".
                               format(url, retry_limit))
        sys.stderr.write("Cache file %s not found, downloading %s\n" %
                         (filename, url))
        sys.stderr.write("Begin to download\n")
        try:
            r = requests.get(url, stream=True)
            total_length = r.headers.get('content-length')

            if total_length is None:
                with open(filename, 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
            else:
                with open(filename, 'wb') as f:
                    chunk_size = 4096
                    total_length = int(total_length)
                    total_iter = total_length // chunk_size + 1
                    log_interval = total_iter // 20 if total_iter > 20 else 1
                    log_index = 0
                    for data in r.iter_content(chunk_size=chunk_size):
                        if six.PY2:
                            data = six.b(data)
                        f.write(data)
                        log_index += 1
                        if log_index % log_interval == 0:
                            sys.stderr.write(".")
                            sys.stderr.flush()
        except Exception:
            # Network error; fall through to the next retry iteration.
            continue

    sys.stderr.write("\nDownload finished\n")
    sys.stderr.flush()
    return filename
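

# Example usage (illustrative sketch only; the URL and md5 below are
# hypothetical placeholders, not a real Paddle dataset):
#
#     URL = "https://example.com/datasets/housing.tar.gz"
#     MD5 = "0123456789abcdef0123456789abcdef"
#     local_path = download(URL, "my_module", MD5)
#     print("cached at", local_path)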


def fetch_all():
    for module_name in [
            x for x in dir(paddle.dataset) if not x.startswith("__")
    ]:
        if "fetch" in dir(
                importlib.import_module("paddle.dataset.%s" % module_name)):
            getattr(
                importlib.import_module("paddle.dataset.%s" % module_name),
                "fetch")()


def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump):
    """
    Split samples from a reader into multiple pickle files.

    You can call the function as:

    split(paddle.dataset.cifar.train10(), line_count=1000,
          suffix="imikolov-train-%05d.pickle")

    The output files will be:

    |-imikolov-train-00000.pickle
    |-imikolov-train-00001.pickle
    |- ...
    |-imikolov-train-00480.pickle

    :param reader: is a reader creator
    :param line_count: line count for each file
    :param suffix: the suffix for the output files, should contain "%d",
                   which will be replaced by the id of each file.
                   Default is "%05d.pickle"
    :param dumper: is a callable function that dumps an object to a file.
                   It will be called as dumper(obj, f), where obj is the
                   object to be dumped and f is a file object.
                   Default is cPickle.dump.
    """
    if not callable(dumper):
        raise TypeError("dumper should be callable.")
    lines = []
    indx_f = 0
    for i, d in enumerate(reader()):
        lines.append(d)
        if i >= line_count and i % line_count == 0:
            # Pickle requires binary mode.
            with open(suffix % indx_f, "wb") as f:
                dumper(lines, f)
            lines = []
            indx_f += 1
    if lines:
        with open(suffix % indx_f, "wb") as f:
            dumper(lines, f)
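

# Example (illustrative sketch only; reads back one shard produced by split()
# above, the filename is a hypothetical placeholder):
#
#     with open("imikolov-train-00000.pickle", "rb") as f:
#         samples = pickle.load(f)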


def cluster_files_reader(files_pattern,
                         trainer_count,
                         trainer_id,
                         loader=pickle.load):
    """
    Create a reader that yields elements from the given files, selecting
    a subset of the files according to trainer_count and trainer_id.

    :param files_pattern: glob pattern matching the files generated by split(...)
    :param trainer_count: total trainer count
    :param trainer_id: the trainer rank id
    :param loader: is a callable function that loads an object from a file.
                   It will be called as loader(f), where f is a file object.
                   Default is cPickle.load.
    """

    def reader():
        if not callable(loader):
            raise TypeError("loader should be callable.")
        file_list = glob.glob(files_pattern)
        file_list.sort()
        my_file_list = []
        for idx, fn in enumerate(file_list):
            if idx % trainer_count == trainer_id:
                print("append file: %s" % fn)
                my_file_list.append(fn)
        for fn in my_file_list:
            # Pickle requires binary mode.
            with open(fn, "rb") as f:
                lines = loader(f)
                for line in lines:
                    yield line

    return reader
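

# Example usage (illustrative sketch only; the glob pattern matches files
# produced by split() above, and trainer_count/trainer_id are placeholders):
#
#     reader = cluster_files_reader("imikolov-train-*.pickle",
#                                   trainer_count=4, trainer_id=0)
#     for sample in reader():
#         pass  # consume this trainer's shard of the dataset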


def _check_exists_and_download(path, url, md5, module_name, download=True):
    if path and os.path.exists(path):
        return path

    if download:
        return paddle.dataset.common.download(url, module_name, md5)
    else:
        raise ValueError(
            '{} does not exist and auto download is disabled'.format(path))
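

# Example usage (illustrative sketch only; the path, URL and md5 below are
# hypothetical placeholders, not a real dataset):
#
#     data_path = _check_exists_and_download(
#         "/tmp/my_dataset.tar.gz",
#         "https://example.com/my_dataset.tar.gz",
#         "0123456789abcdef0123456789abcdef",
#         "my_dataset",
#         download=True)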