# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import errno
import glob
import hashlib
import importlib
import os
import shutil
import sys

import requests
import six
import six.moves.cPickle as pickle

import paddle.dataset

__all__ = [
    'DATA_HOME',
    'download',
    'md5file',
    'split',
    'cluster_files_reader',
]

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')


# When running unit tests, there could be multiple processes trying to
# create the DATA_HOME directory simultaneously, so we cannot use an if
# condition to check for the existence of the directory; instead, we use
# the filesystem as the synchronization mechanism by catching the errors
# it returns.
def must_mkdirs(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        # Another process may have created the directory first; that is fine.
        if exc.errno != errno.EEXIST:
            raise


must_mkdirs(DATA_HOME)


def md5file(fname):
    """Return the hexadecimal MD5 digest of the file at `fname`."""
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        # Read in 4 KiB chunks so large files need not fit in memory.
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
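
# A minimal usage sketch (the path below is hypothetical):
#
#   digest = md5file("/tmp/example.bin")
#   print(digest)  # a 32-character hex string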


def download(url, module_name, md5sum, save_name=None):
    """Download `url` into `DATA_HOME/module_name` and verify its MD5 sum.

    The file is cached: if a file with a matching md5sum already exists,
    its path is returned without re-downloading. Otherwise the download is
    retried up to three times before raising RuntimeError. If `save_name`
    is given it overrides the file name taken from the URL.
    """
    dirname = os.path.join(DATA_HOME, module_name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    filename = os.path.join(dirname,
                            url.split('/')[-1]
                            if save_name is None else save_name)

    if os.path.exists(filename) and md5file(filename) == md5sum:
        return filename

    retry = 0
    retry_limit = 3
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
        if os.path.exists(filename):
            sys.stderr.write("file %s md5 %s\n" % (md5file(filename), md5sum))
        if retry < retry_limit:
            retry += 1
        else:
            raise RuntimeError("Cannot download {0} within retry limit {1}".
                               format(url, retry_limit))
        sys.stderr.write("Cache file %s not found, downloading %s\n" %
                         (filename, url))
        sys.stderr.write("Begin to download\n")
        try:
            r = requests.get(url, stream=True)
            total_length = r.headers.get('content-length')

            if total_length is None:
                # No content-length header: stream the body straight to disk.
                with open(filename, 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
            else:
                with open(filename, 'wb') as f:
                    chunk_size = 4096
                    total_length = int(total_length)
                    # Use integer division so the progress math also works
                    # on Python 3, where `/` yields a float.
                    total_iter = total_length // chunk_size + 1
                    log_interval = total_iter // 20 if total_iter > 20 else 1
                    log_index = 0
                    for data in r.iter_content(chunk_size=chunk_size):
                        if six.PY2:
                            data = six.b(data)
                        f.write(data)
                        log_index += 1
                        if log_index % log_interval == 0:
                            # Print a progress dot roughly every 5%.
                            sys.stderr.write(".")
                            sys.stderr.flush()
        except Exception as e:
            # Log the failure and retry from the top of the loop.
            sys.stderr.write("Download failed: %s, retrying\n" % str(e))
            continue
        sys.stderr.write("\nDownload finished\n")
        sys.stderr.flush()
    return filename
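
# A minimal usage sketch (URL, module name, and md5 below are hypothetical):
#
#   path = download("https://example.com/data/train.tar.gz", "demo",
#                   "0123456789abcdef0123456789abcdef")
#   print(path)  # e.g. ~/.cache/paddle/dataset/demo/train.tar.gz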


def fetch_all():
    # Import every public paddle.dataset sub-module and, if it defines a
    # `fetch` function, call it to pre-download that dataset.
    for module_name in [
            x for x in dir(paddle.dataset) if not x.startswith("__")
    ]:
        if "fetch" in dir(
                importlib.import_module("paddle.dataset.%s" % module_name)):
            getattr(
                importlib.import_module("paddle.dataset.%s" % module_name),
                "fetch")()


def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump):
    """
    Split the records produced by a reader into multiple files.

    You can call the function as:

    split(paddle.dataset.cifar.train10(), line_count=1000,
          suffix="imikolov-train-%05d.pickle")

    The output files will be:

    |-imikolov-train-00000.pickle
    |-imikolov-train-00001.pickle
    |- ...
    |-imikolov-train-00480.pickle

    :param reader: is a reader creator
    :param line_count: line count for each file
    :param suffix: the suffix for the output files, should contain "%d",
                which is replaced by the id of each file.
                Default is "%05d.pickle".
    :param dumper: is a callable function that dumps an object to a file.
                It will be called as dumper(obj, f), where obj is the
                object to be dumped and f is a file object.
                Default is cPickle.dump.
    """
    if not callable(dumper):
        raise TypeError("dumper should be callable.")
    lines = []
    indx_f = 0
    for i, d in enumerate(reader()):
        lines.append(d)
        # Flush a full batch of `line_count` records into its own file.
        if (i + 1) % line_count == 0:
            # Pickle requires binary mode on Python 3.
            with open(suffix % indx_f, "wb") as f:
                dumper(lines, f)
            lines = []
            indx_f += 1
    if lines:
        with open(suffix % indx_f, "wb") as f:
            dumper(lines, f)


def cluster_files_reader(files_pattern,
                         trainer_count,
                         trainer_id,
                         loader=pickle.load):
    """
    Create a reader that yields elements from the given files, selecting
    a subset of the files according to trainer_count and trainer_id.

    :param files_pattern: the files which were generated by split(...)
    :param trainer_count: total trainer count
    :param trainer_id: the trainer rank id
    :param loader: is a callable function that loads an object from a file.
                It will be called as loader(f), where f is a file object.
                Default is cPickle.load.
    """

    def reader():
        if not callable(loader):
            raise TypeError("loader should be callable.")
        file_list = glob.glob(files_pattern)
        file_list.sort()
        my_file_list = []
        # Shard the sorted file list round-robin across trainers.
        for idx, fn in enumerate(file_list):
            if idx % trainer_count == trainer_id:
                print("append file: %s" % fn)
                my_file_list.append(fn)
        for fn in my_file_list:
            # Pickle requires binary mode on Python 3.
            with open(fn, "rb") as f:
                lines = loader(f)
                for line in lines:
                    yield line

    return reader
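
# A minimal round-trip sketch (names and trainer layout are hypothetical):
#
#   split(some_reader, line_count=1000, suffix="train-%05d.pickle")
#   reader = cluster_files_reader("train-*.pickle",
#                                 trainer_count=4, trainer_id=0)
#   for record in reader():
#       ...  # trainer 0 reads files 0, 4, 8, ... of the sorted list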


def _check_exists_and_download(path, url, md5, module_name, download=True):
    # Return `path` when the caller supplied an existing file; otherwise
    # download the dataset, unless auto-download has been disabled.
    if path and os.path.exists(path):
        return path

    if download:
        return paddle.dataset.common.download(url, module_name, md5)
    else:
        raise ValueError(
            '{} does not exist and auto download is disabled'.format(path))
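
# A minimal usage sketch (URL and md5 below are hypothetical):
#
#   data_file = _check_exists_and_download(
#       None, "https://example.com/data/train.tar.gz",
#       "0123456789abcdef0123456789abcdef", "demo", download=True)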