Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into add-python-pad
commit 3c370ee0d5
@@ -0,0 +1,205 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import argparse
import cProfile  # needed by run_benchmark when --use_cprof is set
import time

import paddle.v2 as paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

SEED = 1
DTYPE = "float32"

# The random seed must be set before configuring the network.
# fluid.default_startup_program().random_seed = SEED


def parse_args():
    parser = argparse.ArgumentParser("mnist model benchmark.")
    parser.add_argument(
        '--batch_size', type=int, default=128, help='The minibatch size.')
    parser.add_argument(
        '--iterations', type=int, default=35, help='The number of minibatches.')
    parser.add_argument(
        '--pass_num', type=int, default=5, help='The number of passes.')
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
    parser.add_argument(
        '--use_cprof', action='store_true', help='If set, use cProfile.')
    parser.add_argument(
        '--use_nvprof',
        action='store_true',
        help='If set, use nvprof for CUDA.')
    args = parser.parse_args()
    return args


def print_arguments(args):
    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
                                vars(args)['device'] == 'GPU')
    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


def cnn_model(data):
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=data,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu")
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu")

    # TODO(dzhwinter): refine the initializer and random seed setting
    SIZE = 10
    input_shape = conv_pool_2.shape
    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5

    predict = fluid.layers.fc(
        input=conv_pool_2,
        size=SIZE,
        act="softmax",
        param_attr=fluid.param_attr.ParamAttr(
            initializer=fluid.initializer.NormalInitializer(
                loc=0.0, scale=scale)))
    return predict


def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)
    test_pass_acc = fluid.average.WeightedAverage()
    for batch_id, data in enumerate(test_reader()):
        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
                                data)).astype(DTYPE)
        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
        y_data = y_data.reshape([len(y_data), 1])

        acc, weight = exe.run(inference_program,
                              feed={"pixel": img_data,
                                    "label": y_data},
                              fetch_list=[batch_acc, batch_size_tensor])
        test_pass_acc.add(value=acc, weight=weight)
    pass_acc = test_pass_acc.eval()
    return pass_acc


def run_benchmark(model, args):
    if args.use_cprof:
        pr = cProfile.Profile()
        pr.enable()
    start_time = time.time()
    # Input data
    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    predict = model(images)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    # Optimization
    opt = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001, beta1=0.9, beta2=0.999)
    opt.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    # Reader
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size)

    accuracy = fluid.average.WeightedAverage()
    for pass_id in range(args.pass_num):
        accuracy.reset()
        pass_start = time.time()
        for batch_id, data in enumerate(train_reader()):
            img_data = np.array(
                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([len(y_data), 1])

            start = time.time()
            outs = exe.run(
                fluid.default_main_program(),
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
            )  # The accuracy is the accumulation of batches, but not the current batch.
            accuracy.add(value=outs[1], weight=outs[2])
            end = time.time()
            loss = np.array(outs[0])
            acc = np.array(outs[1])
            print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
                  (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))

        pass_end = time.time()

        train_avg_acc = accuracy.eval()
        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
                                 inference_program)

        print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
              (pass_id, train_avg_acc, test_avg_acc,
               (pass_end - pass_start) / 1000))


if __name__ == '__main__':
    args = parse_args()
    print_arguments(args)
    if args.use_nvprof and args.device == 'GPU':
        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
            run_benchmark(cnn_model, args)
    else:
        run_benchmark(cnn_model, args)
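Note on the accuracy bookkeeping above: the pass-level accuracy is a weighted average of per-batch accuracies, weighted by the batch sizes fetched through batch_size_tensor, so a short final batch does not distort the result. A minimal NumPy sketch of that bookkeeping (a hypothetical stand-in mirroring fluid.average.WeightedAverage, not part of this diff):

import numpy as np

class WeightedAverageSketch(object):
    """Accumulate (value, weight) pairs and return the weighted mean."""

    def __init__(self):
        self.numerator = 0.0
        self.denominator = 0.0

    def reset(self):
        self.numerator = 0.0
        self.denominator = 0.0

    def add(self, value, weight):
        self.numerator += float(np.asarray(value)) * float(np.asarray(weight))
        self.denominator += float(np.asarray(weight))

    def eval(self):
        return self.numerator / self.denominator

acc_avg = WeightedAverageSketch()
acc_avg.add(value=0.90, weight=128)  # batch 1: 90% accuracy over 128 samples
acc_avg.add(value=0.50, weight=32)   # batch 2: smaller final batch
print(acc_avg.eval())                # ~0.82, not the plain mean of 0.90 and 0.50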
@@ -0,0 +1,49 @@
#!/bin/bash
# This script benchmarks PaddlePaddle Fluid on a single thread
# and a single GPU.
export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib

# Disable OpenMP and MKL parallelism
# https://github.com/PaddlePaddle/Paddle/issues/7199
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
if [ $ht -eq 1 ]; then # HT is OFF
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,0,0"
    fi
    if [ -z "$OMP_DYNAMIC" ]; then
        export OMP_DYNAMIC="FALSE"
    fi
else # HT is ON
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,1,0"
    fi
fi
# Use a single GPU even if more than one is visible
export CUDA_VISIBLE_DEVICES=0
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH


# vgg16
# cifar10 gpu cifar10 128
FLAGS_benchmark=true python fluid/vgg.py \
  --device=GPU \
  --batch_size=128 \
  --skip_batch_num=5 \
  --iterations=30 \
  2>&1 > vgg16_gpu_128.log

# resnet50
# resnet50 gpu cifar10 128
FLAGS_benchmark=true python fluid/resnet.py \
  --device=GPU \
  --batch_size=128 \
  --data_set=cifar10 \
  --model=resnet_cifar10 \
  --skip_batch_num=5 \
  --iterations=30 \
  2>&1 > resnet50_gpu_128.log

# lstm
@@ -0,0 +1,209 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import cPickle
import os
import random
import time

import numpy
import paddle.v2 as paddle
import paddle.v2.dataset.imdb as imdb
import paddle.fluid as fluid
from paddle.v2 import batch
import paddle.fluid.profiler as profiler


def parse_args():
    parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=32,
        help='The number of sequences in a batch. (default: %(default)d)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=512,
        help='Dimension of the embedding table. (default: %(default)d)')
    parser.add_argument(
        '--hidden_dim',
        type=int,
        default=512,
        help='Hidden size of the LSTM unit. (default: %(default)d)')
    parser.add_argument(
        '--pass_num',
        type=int,
        default=100,
        help='Epoch number to train. (default: %(default)d)')
    parser.add_argument(
        '--device',
        type=str,
        default='CPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    parser.add_argument(
        '--crop_size',
        type=int,
        default=int(os.environ.get('CROP_SIZE', '1500')),
        help='The max sentence length of the input. Since this model uses a '
        'plain RNN, the gradient could explode if the sentence is too long.')
    args = parser.parse_args()
    return args


word_dict = imdb.word_dict()


def crop_sentence(reader, crop_size):
    unk_value = word_dict['<unk>']

    def __impl__():
        for item in reader():
            if len([x for x in item[0] if x != unk_value]) < crop_size:
                yield item

    return __impl__


def main():
    args = parse_args()
    lstm_size = args.hidden_dim

    data = fluid.layers.data(
        name="words", shape=[1], lod_level=1, dtype='int64')
    sentence = fluid.layers.embedding(
        input=data, size=[len(word_dict), args.emb_dim])

    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')

    rnn = fluid.layers.DynamicRNN()
    with rnn.block():
        word = rnn.step_input(sentence)
        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])

        def gate_common(
                ipt,
                hidden,
                size, ):
            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
            gate = fluid.layers.sums(input=[gate0, gate1])
            return gate

        forget_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        input_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        output_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        cell_gate = fluid.layers.tanh(
            x=gate_common(word, prev_hidden, lstm_size))

        cell = fluid.layers.sums(input=[
            fluid.layers.elementwise_mul(
                x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
                    x=input_gate, y=cell_gate)
        ])

        hidden = fluid.layers.elementwise_mul(
            x=output_gate, y=fluid.layers.tanh(x=cell))

        rnn.update_memory(prev_cell, cell)
        rnn.update_memory(prev_hidden, hidden)
        rnn.output(hidden)

    last = fluid.layers.sequence_pool(rnn(), 'last')
    logit = fluid.layers.fc(input=last, size=2, act='softmax')
    loss = fluid.layers.cross_entropy(
        input=logit,
        label=fluid.layers.data(
            name='label', shape=[1], dtype='int64'))
    loss = fluid.layers.mean(x=loss)

    # add acc
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
        shape=[1], dtype='int64'), total=batch_size_tensor)

    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    adam = fluid.optimizer.Adam()
    adam.minimize(loss)

    fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    def train_loop(pass_num, crop_size):
        with profiler.profiler(args.device, 'total') as prof:
            for pass_id in range(pass_num):
                train_reader = batch(
                    paddle.reader.shuffle(
                        crop_sentence(imdb.train(word_dict), crop_size),
                        buf_size=25000),
                    batch_size=args.batch_size)
                word_nums = 0
                pass_start_time = time.time()
                for batch_id, data in enumerate(train_reader()):
                    tensor_words = to_lodtensor([x[0] for x in data], place)
                    for x in data:
                        word_nums += len(x[0])
                    label = numpy.array([x[1] for x in data]).astype("int64")
                    label = label.reshape((-1, 1))
                    loss_np, acc, weight = exe.run(
                        fluid.default_main_program(),
                        feed={"words": tensor_words,
                              "label": label},
                        fetch_list=[loss, batch_acc, batch_size_tensor])
                    print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
                          (pass_id, batch_id, loss_np, acc))

                pass_end_time = time.time()
                time_consumed = pass_end_time - pass_start_time
                words_per_sec = word_nums / time_consumed
                print("pass_id=%d, sec/pass: %f, words/s: %f" %
                      (pass_id, time_consumed, words_per_sec))

    train_loop(args.pass_num, args.crop_size)


def to_lodtensor(data, place):
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = numpy.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res


if __name__ == '__main__':
    main()
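The DynamicRNN block above hand-writes a standard LSTM cell out of fc, sigmoid, tanh, sums, and elementwise_mul. A plain NumPy sketch of one step may make the gate wiring easier to follow; the weight layout, dimensions, and the single bias per gate are illustrative only (the fluid code puts the bias on the word projection and omits it on the hidden projection):

import numpy as np

def lstm_step(word_emb, prev_hidden, prev_cell, weights):
    """One step of the hand-written LSTM cell above, in plain NumPy."""
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))

    def gate(name, activation):
        W_x, W_h, b = weights[name]  # hypothetical per-gate parameters
        return activation(word_emb.dot(W_x) + prev_hidden.dot(W_h) + b)

    f = gate('forget', sigmoid)          # forget_gate
    i = gate('input', sigmoid)           # input_gate
    o = gate('output', sigmoid)          # output_gate
    c_tilde = gate('cell', np.tanh)      # cell_gate
    cell = f * prev_cell + i * c_tilde   # sums of the two elementwise_mul terms
    hidden = o * np.tanh(cell)           # elementwise_mul(output_gate, tanh(cell))
    return hidden, cell

# Tiny smoke test with random weights (emb_dim = hidden_dim = 4 for brevity).
rng = np.random.RandomState(0)
weights = {name: (rng.randn(4, 4), rng.randn(4, 4), rng.randn(4))
           for name in ('forget', 'input', 'output', 'cell')}
h, c = lstm_step(rng.randn(4), np.zeros(4), np.zeros(4), weights)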
@@ -0,0 +1,220 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function

import sys
import time
import numpy as np
import paddle.v2 as paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import argparse
import functools

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    '--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
    '--skip_batch_num',
    type=int,
    default=5,
    help='The number of initial minibatches to skip, for a better performance test.')
parser.add_argument(
    '--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
    '--learning_rate',
    type=float,
    default=1e-3,
    help="Learning rate for training.")
parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
parser.add_argument(
    '--device',
    type=str,
    default='GPU',
    choices=['CPU', 'GPU'],
    help="The device type.")
parser.add_argument(
    '--data_format',
    type=str,
    default='NCHW',
    choices=['NCHW', 'NHWC'],
    help='The data order; currently only NCHW is supported.')
parser.add_argument(
    '--data_set',
    type=str,
    default='cifar10',
    choices=['cifar10', 'flowers'],
    help='Optional dataset for benchmark.')
parser.add_argument(
    '--with_test',
    action='store_true',
    help='If set, evaluate the test set during training.')
args = parser.parse_args()


def vgg16_bn_drop(input):
    def conv_block(input, num_filter, groups, dropouts):
        return fluid.nets.img_conv_group(
            input=input,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * groups,
            conv_filter_size=3,
            conv_act='relu',
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
            pool_type='max')

    conv1 = conv_block(input, 64, 2, [0.3, 0])
    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])

    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
    bn = fluid.layers.batch_norm(input=fc1, act='relu')
    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
    return fc2


def main():
    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
            data_shape = [3, 32, 32]
        else:
            data_shape = [32, 32, 3]
    else:
        classdim = 102
        if args.data_format == 'NCHW':
            data_shape = [3, 224, 224]
        else:
            data_shape = [224, 224, 3]

    # Input data
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    net = vgg16_bn_drop(images)
    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    opts = optimizer.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    # data reader
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.cifar.train10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
            buf_size=5120),
        batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
        batch_size=args.batch_size)

    # test
    def test(exe):
        test_accuracy = fluid.average.WeightedAverage()
        for batch_id, data in enumerate(test_reader()):
            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])

            acc, weight = exe.run(inference_program,
                                  feed={"pixel": img_data,
                                        "label": y_data},
                                  fetch_list=[batch_acc, batch_size_tensor])
            test_accuracy.add(value=acc, weight=weight)
        return test_accuracy.eval()

    iters, num_samples, start_time = 0, 0, time.time()
    accuracy = fluid.average.WeightedAverage()
    for pass_id in range(args.pass_num):
        accuracy.reset()
        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])

            loss, acc, weight = exe.run(
                fluid.default_main_program(),
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
            accuracy.add(value=acc, weight=weight)
            iters += 1
            num_samples += len(data)
            print(
                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                (pass_id, iters, loss, acc)
            )  # The accuracy is the accumulation of batches, but not the current batch.

        pass_train_acc = accuracy.eval()
        train_losses.append(loss)
        train_accs.append(acc)
        # evaluation
        if args.with_test:
            pass_test_acc = test(exe)
        train_elapsed = time.time() - start_time
        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))


def print_arguments():
    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


if __name__ == "__main__":
    print_arguments()
    main()
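For readers checking the name: the conv_block calls in vgg16_bn_drop follow the standard VGG-16 configuration, thirteen convolutional layers plus three fully connected layers (fc1 and fc2 above, plus the classifier fc in main). A short sketch of that count, with the block list copied from the code:

# (num_filter, groups) for each conv_block call in vgg16_bn_drop above.
blocks = [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)]
conv_layers = sum(groups for _, groups in blocks)  # 13 convolutional layers
fc_layers = 2 + 1  # fc1, fc2 in vgg16_bn_drop plus the classifier fc in main()
print(conv_layers, conv_layers + fc_layers)        # 13, 16 -> "VGG16"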
@@ -0,0 +1,46 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Dataset package.
"""

import mnist
import imikolov
import imdb
import cifar
import movielens
import conll05
import uci_housing
import sentiment
import wmt14
import wmt16
import mq2007
import flowers
import voc2012

__all__ = [
    'mnist',
    'imikolov',
    'imdb',
    'cifar',
    'movielens',
    'conll05',
    'sentiment',
    'uci_housing',
    'wmt14',
    'wmt16',
    'mq2007',
    'flowers',
    'voc2012',
]
@@ -0,0 +1,139 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CIFAR dataset.

This module will download the dataset from
https://www.cs.toronto.edu/~kriz/cifar.html and parse the train/test set into
paddle reader creators.

The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
with 6000 images per class. There are 50000 training images and 10000 test
images.

The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
containing 600 images each. There are 500 training images and 100 testing
images per class.

"""

import cPickle
import itertools
import numpy
import paddle.v2.dataset.common
import tarfile

__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']

URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'


def reader_creator(filename, sub_name):
    def read_batch(batch):
        data = batch['data']
        labels = batch.get('labels', batch.get('fine_labels', None))
        assert labels is not None
        for sample, label in itertools.izip(data, labels):
            yield (sample / 255.0).astype(numpy.float32), int(label)

    def reader():
        with tarfile.open(filename, mode='r') as f:
            names = (each_item.name for each_item in f
                     if sub_name in each_item.name)

            for name in names:
                batch = cPickle.load(f.extractfile(name))
                for item in read_batch(batch):
                    yield item

    return reader


def train100():
    """
    CIFAR-100 training set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 99].

    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
        'train')


def test100():
    """
    CIFAR-100 test set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 99].

    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
        'test')


def train10():
    """
    CIFAR-10 training set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].

    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
        'data_batch')


def test10():
    """
    CIFAR-10 test set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].

    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
        'test_batch')


def fetch():
    paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
    paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)


def convert(path):
    """
    Converts dataset to recordio format
    """
    paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
    paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
    paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
    paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
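A short usage sketch of the reader creators defined above, following the same pattern the benchmark scripts use (the batch size is illustrative; the first call downloads CIFAR-10 into ~/.cache/paddle/dataset):

import paddle.v2 as paddle

# train10() returns a reader creator; each sample is a (pixels, label) pair
# with pixels as a flat 3*32*32 = 3072-element float32 vector scaled to
# [0, 1] and label in [0, 9].
train_reader = paddle.batch(paddle.dataset.cifar.train10(), batch_size=128)
for batch in train_reader():
    pixels, label = batch[0]
    print("%d %s %d" % (len(batch), pixels.shape, label))
    break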
@@ -0,0 +1,236 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
import hashlib
import os
import errno
import shutil
import sys
import importlib
import paddle.v2.dataset
import cPickle
import glob
import cPickle as pickle

__all__ = [
    'DATA_HOME',
    'download',
    'md5file',
    'split',
    'cluster_files_reader',
    'convert',
]

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')


# When running unit tests, there could be multiple processes trying to create
# the DATA_HOME directory simultaneously, so we cannot use an if condition to
# check for the existence of the directory; instead, we use the filesystem as
# the synchronization mechanism by catching the returned errors.
def must_mkdirs(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
        pass


must_mkdirs(DATA_HOME)


def md5file(fname):
    hash_md5 = hashlib.md5()
    f = open(fname, "rb")
    for chunk in iter(lambda: f.read(4096), b""):
        hash_md5.update(chunk)
    f.close()
    return hash_md5.hexdigest()


def download(url, module_name, md5sum, save_name=None):
    dirname = os.path.join(DATA_HOME, module_name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    filename = os.path.join(dirname,
                            url.split('/')[-1]
                            if save_name is None else save_name)

    retry = 0
    retry_limit = 3
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
        if os.path.exists(filename):
            print "file md5", md5file(filename), md5sum
        if retry < retry_limit:
            retry += 1
        else:
            raise RuntimeError("Cannot download {0} within retry limit {1}".
                               format(url, retry_limit))
        print "Cache file %s not found, downloading %s" % (filename, url)
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')

        if total_length is None:
            with open(filename, 'w') as f:
                shutil.copyfileobj(r.raw, f)
        else:
            with open(filename, 'w') as f:
                dl = 0
                total_length = int(total_length)
                for data in r.iter_content(chunk_size=4096):
                    dl += len(data)
                    f.write(data)
                    done = int(50 * dl / total_length)
                    sys.stdout.write("\r[%s%s]" % ('=' * done,
                                                   ' ' * (50 - done)))
                    sys.stdout.flush()

    return filename


def fetch_all():
    for module_name in filter(lambda x: not x.startswith("__"),
                              dir(paddle.v2.dataset)):
        if "fetch" in dir(
                importlib.import_module("paddle.v2.dataset.%s" % module_name)):
            getattr(
                importlib.import_module("paddle.v2.dataset.%s" % module_name),
                "fetch")()


def fetch_all_recordio(path):
    for module_name in filter(lambda x: not x.startswith("__"),
                              dir(paddle.v2.dataset)):
        if "convert" in dir(
                importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \
                not module_name == "common":
            ds_path = os.path.join(path, module_name)
            must_mkdirs(ds_path)
            getattr(
                importlib.import_module("paddle.v2.dataset.%s" % module_name),
                "convert")(ds_path)


def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
    """
    You can call the function as:

    split(paddle.v2.dataset.cifar.train10(), line_count=1000,
        suffix="imikolov-train-%05d.pickle")

    The output files will be:

    |-imikolov-train-00000.pickle
    |-imikolov-train-00001.pickle
    |- ...
    |-imikolov-train-00480.pickle

    :param reader: a reader creator
    :param line_count: line count for each file
    :param suffix: the suffix for the output files; should contain "%d",
        which is replaced by the index of each file. Default is "%05d.pickle"
    :param dumper: a callable that dumps an object to a file; it is called
        as dumper(obj, f), where obj is the object to be dumped and f is a
        file object. Default is cPickle.dump.
    """
    if not callable(dumper):
        raise TypeError("dumper should be callable.")
    lines = []
    indx_f = 0
    for i, d in enumerate(reader()):
        lines.append(d)
        if i >= line_count and i % line_count == 0:
            with open(suffix % indx_f, "w") as f:
                dumper(lines, f)
            lines = []
            indx_f += 1
    if lines:
        with open(suffix % indx_f, "w") as f:
            dumper(lines, f)


def cluster_files_reader(files_pattern,
                         trainer_count,
                         trainer_id,
                         loader=cPickle.load):
    """
    Create a reader that yields elements from the given files, selecting
    a subset of the files according to the trainer count and trainer_id.

    :param files_pattern: the glob pattern of files generated by split(...)
    :param trainer_count: total trainer count
    :param trainer_id: the trainer rank id
    :param loader: a callable that loads an object from a file; it is called
        as loader(f), where f is a file object. Default is cPickle.load.
    """

    def reader():
        if not callable(loader):
            raise TypeError("loader should be callable.")
        file_list = glob.glob(files_pattern)
        file_list.sort()
        my_file_list = []
        for idx, fn in enumerate(file_list):
            if idx % trainer_count == trainer_id:
                print "append file: %s" % fn
                my_file_list.append(fn)
        for fn in my_file_list:
            with open(fn, "r") as f:
                lines = loader(f)
                for line in lines:
                    yield line

    return reader


def convert(output_path, reader, line_count, name_prefix):
    """
    Convert data from reader to recordio format files.

    :param output_path: directory in which output files will be saved.
    :param reader: a data reader, from which the convert program will read
        data instances.
    :param line_count: the number of data instances written to each output
        file.
    :param name_prefix: the name prefix of generated files.
    """
    import recordio

    assert line_count >= 1
    indx_f = 0

    def write_data(indx_f, lines):
        filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
        writer = recordio.writer(filename)
        for l in lines:
            # FIXME(Yancey1989):
            # dumps with protocol: pickle.HIGHEST_PROTOCOL
            writer.write(cPickle.dumps(l))
        writer.close()

    lines = []
    for i, d in enumerate(reader()):
        lines.append(d)
        if i % line_count == 0 and i >= line_count:
            write_data(indx_f, lines)
            lines = []
            indx_f += 1
            continue

    write_data(indx_f, lines)
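A brief usage sketch tying split and cluster_files_reader together, following the example in the split docstring (the file pattern and trainer counts are illustrative):

import paddle.v2.dataset.cifar as cifar
import paddle.v2.dataset.common as common

# Shard the CIFAR-10 training set into pickle files of 1000 samples each,
# then let trainer 0 of a 4-trainer job read back only its share of the shards.
common.split(cifar.train10(), line_count=1000, suffix="cifar-train-%05d.pickle")

reader = common.cluster_files_reader(
    "cifar-train-*.pickle", trainer_count=4, trainer_id=0)
for pixels, label in reader():
    break  # each element is one (pixels, label) sample read from the shard files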