Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into add-python-pad
commit 3c370ee0d5
@@ -0,0 +1,205 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import argparse
import cProfile
import time

import paddle.v2 as paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

SEED = 1
DTYPE = "float32"

# The random seed must be set before configuring the network.
# fluid.default_startup_program().random_seed = SEED


def parse_args():
    parser = argparse.ArgumentParser("mnist model benchmark.")
    parser.add_argument(
        '--batch_size', type=int, default=128, help='The minibatch size.')
    parser.add_argument(
        '--iterations', type=int, default=35, help='The number of minibatches.')
    parser.add_argument(
        '--pass_num', type=int, default=5, help='The number of passes.')
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
    parser.add_argument(
        '--use_cprof', action='store_true', help='If set, use cProfile.')
    parser.add_argument(
        '--use_nvprof',
        action='store_true',
        help='If set, use nvprof for CUDA.')
    args = parser.parse_args()
    return args


def print_arguments(args):
    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
                                vars(args)['device'] == 'GPU')
    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


def cnn_model(data):
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=data,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu")
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu")

    # TODO(dzhwinter): refine the initializer and random seed setting
    SIZE = 10
    input_shape = conv_pool_2.shape
    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5

    predict = fluid.layers.fc(
        input=conv_pool_2,
        size=SIZE,
        act="softmax",
        param_attr=fluid.param_attr.ParamAttr(
            initializer=fluid.initializer.NormalInitializer(
                loc=0.0, scale=scale)))
    return predict


def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)
    test_pass_acc = fluid.average.WeightedAverage()
    for batch_id, data in enumerate(test_reader()):
        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
                                data)).astype(DTYPE)
        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
        y_data = y_data.reshape([len(y_data), 1])

        acc, weight = exe.run(inference_program,
                              feed={"pixel": img_data,
                                    "label": y_data},
                              fetch_list=[batch_acc, batch_size_tensor])
        test_pass_acc.add(value=acc, weight=weight)
    pass_acc = test_pass_acc.eval()
    return pass_acc


def run_benchmark(model, args):
    if args.use_cprof:
        pr = cProfile.Profile()
        pr.enable()
    start_time = time.time()
    # Input data
    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    predict = model(images)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    # Optimization
    opt = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001, beta1=0.9, beta2=0.999)
    opt.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    # Reader
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size)

    accuracy = fluid.average.WeightedAverage()
    for pass_id in range(args.pass_num):
        accuracy.reset()
        pass_start = time.time()
        for batch_id, data in enumerate(train_reader()):
            img_data = np.array(
                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([len(y_data), 1])

            start = time.time()
            outs = exe.run(
                fluid.default_main_program(),
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
            )  # The accuracy is the accumulation of batches, but not the current batch.
            accuracy.add(value=outs[1], weight=outs[2])
            end = time.time()
            loss = np.array(outs[0])
            acc = np.array(outs[1])
            print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
                  (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))

        pass_end = time.time()

        train_avg_acc = accuracy.eval()
        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
                                 inference_program)

        print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
              (pass_id, train_avg_acc, test_avg_acc,
               (pass_end - pass_start) / 1000))


if __name__ == '__main__':
    args = parse_args()
    print_arguments(args)
    if args.use_nvprof and args.device == 'GPU':
        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
            run_benchmark(cnn_model, args)
    else:
        run_benchmark(cnn_model, args)
@@ -0,0 +1,49 @@
#!/bin/bash
# This script benchmarks PaddlePaddle Fluid on a single thread and a single GPU.
export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib

# disable openmp and mkl parallelism
# https://github.com/PaddlePaddle/Paddle/issues/7199
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
if [ $ht -eq 1 ]; then # HT is OFF
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,0,0"
    fi
    if [ -z "$OMP_DYNAMIC" ]; then
        export OMP_DYNAMIC="FALSE"
    fi
else # HT is ON
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,1,0"
    fi
fi
# use only one GPU even if more are available
export CUDA_VISIBLE_DEVICES=0
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH


# vgg16
# cifar10 gpu cifar10 128
FLAGS_benchmark=true python fluid/vgg.py \
    --device=GPU \
    --batch_size=128 \
    --skip_batch_num=5 \
    --iterations=30 \
    > vgg16_gpu_128.log 2>&1

# resnet50
# resnet50 gpu cifar10 128
FLAGS_benchmark=true python fluid/resnet.py \
    --device=GPU \
    --batch_size=128 \
    --data_set=cifar10 \
    --model=resnet_cifar10 \
    --skip_batch_num=5 \
    --iterations=30 \
    > resnet50_gpu_128.log 2>&1

# lstm
@@ -0,0 +1,209 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import cPickle
import os
import random
import time

import numpy
import paddle.v2 as paddle
import paddle.v2.dataset.imdb as imdb
import paddle.fluid as fluid
from paddle.v2 import batch
import paddle.fluid.profiler as profiler


def parse_args():
    parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=32,
        help='The sequence number of a batch data. (default: %(default)d)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=512,
        help='Dimension of embedding table. (default: %(default)d)')
    parser.add_argument(
        '--hidden_dim',
        type=int,
        default=512,
        help='Hidden size of lstm unit. (default: %(default)d)')
    parser.add_argument(
        '--pass_num',
        type=int,
        default=100,
        help='Epoch number to train. (default: %(default)d)')
    parser.add_argument(
        '--device',
        type=str,
        default='CPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    parser.add_argument(
        '--crop_size',
        type=int,
        default=int(os.environ.get('CROP_SIZE', '1500')),
        help='The max sentence length of the input. Since this model uses a '
        'plain RNN, the gradient may explode if the sentence is too long.')
    args = parser.parse_args()
    return args


word_dict = imdb.word_dict()


def crop_sentence(reader, crop_size):
    unk_value = word_dict['<unk>']

    def __impl__():
        for item in reader():
            if len([x for x in item[0] if x != unk_value]) < crop_size:
                yield item

    return __impl__


def main():
    args = parse_args()
    lstm_size = args.hidden_dim

    data = fluid.layers.data(
        name="words", shape=[1], lod_level=1, dtype='int64')
    sentence = fluid.layers.embedding(
        input=data, size=[len(word_dict), args.emb_dim])

    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')

    rnn = fluid.layers.DynamicRNN()
    with rnn.block():
        word = rnn.step_input(sentence)
        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])

        def gate_common(ipt, hidden, size):
            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
            gate = fluid.layers.sums(input=[gate0, gate1])
            return gate

        forget_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        input_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        output_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        cell_gate = fluid.layers.tanh(
            x=gate_common(word, prev_hidden, lstm_size))

        cell = fluid.layers.sums(input=[
            fluid.layers.elementwise_mul(x=forget_gate, y=prev_cell),
            fluid.layers.elementwise_mul(x=input_gate, y=cell_gate)
        ])

        hidden = fluid.layers.elementwise_mul(
            x=output_gate, y=fluid.layers.tanh(x=cell))

        rnn.update_memory(prev_cell, cell)
        rnn.update_memory(prev_hidden, hidden)
        rnn.output(hidden)

    last = fluid.layers.sequence_pool(rnn(), 'last')
    logit = fluid.layers.fc(input=last, size=2, act='softmax')
    loss = fluid.layers.cross_entropy(
        input=logit,
        label=fluid.layers.data(
            name='label', shape=[1], dtype='int64'))
    loss = fluid.layers.mean(x=loss)

    # add acc
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=logit,
        label=fluid.layers.data(
            name='label', shape=[1], dtype='int64'),
        total=batch_size_tensor)

    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    adam = fluid.optimizer.Adam()
    adam.minimize(loss)

    fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    def train_loop(pass_num, crop_size):
        with profiler.profiler(args.device, 'total') as prof:
            for pass_id in range(pass_num):
                train_reader = batch(
                    paddle.reader.shuffle(
                        crop_sentence(imdb.train(word_dict), crop_size),
                        buf_size=25000),
                    batch_size=args.batch_size)
                word_nums = 0
                pass_start_time = time.time()
                for batch_id, data in enumerate(train_reader()):
                    tensor_words = to_lodtensor([x[0] for x in data], place)
                    for x in data:
                        word_nums += len(x[0])
                    label = numpy.array([x[1] for x in data]).astype("int64")
                    label = label.reshape((-1, 1))
                    loss_np, acc, weight = exe.run(
                        fluid.default_main_program(),
                        feed={"words": tensor_words,
                              "label": label},
                        fetch_list=[loss, batch_acc, batch_size_tensor])
                    print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
                          (pass_id, batch_id, loss_np, acc))

                pass_end_time = time.time()
                time_consumed = pass_end_time - pass_start_time
                words_per_sec = word_nums / time_consumed
                print("pass_id=%d, sec/pass: %f, words/s: %f" %
                      (pass_id, time_consumed, words_per_sec))

    train_loop(args.pass_num, args.crop_size)


def to_lodtensor(data, place):
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = numpy.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res


if __name__ == '__main__':
    main()
@@ -0,0 +1,220 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function

import sys
import time
import numpy as np
import paddle.v2 as paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import argparse
import functools

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    '--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
    '--skip_batch_num',
    type=int,
    default=5,
    help='The number of initial minibatches to skip, for a better performance test.')
parser.add_argument(
    '--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
    '--learning_rate',
    type=float,
    default=1e-3,
    help="Learning rate for training.")
parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
parser.add_argument(
    '--device',
    type=str,
    default='GPU',
    choices=['CPU', 'GPU'],
    help="The device type.")
parser.add_argument(
    '--data_format',
    type=str,
    default='NCHW',
    choices=['NCHW', 'NHWC'],
    help='The data order; currently only NCHW is supported.')
parser.add_argument(
    '--data_set',
    type=str,
    default='cifar10',
    choices=['cifar10', 'flowers'],
    help='Optional dataset for benchmark.')
parser.add_argument(
    '--with_test',
    action='store_true',
    help='If set, test the testset during training.')
args = parser.parse_args()


def vgg16_bn_drop(input):
    def conv_block(input, num_filter, groups, dropouts):
        return fluid.nets.img_conv_group(
            input=input,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * groups,
            conv_filter_size=3,
            conv_act='relu',
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
            pool_type='max')

    conv1 = conv_block(input, 64, 2, [0.3, 0])
    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])

    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
    bn = fluid.layers.batch_norm(input=fc1, act='relu')
    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
    return fc2


def main():
    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
            data_shape = [3, 32, 32]
        else:
            data_shape = [32, 32, 3]
    else:
        classdim = 102
        if args.data_format == 'NCHW':
            data_shape = [3, 224, 224]
        else:
            data_shape = [224, 224, 3]

    # Input data
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    net = vgg16_bn_drop(images)
    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    opts = optimizer.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    # data reader
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.cifar.train10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
            buf_size=5120),
        batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
        batch_size=args.batch_size)

    # test
    def test(exe):
        test_accuracy = fluid.average.WeightedAverage()
        for batch_id, data in enumerate(test_reader()):
            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])

            acc, weight = exe.run(inference_program,
                                  feed={"pixel": img_data,
                                        "label": y_data},
                                  fetch_list=[batch_acc, batch_size_tensor])
            test_accuracy.add(value=acc, weight=weight)
        return test_accuracy.eval()

    iters, num_samples, start_time = 0, 0, time.time()
    accuracy = fluid.average.WeightedAverage()
    for pass_id in range(args.pass_num):
        accuracy.reset()
        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])

            loss, acc, weight = exe.run(
                fluid.default_main_program(),
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
            accuracy.add(value=acc, weight=weight)
            iters += 1
            num_samples += len(data)
            print(
                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                (pass_id, iters, loss, acc)
            )  # The accuracy is the accumulation of batches, but not the current batch.

        pass_train_acc = accuracy.eval()
        train_losses.append(loss)
        train_accs.append(acc)
        # evaluation
        if args.with_test:
            pass_test_acc = test(exe)
        train_elapsed = time.time() - start_time
        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))


def print_arguments():
    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


if __name__ == "__main__":
    print_arguments()
    main()
@@ -1,5 +1,143 @@
############################
Install, Build and Unit test
############################
.. _install_faq:

TBD
###############################
Compile, Install, and Unit Test
###############################

.. contents::

1. Insufficient CUDA driver version
----------------------------------------------------------------

Users often hit errors like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The usual cause is that the local CUDA driver has not been mapped into the container.
You can solve the issue by running the following commands:

.. code-block:: bash

    $ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
    $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu

For more information about Docker's installation and usage, please refer to the `PaddlePaddle Docker documentation <http://www.paddlepaddle.org/docs/0.11.0/documentation/zh/getstarted/build_and_install/docker_install_en.html>`_ .


2. Version mismatch between PythonLibs and PythonInterpreter
----------------------------------------------------------------

This is a common problem when CMake looks up Python. If you have installed multiple versions of Python, CMake may find mismatched versions of PythonLibs and PythonInterpreter. In that case you need to specify the Python version explicitly, as follows:

.. code-block:: bash

    cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path> -DPYTHON_INCLUDE_DIR=<inc_path>

Replace ``<exc_path>``, ``<lib_path>``, and ``<inc_path>`` with your local paths.

3. PaddlePaddle version is 0.0.0
------------------------------------------------

This issue appears when you run :code:`paddle version` or :code:`cmake ..`:

.. code-block:: bash

    CMake Warning at cmake/version.cmake:20 (message):
      Cannot add paddle version from git tag

You should pull all remote branches to your local machine with the command :code:`git fetch upstream` and then run :code:`cmake` again.

4. paddlepaddle\*.whl is not a supported wheel on this platform.
------------------------------------------------------------------------

The primary cause of this issue is that pip cannot find a PaddlePaddle installation package that matches your current system. The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12, with Python 2.7 and pip 9.0.1.

You can upgrade pip with the following command:

.. code-block:: bash

    pip install --upgrade pip

If that does not work, run the command :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to get the platform tags your system supports, and compare them with the tag of your installation package.

If the system supports :code:`linux_x86_64` and the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest version.

If the system supports :code:`manylinux1_x86_64` and the local installation package is :code:`linux_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again.
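
For example, assuming the downloaded wheel is named :code:`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl` (a hypothetical file name; the actual name depends on the version you downloaded), the rename and retry could look like this:

.. code-block:: bash

    # rename the platform tag from linux_x86_64 to manylinux1_x86_64
    mv paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl
    # install the renamed wheel
    pip install paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl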

5. ImportError: No module named v2
----------------------------------

Please uninstall Paddle V1 if you have installed it before:

.. code-block:: bash

    pip uninstall py_paddle paddle

Then install the Python package for PaddlePaddle: enter the build directory and run the following command:

.. code-block:: bash

    pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl

6. Illegal instruction
-----------------------

This issue is usually caused by running a PaddlePaddle binary that was built with AVX SIMD instructions (used to speed up CPU computation) on a CPU that does not support AVX. Please choose the binary version that matches your CPU.
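
You can check whether your CPU supports AVX with standard Linux tools; a minimal sketch:

.. code-block:: bash

    # prints the number of logical CPUs whose flags include avx; 0 means AVX is not supported
    grep -c avx /proc/cpuinfo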

7. Python unittest fails
--------------------------------

If the following Python unittest testcases fail:

.. code-block:: bash

    24 - test_PyDataProvider (Failed)
    26 - test_RecurrentGradientMachine (Failed)
    27 - test_NetworkCompare (Failed)
    28 - test_PyDataProvider2 (Failed)
    32 - test_Prediction (Failed)
    33 - test_Compare (Failed)
    34 - test_Trainer (Failed)
    35 - test_TrainerOnePass (Failed)
    36 - test_CompareTwoNets (Failed)
    37 - test_CompareTwoOpts (Failed)
    38 - test_CompareSparse (Failed)
    39 - test_recurrent_machine_generation (Failed)
    40 - test_PyDataProviderWrapper (Failed)
    41 - test_config_parser (Failed)
    42 - test_swig_api (Failed)
    43 - layers_test (Failed)

please check the PaddlePaddle unittest logs, which may suggest the following:

.. code-block:: bash

    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.

The solution is:

* Remove the old PaddlePaddle installation to give the unit tests a clean environment. If the PaddlePaddle package is already in Python's site-packages, the unit tests will import the package from site-packages instead of the one in the :code:`/python` directory of the source tree. Setting :code:`PYTHONPATH` to :code:`/python` does not help either, because Python's search path gives priority to the installed package. A sketch of this cleanup is shown below.
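
A minimal sketch of preparing a clean environment before re-running the unit tests (assuming the old packages were installed with pip):

.. code-block:: bash

    # remove previously installed paddle packages
    pip uninstall -y py_paddle paddle
    # if this still succeeds, an installed copy remains and will shadow the source tree
    python -c "import paddle" && echo "paddle is still importable from site-packages"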

8. Failed to download the MKLML library
----------------------------------------------

.. code-block:: bash

    make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4
    make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2
    make[1]: *** waiting for the unfinished jobs....

Cause: a slow network connection or SSL problems cause the download of the MKLML library to fail.

Solution: download and install it manually; the specific steps are as follows.

.. code-block:: bash

    # 1. enter the directory
    cd build/third_party/mklml/src/extern_mklml

    # 2. check the size of the package; it is normally about 75M, and a smaller file means the download failed
    du -sh mklml_lnx_2018.0.1.20171007.tgz

    # 3. manually download, unzip, and create the download-success tag
    wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz
    tar zxf mklml_lnx_2018.0.1.20171007.tgz
    touch ../extern_mklml-stamp/extern_mklml-download

    # 4. then compile again
@@ -0,0 +1,46 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Dataset package.
"""

import mnist
import imikolov
import imdb
import cifar
import movielens
import conll05
import uci_housing
import sentiment
import wmt14
import wmt16
import mq2007
import flowers
import voc2012

__all__ = [
    'mnist',
    'imikolov',
    'imdb',
    'cifar',
    'movielens',
    'conll05',
    'sentiment',
    'uci_housing',
    'wmt14',
    'wmt16',
    'mq2007',
    'flowers',
    'voc2012',
]
@@ -0,0 +1,139 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CIFAR dataset.

This module will download the dataset from
https://www.cs.toronto.edu/~kriz/cifar.html and parse the train/test set into
paddle reader creators.

The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
with 6000 images per class. There are 50000 training images and 10000 test
images.

The CIFAR-100 dataset is just like CIFAR-10, except it has 100 classes
containing 600 images each. There are 500 training images and 100 testing
images per class.

"""

import cPickle
import itertools
import numpy
import paddle.v2.dataset.common
import tarfile

__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']

URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'


def reader_creator(filename, sub_name):
    def read_batch(batch):
        data = batch['data']
        labels = batch.get('labels', batch.get('fine_labels', None))
        assert labels is not None
        for sample, label in itertools.izip(data, labels):
            yield (sample / 255.0).astype(numpy.float32), int(label)

    def reader():
        with tarfile.open(filename, mode='r') as f:
            names = (each_item.name for each_item in f
                     if sub_name in each_item.name)

            for name in names:
                batch = cPickle.load(f.extractfile(name))
                for item in read_batch(batch):
                    yield item

    return reader


def train100():
    """
    CIFAR-100 training set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 99].

    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
        'train')


def test100():
    """
    CIFAR-100 test set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 99].

    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
        'test')


def train10():
    """
    CIFAR-10 training set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].

    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
        'data_batch')


def test10():
    """
    CIFAR-10 test set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].

    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
        'test_batch')


def fetch():
    paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
    paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)


def convert(path):
    """
    Converts dataset to recordio format
    """
    paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
    paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
    paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
    paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
@@ -0,0 +1,236 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
import hashlib
import os
import errno
import shutil
import sys
import importlib
import paddle.v2.dataset
import cPickle
import glob
import cPickle as pickle

__all__ = [
    'DATA_HOME',
    'download',
    'md5file',
    'split',
    'cluster_files_reader',
    'convert',
]

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')


# When running unit tests, there could be multiple processes trying to
# create the DATA_HOME directory simultaneously, so we cannot use an if
# condition to check for the existence of the directory; instead, we use
# the filesystem as the synchronization mechanism by catching returned
# errors.
def must_mkdirs(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
        pass


must_mkdirs(DATA_HOME)


def md5file(fname):
    hash_md5 = hashlib.md5()
    f = open(fname, "rb")
    for chunk in iter(lambda: f.read(4096), b""):
        hash_md5.update(chunk)
    f.close()
    return hash_md5.hexdigest()


def download(url, module_name, md5sum, save_name=None):
    dirname = os.path.join(DATA_HOME, module_name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    filename = os.path.join(dirname,
                            url.split('/')[-1]
                            if save_name is None else save_name)

    retry = 0
    retry_limit = 3
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
        if os.path.exists(filename):
            print "file md5", md5file(filename), md5sum
        if retry < retry_limit:
            retry += 1
        else:
            raise RuntimeError("Cannot download {0} within retry limit {1}".
                               format(url, retry_limit))
        print "Cache file %s not found, downloading %s" % (filename, url)
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')

        if total_length is None:
            with open(filename, 'w') as f:
                shutil.copyfileobj(r.raw, f)
        else:
            with open(filename, 'w') as f:
                dl = 0
                total_length = int(total_length)
                for data in r.iter_content(chunk_size=4096):
                    dl += len(data)
                    f.write(data)
                    done = int(50 * dl / total_length)
                    sys.stdout.write("\r[%s%s]" % ('=' * done,
                                                   ' ' * (50 - done)))
                    sys.stdout.flush()

    return filename


def fetch_all():
    for module_name in filter(lambda x: not x.startswith("__"),
                              dir(paddle.v2.dataset)):
        if "fetch" in dir(
                importlib.import_module("paddle.v2.dataset.%s" % module_name)):
            getattr(
                importlib.import_module("paddle.v2.dataset.%s" % module_name),
                "fetch")()


def fetch_all_recordio(path):
    for module_name in filter(lambda x: not x.startswith("__"),
                              dir(paddle.v2.dataset)):
        if "convert" in dir(
                importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \
                not module_name == "common":
            ds_path = os.path.join(path, module_name)
            must_mkdirs(ds_path)
            getattr(
                importlib.import_module("paddle.v2.dataset.%s" % module_name),
                "convert")(ds_path)


def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
    """
    You can call the function as:

    split(paddle.v2.dataset.cifar.train10(), line_count=1000,
        suffix="imikolov-train-%05d.pickle")

    The output files are:

    |-imikolov-train-00000.pickle
    |-imikolov-train-00001.pickle
    |- ...
    |-imikolov-train-00480.pickle

    :param reader: is a reader creator
    :param line_count: line count for each file
    :param suffix: the suffix for the output files; should contain "%d",
        which is the id of each file. Default is "%05d.pickle"
    :param dumper: a callable that dumps an object to a file. It will be
        called as dumper(obj, f), where obj is the object to be dumped and
        f is a file object. Default is cPickle.dump.
    """
    if not callable(dumper):
        raise TypeError("dumper should be callable.")
    lines = []
    indx_f = 0
    for i, d in enumerate(reader()):
        lines.append(d)
        if i >= line_count and i % line_count == 0:
            with open(suffix % indx_f, "w") as f:
                dumper(lines, f)
                lines = []
                indx_f += 1
    if lines:
        with open(suffix % indx_f, "w") as f:
            dumper(lines, f)


def cluster_files_reader(files_pattern,
                         trainer_count,
                         trainer_id,
                         loader=cPickle.load):
    """
    Create a reader that yields elements from the given files, selecting
    a file subset according to the trainer count and trainer_id.

    :param files_pattern: the files generated by split(...)
    :param trainer_count: total trainer count
    :param trainer_id: the trainer rank id
    :param loader: a callable that loads an object from a file. It will be
        called as loader(f), where f is a file object.
        Default is cPickle.load
    """

    def reader():
        if not callable(loader):
            raise TypeError("loader should be callable.")
        file_list = glob.glob(files_pattern)
        file_list.sort()
        my_file_list = []
        for idx, fn in enumerate(file_list):
            if idx % trainer_count == trainer_id:
                print "append file: %s" % fn
                my_file_list.append(fn)
        for fn in my_file_list:
            with open(fn, "r") as f:
                lines = loader(f)
                for line in lines:
                    yield line

    return reader


def convert(output_path, reader, line_count, name_prefix):
    import recordio
    """
    Convert data from reader to recordio format files.

    :param output_path: directory in which output files will be saved.
    :param reader: a data reader, from which the convert program will read
        data instances.
    :param line_count: the number of instances written per output file.
    :param name_prefix: the name prefix of generated files.
    """

    assert line_count >= 1
    indx_f = 0

    def write_data(indx_f, lines):
        filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
        writer = recordio.writer(filename)
        for l in lines:
            # FIXME(Yancey1989):
            # dumps with protocol: pickle.HIGHEST_PROTOCOL
            writer.write(cPickle.dumps(l))
        writer.close()

    lines = []
    for i, d in enumerate(reader()):
        lines.append(d)
        if i % line_count == 0 and i >= line_count:
            write_data(indx_f, lines)
            lines = []
            indx_f += 1
            continue

    write_data(indx_f, lines)