Merge branch 'develop' into core_inference_prepare

wangkuiyi-patch-2
Liu Yiqun 7 years ago
commit bf485999f4

@ -48,6 +48,13 @@ parser.add_argument(
type=int,
default=16,
help="The sequence number of a mini-batch data. (default: %(default)d)")
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The number of initial mini-batches to skip, so that warm-up is excluded from the performance measurement')
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
"--dict_size",
type=int,
@ -72,16 +79,21 @@ parser.add_argument(
default=3,
help="The width for beam searching. (default: %(default)d)")
parser.add_argument(
"--use_gpu",
type=distutils.util.strtobool,
default=True,
help="Whether to use gpu. (default: %(default)d)")
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument(
"--max_length",
type=int,
default=250,
help="The maximum length of sequence when doing generation. "
"(default: %(default)d)")
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
@ -281,7 +293,7 @@ def train():
paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = Executor(place)
exe.run(framework.default_startup_program())
@ -307,14 +319,20 @@ def train():
return total_loss / count
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in xrange(args.pass_num):
pass_start_time = time.time()
words_seen = 0
train_accs = []
train_losses = []
for batch_id, data in enumerate(train_batch_generator()):
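# Warm-up: after skip_batch_num batches, restart the timer and the sample
# counter so startup overhead is excluded from the throughput figures.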
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
words_seen += word_num
num_samples += word_num
trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
words_seen += word_num
num_samples += word_num
lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)
fetch_outs = exe.run(framework.default_main_program(),
@ -325,24 +343,36 @@ def train():
},
fetch_list=[avg_cost])
avg_cost_val = np.array(fetch_outs[0])
print('pass_id=%d, batch_id=%d, train_loss: %f' %
(pass_id, batch_id, avg_cost_val))
iters += 1
loss = np.array(fetch_outs[0])
print(
"Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss)
)
pass_end_time = time.time()
test_loss = do_validation()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_loss, words_per_sec, time_consumed))
train_elapsed = time.time() - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
(num_samples, train_elapsed, examples_per_sec))
# evaluation
if args.with_test:
test_loss = do_validation()
exit(0)
def infer():
pass
def print_arguments(args):
print('----------- seq2seq Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
if args.infer_only:
infer()
else:

@ -35,6 +35,12 @@ def parse_args():
parser = argparse.ArgumentParser("mnist model benchmark.")
parser.add_argument(
'--batch_size', type=int, default=128, help='The minibatch size.')
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The number of initial mini-batches to skip, so that warm-up is excluded from the performance measurement'
)
parser.add_argument(
'--iterations', type=int, default=35, help='The number of minibatches.')
parser.add_argument(
@ -53,19 +59,14 @@ def parse_args():
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
args = parser.parse_args()
return args
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data,
@ -138,9 +139,6 @@ def run_benchmark(model, args):
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc, batch_size_tensor])
# Optimization
opt = fluid.optimizer.AdamOptimizer(
@ -160,39 +158,60 @@ def run_benchmark(model, args):
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size)
accuracy = fluid.average.WeightedAverage()
accuracy = fluid.metrics.Accuracy()
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.pass_num):
accuracy.reset()
pass_start = time.time()
train_accs = []
train_losses = []
for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
img_data = np.array(
map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([len(y_data), 1])
start = time.time()
outs = exe.run(
fluid.default_main_program(),
feed={"pixel": img_data,
"label": y_data},
fetch_list=[avg_cost, batch_acc, batch_size_tensor]
) # outs[1] is the accuracy of the current batch; accuracy.update() accumulates it across batches below.
accuracy.add(value=outs[1], weight=outs[2])
end = time.time()
accuracy.update(value=outs[1], weight=outs[2])
iters += 1
num_samples += len(y_data)
loss = np.array(outs[0])
acc = np.array(outs[1])
print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
(pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
train_losses.append(loss)
train_accs.append(acc)
print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
(pass_id, iters, loss, acc))
print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
train_elapsed = time.time() - start_time
examples_per_sec = num_samples / train_elapsed
pass_end = time.time()
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
(num_samples, train_elapsed, examples_per_sec))
# evaluation
if args.with_test:
test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
inference_program)
exit(0)
train_avg_acc = accuracy.eval()
test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
inference_program)
print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
(pass_id, train_avg_acc, test_avg_acc,
(pass_end - pass_start) / 1000))
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- mnist Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':

@ -87,15 +87,6 @@ def parse_args():
return args
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
conv1 = fluid.layers.conv2d(
input=input,
@ -279,32 +270,31 @@ def run_benchmark(model, args):
'label': label},
fetch_list=[avg_cost, batch_acc, batch_size_tensor])
iters += 1
num_samples += label[0]
num_samples += len(label)
accuracy.add(value=acc, weight=weight)
train_losses.append(loss)
train_accs.append(acc)
print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
(pass_id, iters, loss, acc))
pass_train_acc = accuracy.eval()
# evaluation
if args.with_test:
pass_test_acc = test(exe)
train_elapsed = time.time() - start_time
print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
train_elapsed = time.time() - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
(num_samples, train_elapsed, examples_per_sec))
# evaluation
if args.with_test:
pass_test_acc = test(exe)
exit(0)
if args.use_cprof:
pr.disable()
s = StringIO.StringIO()
sortby = 'cumulative'
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- resnet Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':

@ -1,7 +1,9 @@
#!/bin/bash
# This script benchmarks PaddlePaddle Fluid on a
# single thread and a single GPU.
export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib
#export FLAGS_fraction_of_gpu_memory_to_use=0.0
export CUDNN_PATH=/paddle/cudnn_v5
# disable openmp and mkl parallel
#https://github.com/PaddlePaddle/Paddle/issues/7199
@ -25,25 +27,79 @@ export CUDA_VISIBLE_DEVICES=0
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
# only query the gpu used
nohup stdbuf -oL nvidia-smi \
--id=${CUDA_VISIBLE_DEVICES} \
--query-gpu=timestamp \
--query-compute-apps=pid,process_name,used_memory \
--format=csv \
--filename=mem.log \
-l 1 &
# mnist
# mnist gpu mnist 128
FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=500 \
2>&1 | tee -a mnist_gpu_128.log
# vgg16
# cifar10 gpu cifar10 128
FLAGS_benchmark=true python fluid/vgg.py \
# gpu cifar10 128
FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 > vgg16_gpu_128.log
--iterations=30 \
2>&1 | tee -a vgg16_gpu_128.log
# flowers gpu 128
FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
--device=GPU \
--batch_size=32 \
--data_set=flowers \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a vgg16_gpu_flowers_32.log
# resnet50
# resnet50 gpu cifar10 128
FLAGS_benchmark=true python fluid/resnet.py \
FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
--device=GPU \
--batch_size=128 \
--data_set=cifar10 \
--model=resnet_cifar10 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 > resnet50_gpu_128.log
2>&1 | tee -a resnet50_gpu_128.log
# resnet50 gpu flowers 64
FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
--device=GPU \
--batch_size=64 \
--data_set=flowers \
--model=resnet_imagenet \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a resnet50_gpu_flowers_64.log
# lstm
# lstm gpu imdb 32 # tensorflow only supports batch=32
FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
--device=GPU \
--batch_size=32 \
--skip_batch_num=5 \
--iterations=30 \
--hidden_dim=512 \
--emb_dim=512 \
--crop_size=1500 \
2>&1 | tee -a lstm_gpu_32.log
# seq2seq
# seq2seq gpu wmt14 128
FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a seq2seq_gpu_128.log

@ -37,6 +37,14 @@ def parse_args():
type=int,
default=32,
help='The number of sequences in a mini-batch. (default: %(default)d)')
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The number of initial mini-batches to skip, so that warm-up is excluded from the performance measurement'
)
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
'--emb_dim',
type=int,
@ -64,6 +72,10 @@ def parse_args():
default=int(os.environ.get('CROP_SIZE', '1500')),
help='The max sentence length of the input. Since this model uses a plain RNN,'
' gradients may explode if the sentence is too long')
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
args = parser.parse_args()
return args
@ -157,37 +169,43 @@ def main():
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
def train_loop(pass_num, crop_size):
with profiler.profiler(args.device, 'total') as prof:
for pass_id in range(pass_num):
train_reader = batch(
paddle.reader.shuffle(
crop_sentence(imdb.train(word_dict), crop_size),
buf_size=25000),
batch_size=args.batch_size)
word_nums = 0
pass_start_time = time.time()
for batch_id, data in enumerate(train_reader()):
tensor_words = to_lodtensor([x[0] for x in data], place)
for x in data:
word_nums += len(x[0])
label = numpy.array([x[1] for x in data]).astype("int64")
label = label.reshape((-1, 1))
loss_np, acc, weight = exe.run(
fluid.default_main_program(),
feed={"words": tensor_words,
"label": label},
fetch_list=[loss, batch_acc, batch_size_tensor])
print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
(pass_id, batch_id, loss_np, acc))
pass_end_time = time.time()
time_consumed = pass_end_time - pass_start_time
words_per_sec = word_nums / time_consumed
print("pass_id=%d, sec/pass: %f, words/s: %f" %
(pass_id, time_consumed, words_per_sec))
train_loop(args.pass_num, args.crop_size)
train_reader = batch(
paddle.reader.shuffle(
crop_sentence(imdb.train(word_dict), args.crop_size),
buf_size=25000),
batch_size=args.batch_size)
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.pass_num):
train_accs = []
train_losses = []
for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
tensor_words = to_lodtensor([x[0] for x in data], place)
label = numpy.array([x[1] for x in data]).astype("int64")
label = label.reshape((-1, 1))
loss_np, acc, weight = exe.run(
fluid.default_main_program(),
feed={"words": tensor_words,
"label": label},
fetch_list=[loss, batch_acc, batch_size_tensor])
iters += 1
for x in data:
num_samples += len(x[0])
print(
"Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
(pass_id, iters, loss_np, acc)
) # acc is the accuracy of the current batch, not an accumulated value.
train_elapsed = time.time() - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
(num_samples, train_elapsed, examples_per_sec))
exit(0)
def to_lodtensor(data, place):
@ -205,5 +223,14 @@ def to_lodtensor(data, place):
return res
def print_arguments(args):
print('----------- lstm Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
main()

@ -191,25 +191,29 @@ def main():
fetch_list=[avg_cost, batch_acc, batch_size_tensor])
accuracy.add(value=acc, weight=weight)
iters += 1
num_samples += len(data)
num_samples += len(y_data)
print(
"Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
(pass_id, iters, loss, acc)
) # acc is the accuracy of the current batch; accuracy.add() accumulates the running average.
pass_train_acc = accuracy.eval()
# pass_train_acc = accuracy.eval()
train_losses.append(loss)
train_accs.append(acc)
print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
train_elapsed = time.time() - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
(num_samples, train_elapsed, examples_per_sec))
# evaluation
if args.with_test:
pass_test_acc = test(exe)
train_elapsed = time.time() - start_time
print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
exit(0)
def print_arguments():
print('----------- Configuration Arguments -----------')
print('----------- vgg Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')

File diff suppressed because it is too large

@ -0,0 +1,180 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import numpy as np
import tensorflow as tf
import paddle.v2 as paddle
DTYPE = tf.float32
def parse_args():
parser = argparse.ArgumentParser("mnist model benchmark.")
parser.add_argument(
'--batch_size', type=int, default=128, help='The minibatch size.')
parser.add_argument(
'--iterations', type=int, default=35, help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=5, help='The number of passes.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
args = parser.parse_args()
return args
def run_benchmark(args):
def weight_variable(dtype, shape):
initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
return tf.Variable(initial)
def bias_variable(dtype, shape):
initial = tf.constant(0.1, shape=shape, dtype=dtype)
return tf.Variable(initial)
device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
with tf.device(device):
images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
labels = tf.placeholder(tf.int64, shape=(None, ))
# conv1, relu, pool1
conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
conv1_bias = bias_variable(DTYPE, [20])
conv1 = tf.nn.conv2d(
images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
pool1 = tf.nn.max_pool(
relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# conv2, relu, pool2
conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
conv2_bias = bias_variable(DTYPE, [50])
conv2 = tf.nn.conv2d(
pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
pool2 = tf.nn.max_pool(
relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# FC
pool_shape = pool2.get_shape().as_list()
hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1)
reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
fc_bias = bias_variable(DTYPE, [10])
logits = tf.matmul(reshape, fc_weights) + fc_bias
# Get prediction
prediction = tf.nn.softmax(logits)
# Loss
one_hot_labels = tf.one_hot(labels, depth=10)
cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
avg_cost = tf.reduce_mean(cost)
# Get accuracy
correct = tf.equal(tf.argmax(prediction, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# metrics, g_accuracy
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_accuracy = tf.metrics.accuracy(
labels, tf.argmax(
prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
g_accuracy_reset_op = tf.variables_initializer(vars)
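# Running g_accuracy_reset_op re-initializes the metric's local variables
# (total/count), so the streaming accuracy can be reset for each pass.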
# Optimizer
opt = tf.train.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
train_op = opt.minimize(avg_cost)
# train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=args.batch_size)
def eval_test():
sess.run(g_accuracy_reset_op)
for batch_id, data in enumerate(test_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype("int64")
loss, acc, g_acc = sess.run(
[avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
return g_acc[1]
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
for pass_id in range(args.pass_num):
sess.run(g_accuracy_reset_op)
pass_start = time.time()
for batch_id, data in enumerate(train_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype(
"int64")
start = time.time()
_, loss, acc, g_acc = sess.run(
[train_op, avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
end = time.time()
print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
(pass_id, batch_id, loss, 1 - acc, end - start))
pass_end = time.time()
test_avg_acc = eval_test()
print(
"pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
% (pass_id, g_acc[1], test_avg_acc,
pass_end - pass_start))
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
run_benchmark(args)

File diff suppressed because it is too large

@ -0,0 +1,220 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import tensorflow as tf
import paddle.v2 as paddle
def parse_args():
parser = argparse.ArgumentParser("LSTM model benchmark.")
parser.add_argument(
'--batch_size',
type=int,
default=32,
help='The sequence number of a batch data. (default: %(default)d)')
parser.add_argument(
'--stacked_num',
type=int,
default=5,
help='Number of lstm layers to stack. (default: %(default)d)')
parser.add_argument(
'--embedding_dim',
type=int,
default=512,
help='Dimension of embedding table. (default: %(default)d)')
parser.add_argument(
'--hidden_dim',
type=int,
default=512,
help='Hidden size of lstm unit. (default: %(default)d)')
parser.add_argument(
'--pass_num',
type=int,
default=10,
help='Epoch number to train. (default: %(default)d)')
parser.add_argument(
'--learning_rate',
type=float,
default=0.0002,
help='Learning rate used to train. (default: %(default)f)')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
args = parser.parse_args()
return args
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def dynamic_lstm_model(dict_size,
embedding_dim,
hidden_dim,
stacked_num,
class_num=2,
is_train=True):
word_idx = tf.placeholder(tf.int64, shape=[None, None])
sequence_length = tf.placeholder(tf.int64, shape=[None, ])
embedding_weights = tf.get_variable('word_embeddings',
[dict_size, embedding_dim])
embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)
# Create a separate cell per layer; reusing one LSTMCell object across
# layers would make TF share or clash over the layer weights.
stacked_cell = tf.nn.rnn_cell.MultiRNNCell(
[tf.nn.rnn_cell.LSTMCell(num_units=hidden_dim, use_peepholes=False)
for _ in range(stacked_num)])
# final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
_, final_state = tf.nn.dynamic_rnn(
cell=stacked_cell,
inputs=embedding,
dtype=tf.float32,
sequence_length=sequence_length)
w = tf.Variable(
tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
bias = tf.Variable(
tf.constant(
value=0.0, shape=[class_num], dtype=tf.float32))
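# final_state[-1] is the top layer's LSTMStateTuple (c, h); [1] selects h.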
prediction = tf.matmul(final_state[-1][1], w) + bias
if not is_train:
return (word_idx, sequence_length), tf.nn.softmax(prediction)
label = tf.placeholder(tf.int64, shape=[None, ])
loss = tf.nn.softmax_cross_entropy_with_logits(
labels=tf.one_hot(label, 2), logits=prediction)
avg_loss = tf.reduce_mean(loss)
correct_count = tf.equal(tf.argmax(prediction, 1), label)
acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
reset_op = tf.variables_initializer(vars)
return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
def padding_data(data, padding_size, value):
data = data + [value] * padding_size
return data[:padding_size]
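# e.g. padding_data([3, 1, 4], 5, 0) -> [3, 1, 4, 0, 0]
# and padding_data([3, 1, 4, 1, 5, 9], 5, 0) -> [3, 1, 4, 1, 5]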
def train(args):
word_dict = paddle.dataset.imdb.word_dict()
dict_size = len(word_dict)
feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)
adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
train_op = adam_optimizer.minimize(avg_loss)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=25000),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.test(word_dict), buf_size=25000),
batch_size=args.batch_size)
def do_validation(sess):
sess.run(reset_op)
for batch_id, data in enumerate(test_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
# Do not run train_op during validation; only fetch the metrics.
loss, fetch_acc, fetch_g_acc = sess.run(
[avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
return fetch_g_acc[1]
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_l)
sess.run(init_g)
for pass_id in xrange(args.pass_num):
# clear accuracy local variable
sess.run(reset_op)
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
words_seen += np.sum(sequence_length)
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
_, loss, fetch_acc, fetch_g_acc = sess.run(
[train_op, avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
% (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))
pass_end_time = time.time()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
test_acc = do_validation(sess)
print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_acc, words_per_sec, time_consumed))
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if args.infer_only:
pass
else:
train(args)

File diff suppressed because it is too large

@ -36,7 +36,8 @@ MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h
IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
SET(MKLDNN_DEPENDS ${MKLML_PROJECT})

@ -1,67 +0,0 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT WITH_GPU)
return()
endif()
include(ExternalProject)
set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
if(WITH_DSO)
# If we use DSO, we do not build nccl, just download the dependencies
set(NCCL_BUILD_COMMAND "")
set(NCCL_INSTALL_COMMAND "")
set(NCCL_INSTALL_DIR "")
else()
# otherwise, we build nccl and link it.
set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
# Note: CUDA 8.0 is needed to build nccl.
# If CUDA is not installed in a system directory, set CUDA_HOME to your CUDA root.
set(NCCL_BUILD_COMMAND "make -j 8")
set(NCCL_INSTALL_COMMAND "make install PREFIX=${NCCL_INSTALL_DIR}")
endif()
ExternalProject_Add(
extern_nccl
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/NVIDIA/nccl.git"
GIT_TAG "v1.3.4-1"
PREFIX "${NCCL_SOURCE_DIR}"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND "${NCCL_BUILD_COMMAND}"
INSTALL_COMMAND "${NCCL_INSTALL_COMMAND}"
INSTALL_DIR "${NCCL_INSTALL_DIR}"
TEST_COMMAND ""
)
if(WITH_DSO)
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";")
add_library(nccl STATIC ${dummyfile})
else()
add_library(nccl INTERFACE)
endif()
else()
add_library(nccl STATIC IMPORTED GLOBAL)
set_property(TARGET nccl PROPERTY IMPORTED_LOCATION
${NCCL_INSTALL_DIR}/lib/libnccl_static.a)
endif()
add_dependencies(nccl extern_nccl)

@ -244,11 +244,11 @@ function(cc_test TARGET_NAME)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@ -311,8 +311,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_test(${TARGET_NAME} ${TARGET_NAME})
endif()
endfunction(nv_test)
@ -387,8 +387,8 @@ function(hip_test TARGET_NAME)
endif()
add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
add_test(${TARGET_NAME} ${TARGET_NAME})
endif()
endfunction(hip_test)

@ -16,3 +16,4 @@
block.md
scope.md
executor.md
parallel_executor.md

@ -16,3 +16,4 @@ Core Concepts
block.md
scope.md
executor.md
parallel_executor.md

@ -1,4 +1,6 @@
# Problem
# Kernel Hint Design
## Problem
In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have a personal preference for a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel, so we need to provide a way for users to express this preference.
In the current design, we use KernelType to describe one kernel.
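As an illustration only, here is a minimal sketch of how such hints could surface in the Python API, assuming the `use_cudnn` attribute of `conv2d` and the `force_cpu` attribute of `fill_constant` (exact signatures may differ across Fluid versions):

```python
import paddle.fluid as fluid

# Input layer for 28x28 single-channel images.
data = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
# Hint: prefer the CUDNN kernel for this convolution.
conv = fluid.layers.conv2d(
    input=data, num_filters=20, filter_size=5, use_cudnn=True)
# Hint: force a CPU kernel for this small constant, even on a GPU place.
step = fluid.layers.fill_constant(
    shape=[1], dtype='int64', value=0, force_cpu=True)
```

Such per-operator attributes would then steer the kernel choice described in this design for just that operator.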

@ -1,4 +1,6 @@
# Background
# Kernel Selection
## Background
Every operator has many kernels because Fluid supports multiple data types, places, data layouts, and library types. We use the `OpKernelType` to describe the kernel types that an operator can hold.
The `OpKernelType` is as follows:

@ -1,32 +1,56 @@
Install and Build
=================
Install and Compile
===================
.. _install_steps:
Install Steps
+++++++++++++
PaddlePaddle provides various methods of installation for many different users.
You can choose either pip or Docker to complete your install:
Focus on Deep Learning Model Development
----------------------------------------
PaddlePaddle provides many Python wheel packages that pip can install:
.. toctree::
:maxdepth: 1
:maxdepth: 1
pip_install_en.rst
docker_install_en.rst
pip_install_en.rst
Build from Source
-----------------
This is the most convenient way of installation. Please choose the right installation package for your machine configuration and operating system.
Install Using Docker
--------------------
PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
.. toctree::
:maxdepth: 1
docker_install_en.rst
.. warning::
We recommend running PaddlePaddle in Docker. This method has the following advantages
We recommend installing via the steps above; you only need to build PaddlePaddle from source when you need a modified binary.
- Does not require installation of third-party dependencies.
- Easy to share runtime environment.
.. toctree::
Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below:
.. toctree::
:maxdepth: 1
build_from_source_en.md
build_from_source_en.rst
.. warning::
One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus this process of installation is more time consuming.
FAQ
++++++++++
-----------
For any problems during installation, please refer to the page below for answers:
:ref:`FAQ <install_faq>`
If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community:
`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_en.html>`_
`create an issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_

@ -65,39 +65,55 @@ The PaddlePaddle.org tool can be used together with Docker; you need to install Docker on your system first
Without the PaddlePaddle.org tool
---------------------------------
To build the PaddlePaddle documentation with Docker, you need to install the Docker toolkit on your system first. For Docker installation, please refer to `Docker's official website <https://docs.docker.com/>`_. Once Docker is installed, you can build the documentation with the scripts in the source directory:
To build the PaddlePaddle documentation with Docker, you need to install the Docker toolkit on your system first. For Docker installation, please refer to `Docker's official website <https://docs.docker.com/>`_. This method is similar to `Building PaddlePaddle from source <http://paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html>`_: build, from the source code, a Docker image that can compile the PaddlePaddle documentation, run it, and, after entering the Docker container, use the scripts in the source tree to build the documentation. The specific steps are as follows:
[TBD]
.. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# Build, from the source code, a Docker image that can compile the PaddlePaddle documentation
docker build -t paddle:dev .
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
# Inside the Docker container, use the build.sh script to build the PaddlePaddle documentation
bash -x /paddle/paddle/scripts/docker/build.sh
Note: the above command maps the current directory (the source root) to the :code:`/paddle` directory inside the container.
After compilation, two directories, ``doc/v2`` and ``doc/fluid``, are produced; in each of them three subdirectories, ``cn/html/``, ``en/html`` and ``api/en/html``, are generated. Enter each of these directories and run:
.. code-block:: bash
python -m SimpleHTTPServer 8088
Open http://localhost:8088 in a browser to see the generated Chinese/English documentation pages of both the ``v2`` and ``fluid`` versions, as well as the English API pages.
If you do not want to use Docker, you can also build the PaddlePaddle documentation directly with the following commands:
.. code-block:: bash
mkdir paddle
cd paddle
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
# If you only need to build the documentation, run the following commands
make -j $processors gen_proto_py
make -j $processors paddle_docs paddle_docs_cn
make -j $processors paddle_docs
# If you only need to build the API docs, run the following commands
make -j $processors gen_proto_py framework_py_proto
make -j $processors copy_paddle_pybind
make -j $processors paddle_api_docs
make -j $processors paddle_apis
Here $processors means starting as many processes as there are CPU cores to compile in parallel; set it according to the number of CPU cores on your machine.
After compilation, enter the ``doc/v2`` directory. If you chose to build the documentation, the ``cn/html/`` and ``en/html`` subdirectories are generated under it; if you chose to build the API docs, the ``api/en/html`` directory is generated. Enter each of these directories and run:
After compilation, the same two directories, ``doc/v2`` and ``doc/fluid``, are produced. If you chose to build the documentation, the ``cn/html/`` and ``en/html`` subdirectories are generated under both; if you chose to build the API docs, the ``api/en/html`` directory is generated under both. Enter each of these directories and run:
.. code-block:: bash
python -m SimpleHTTPServer 8088
Open http://localhost:8088 in a browser to see the generated Chinese/English documentation pages and the English API pages; the figure below shows an example of the generated English documentation home page. Note that because the example uses sphinx's default theme, the page style differs from the official website, but this does not affect developers' debugging.
Open http://localhost:8088 in a browser to see the generated Chinese/English documentation pages of both the ``v2`` and ``fluid`` versions, as well as the English API pages. The figure below shows an example of the generated ``v2`` English documentation home page. Note that because the example uses sphinx's default theme, the page style differs from the official website, but this does not affect developers' debugging.
.. image:: src/doc_en.png
:align: center

@ -68,39 +68,56 @@ Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develo
Manually Building the Documentation
-------------------------------------
To build PaddlePaddle's documentation with Docker, you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After Docker is installed, you can use the scripts in the source directory to build the documentation.
To build PaddlePaddle's documentation with Docker, you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. This method is quite similar to `Build From Sources <http://paddlepaddle.org/docs/develop/documentation/en/build_and_install/build_from_source_en.html>`_: construct, from the source code, a Docker image that can be used to build the PaddlePaddle documentation, then enter the Docker container and use the script ``build.sh`` in the source directory to build the documentation. The specific steps are as follows:
[TBD]
.. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# Construct a docker image from source code
docker build -t paddle:dev .
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
# Use build.sh to build PaddlePaddle documentation
bash -x /paddle/paddle/scripts/docker/build.sh
Note: The above commands map the current directory (source root directory) to the :code:`/paddle` directory in the container.
After compiling, there should be two generated directories: ``doc/v2`` and ``doc/fluid``; in each of them three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following commands:
.. code-block:: bash
python -m SimpleHTTPServer 8088
Use a web browser and navigate to http://localhost:8088; you will see the compiled ``v2`` and ``fluid`` Chinese/English documentation pages and the English API pages.
If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
.. code-block:: bash
mkdir paddle
cd paddle
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
# If you only need to build documents, use the following commands
make -j $processors gen_proto_py
make -j $processors paddle_docs paddle_docs_cn
make -j $processors paddle_docs
# If you only need to build APIs, use the following commands
make -j $processors gen_proto_py framework_py_proto
make -j $processors copy_paddle_pybind
make -j $processors paddle_api_docs
make -j $processors paddle_apis
$processors indicates that as many processes as there are CPU cores will be started to compile in parallel; set it according to the number of CPU cores on your machine.
After the compilation is complete, enter the ``doc/v2`` directory. If you chose to build documents, it will generate ``cn/html/`` and ``en/html`` subdirectories under this directory. If you chose to build APIs, it will generate the ``api/en/html`` subdirectory. Please enter these directories respectively and execute the following commands:
After compiling, there should also be two generated directories: ``doc/v2`` and ``doc/fluid``. If you chose to build documents, the two subdirectories ``cn/html/`` and ``en/html`` will be generated in both directories. If you chose to build APIs, the subdirectory ``api/en/html`` will be generated. Please enter these directories respectively and execute the following commands:
.. code-block:: bash
python -m SimpleHTTPServer 8088
Use a web browser and navigate to http://localhost:8088; you will see the compiled Chinese/English documentation pages and the English API pages. The following figure is an example of the built English documentation home page. Note that due to sphinx's default theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
Use a web browser and navigate to http://localhost:8088; you will see the compiled ``v2`` and ``fluid`` Chinese/English documentation pages and the English API pages. The following figure is an example of the built ``v2`` English documentation home page. Note that due to sphinx's default theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
.. image:: src/doc_en.png
:align: center

@ -1,3 +1,175 @@
## Install and Build
TBD
### Download & Install
Download the latest C-API development package from the CI system and install it. You can find the required version in the table below:
<table>
<thead>
<tr>
<th>Version Tips</th>
<th>C-API</th>
</tr>
</thead>
<tbody>
<tr>
<td>cpu_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cpu_avx_openblas</td>
<td>-</td>
</tr>
<tr>
<td>cpu_noavx_openblas</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda7.5_cudnn5_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda8.0_cudnn5_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda8.0_cudnn7_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr></tbody></table>
### From source
Users can also compile the C-API library from the PaddlePaddle source code by building with the following compilation options:
<table>
<thead>
<tr>
<th>Options</th>
<th>Value</th>
</tr>
</thead>
<tbody>
<tr>
<td>WITH_C_API</td>
<td>ON</td>
</tr>
<tr>
<td>WITH_PYTHON</td>
<td>OFF (recommended)</td>
</tr>
<tr>
<td>WITH_SWIG_PY</td>
<td>OFF (recommended)</td>
</tr>
<tr>
<td>WITH_GOLANG</td>
<td>OFF (recommended)</td>
</tr>
<tr>
<td>WITH_GPU</td>
<td>ON/OFF</td>
</tr>
<tr>
<td>WITH_MKL</td>
<td>ON/OFF</td>
</tr></tbody></table>
It is best to use the recommended values to avoid linking unnecessary libraries. Set the other compilation options as you need.
Pull the latest code from GitHub and configure the compilation options (replace PADDLE_ROOT with the installation path of the PaddlePaddle C-API inference library):
```shell
PADDLE_ROOT=/path/of/capi
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir build
cd build
cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_GOLANG=OFF \
-DWITH_PYTHON=OFF \
-DWITH_MKL=OFF \
-DWITH_GPU=OFF \
..
```
After the above commands generate the Makefile, run `make && make install`. After a successful build, the dependencies required by the C-API (the PaddlePaddle inference library and header files, and third-party libraries and header files) are stored in the `PADDLE_ROOT` directory.
If the compilation succeeds, the directory structure under `PADDLE_ROOT` looks like the following (it includes the PaddlePaddle header files and libraries, and third-party libraries and header files, depending on the linking method):
```text
├── include
│   └── paddle
│   ├── arguments.h
│   ├── capi.h
│   ├── capi_private.h
│   ├── config.h
│   ├── error.h
│   ├── gradient_machine.h
│   ├── main.h
│   ├── matrix.h
│   ├── paddle_capi.map
│   └── vector.h
├── lib
│   ├── libpaddle_capi_engine.a
│   ├── libpaddle_capi_layers.a
│   ├── libpaddle_capi_shared.so
│   └── libpaddle_capi_whole.a
└── third_party
├── gflags
│   ├── include
│   │   └── gflags
│   │   ├── gflags_completions.h
│   │   ├── gflags_declare.h
│   │   ...
│   └── lib
│   └── libgflags.a
├── glog
│   ├── include
│   │   └── glog
│   │   ├── config.h
│   │   ...
│   └── lib
│   └── libglog.a
├── openblas
│   ├── include
│   │   ├── cblas.h
│   │   ...
│   └── lib
│   ...
├── protobuf
│   ├── include
│   │   └── google
│   │   └── protobuf
│   │   ...
│   └── lib
│   └── libprotobuf-lite.a
└── zlib
├── include
│   ...
└── lib
...
```
### Linking Description:
There are three linking methods:

1. Linking with the dynamic library `libpaddle_capi_shared.so`. This is the most convenient way and is **recommended unless you have special requirements**. Refer to the following:
   1. For a CPU build using `OpenBLAS`, you only need to link the single library `libpaddle_capi_shared.so` to develop a prediction program with the C-API.
   1. For a CPU build using `MKL`, you must also link the MKL libraries directly, because MKL ships its own dynamic libraries.
   1. For a GPU build, the CUDA libraries are loaded dynamically when the prediction program runs; also add the CUDA libraries to the `LD_LIBRARY_PATH` environment variable.
2. Linking with the static library `libpaddle_capi_whole.a`. Refer to the following:
   1. Specify the `-Wl,--whole-archive` linker option.
   1. Explicitly link third-party libraries such as `gflags`, `glog`, `libz`, and `protobuf`; they can be found under the `PADDLE_ROOT/third_party` directory.
   1. If the C-API was compiled with OpenBLAS, you must explicitly link `libopenblas.a`.
   1. If the C-API was compiled with MKL, you must explicitly link the MKL dynamic libraries.
3. Linking with the static libraries `libpaddle_capi_layers.a` and `libpaddle_capi_engine.a`. Refer to the following:
   1. This linking method is mainly used for mobile prediction.
   1. `libpaddle_capi_whole.a` is split into (at least) two static libraries to reduce the size of the linked binaries.
   1. Specify `-Wl,--whole-archive -lpaddle_capi_layers` and `-Wl,--no-whole-archive -lpaddle_capi_engine` when linking.
   1. Third-party dependencies must be linked explicitly, the same as in method 2 above.

Some files were not shown because too many files have changed in this diff
