commit 6dc0e663f4
@@ -0,0 +1,62 @@
set -e

# Convert a wall-clock timestamp (hh:mm:ss.xx) into seconds.
function clock_to_seconds() {
  hours=`echo $1 | awk -F ':' '{print $1}'`
  mins=`echo $1 | awk -F ':' '{print $2}'`
  secs=`echo $1 | awk -F ':' '{print $3}'`
  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
}

function infer() {
  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
  thread=`nproc`
  if [ $thread -gt $bs ]; then
    thread=$bs
  fi
  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"

  models_in="models/${topology}-${layer_num}/pass-00000/"
  if [ ! -d $models_in ]; then
    echo "Please run ./run_mkl_infer.sh first to save the model."
    exit 0
  fi
  log_period=$((256 / bs))
  paddle train --job=test \
    --config="${topology}.py" \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=$log_period \
    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
    --init_model_path=$models_in \
    2>&1 | tee ${log}

  # Use only the last 5 log periods (5 * 256 = 1280 samples) to compute FPS;
  # the earlier iterations are treated as warm-up time.
  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  start_sec=`clock_to_seconds $start`
  end_sec=`clock_to_seconds $end`
  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec)" >> ${log}
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}

if [ ! -f "train.list" ]; then
  echo " " > train.list
fi
if [ ! -f "test.list" ]; then
  echo " " > test.list
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi

# inference benchmark
for batchsize in 1 2 4 8 16; do
  infer googlenet v1 $batchsize
  infer resnet 50 $batchsize
  infer vgg 19 $batchsize
done
@@ -0,0 +1,39 @@
set -e

function train() {
  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
  thread=`nproc`
  # each trainer uses only 1 core to avoid thread conflicts
  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
  args="batch_size=${bs},layer_num=${layer_num}"
  config="${topology}.py"
  paddle train --job=time \
    --config=$config \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=10 \
    --test_period=100 \
    --config_args=$args \
    2>&1 | tee ${log}

  # Parse the average time per mini-batch (in ms) from the last log line,
  # then convert it to images per second.
  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}

if [ ! -f "train.list" ]; then
  echo " " > train.list
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi

# training benchmark
for batchsize in 64 128 256; do
  train vgg 19 $batchsize
  train resnet 50 $batchsize
  train googlenet v1 $batchsize
done
@@ -1,23 +1,29 @@
# Executor Design Doc

## Motivation

In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it first creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).

The executor runs the `ProgramDesc` like an interpreter. The `ProgramDesc` contains the intrinsics (operators in this case) and the variables that will be used; the executor explicitly executes this stored, precompiled code.

## Overview

An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.

## Executor

The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence, one by one.
It is very similar to how pushing a stack frame works when entering a block; following this, it cleans up all the temporary variables when a mini-batch is finished. It does not, however, have a stack-frame pop process.
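To make the described control flow concrete, here is a small, self-contained Python sketch of the same idea (illustrative only; `ToyScope`, `ToyOp` and `run_block` are invented names, and the real executor is the C++ class, not this code):

```python
# Toy model of the control flow described above; not the real C++ Executor.
class ToyScope(object):
    def __init__(self, parent=None):
        self.vars = {}        # variable instances living in this scope
        self.parent = parent  # persistent outer scope, if any

    def find(self, name):
        if name in self.vars:
            return self.vars[name]
        return self.parent.find(name) if self.parent else None

class ToyOp(object):
    def __init__(self, fn, inputs, output):
        self.fn, self.inputs, self.output = fn, inputs, output

    def run(self, scope):
        args = [scope.find(n) for n in self.inputs]
        scope.vars[self.output] = self.fn(*args)  # instantiate the output variable

def run_block(ops, scope, create_local_scope=True):
    # "push a stack frame": temporaries live in a child scope of the persistent scope
    local = ToyScope(parent=scope) if create_local_scope else scope
    for op in ops:            # run the operators in sequence, one by one
        op.run(local)
    return local              # temporaries die with this scope; there is no explicit "pop"

# usage: compute y = x * 2 + 1 for one "mini-batch"
outer = ToyScope()
outer.vars["x"] = 3
block = [ToyOp(lambda a: a * 2, ["x"], "tmp"),
         ToyOp(lambda a: a + 1, ["tmp"], "y")]
print(run_block(block, outer).find("y"))  # prints 7
```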
### The interface

```c++
Executor(places);
```

An executor does not own any computing resources; a user can only construct an executor with the specified places.
### Running an Executor

```c++
void Run(ProgramDesc, Scope, block_id, create_local_scope);
```

An `Executor` only provides a unified way to execute a `ProgramDesc`. The `ProgramDesc` is the target to be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block, and `create_local_scope` is a boolean that states whether the temporary variables will be destroyed after the execution is finished.
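As an illustration, a call through this interface could be sketched in Python-style pseudocode as follows; `core`, `Executor`, `program_desc` and `scope` here are assumed bindings that simply mirror the C++ signature above, not a documented Python API:

```python
# Pseudocode mirroring the C++ interface above; the bound names are assumptions.
place = core.CPUPlace()                # where the computation runs
exe = Executor([place])                # Executor(places): owns no computing resources

exe.run(program_desc,                  # the ProgramDesc to evaluate
        scope,                         # persistent container of variable instances
        block_id=0,                    # entrance block
        create_local_scope=True)       # drop temporary variables afterwards
```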
@@ -0,0 +1,122 @@
# Design Doc: PaddlePaddle Fluid

## Why Fluid

When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe. However, when PaddlePaddle was open-sourced in 2016, many other choices were available. This posed a challenge -- what is the need for open-sourcing yet another deep learning framework?

Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution, which describe the "process" of training or inference without resorting to the concept of a model. In fact, in PyTorch, TensorFlow Eager Execution and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid currently takes this idea further than PyTorch and Eager Execution, and we are trying to push Fluid towards the direction of a compiler and a new programming language for deep learning.

## The Evolution of Deep Learning Systems

Deep learning infrastructure is one of the fastest-evolving technologies. Within just four years, three generations of technology have already been invented.

| Existed since | model as sequence of layers | model as graph of operators | No model |
|--|--|--|--|
| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |

From the above table, we see that deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, it helps to compare the *programming paradigms*, i.e., the ways to program deep learning applications using these systems. The following sections go over these paradigms.
## Deep Learning Programming Paradigms

With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:

```python
x = layer.data("image")
l = layer.data("label")
f = layer.fc(x, W)
s = layer.softmax(f)
c = layer.mse(l, s)

for i in xrange(1000): # train for 1000 iterations
    m = read_minibatch()
    forward({input=x, data=m}, minimize=c)
    backward(...)

print W # print the trained model parameters.
```
The above program includes two parts:

1. The first part describes the model, and
2. The second part describes the training process (or inference process) for the model.

This paradigm has a well-known problem that limits programmers' productivity. If the programmer makes a mistake in configuring the model, the error message doesn't show up until the second part is executed and the `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is blocks away from the actual error prompt.

This problem of being hard to debug and iterate quickly on a program is the primary reason that programmers, in general, prefer PyTorch over the older systems. Using PyTorch, we would write the above program as follows:

```python
W = tensor(...)

for i in xrange(1000): # train for 1000 iterations
    m = read_minibatch()
    x = m["image"]
    l = m["label"]
    f = layer.fc(x, W)
    s = layer.softmax(f)
    c = layer.mse(l, s)
    backward()

print W # print the trained model parameters.
```
We can see that the main difference is moving the model configuration part (the first step) into the training loop. This change allows mistakes in the model configuration to be reported where they actually appear in the programming block. It also represents the model, or its forward pass, more naturally by keeping the configuration process inside the training loop.

## Describe Arbitrary Models for the Future

Describing the process instead of the model also gives Fluid the flexibility to define non-standard models that haven't been invented yet.

As we write out the program for the process, we can write an RNN as a loop, instead of as an RNN layer or operator. A PyTorch example would look like the following:

```python
for i in xrange(1000):
    m = read_minibatch()
    x = m["sentence"]
    for t in xrange(x.len()):
        h[t] = the_step(x[t])
```

With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++, as in the following:

```python
train_loop = layers.While(cond)
with train_loop.block():
    m = read_minibatch()
    x = m["sentence"]
    rnn = layers.While(...)
    with rnn.block():
        h[t] = the_step(input[t])
```
An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44).

From the example, the Fluid programs look very similar to their PyTorch equivalents, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.

We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid; a schematic sketch follows.
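The following is an illustrative sketch only, written in the same schematic style as the `While` example above; the names `IfElse`, `true_block`, `false_block`, `input` and `output` are assumptions here, and the authoritative definition is the linked design doc:

```python
ie = layers.IfElse(cond)          # cond is a boolean tensor over the mini-batch
with ie.true_block():             # processes the rows where cond is True
    d = ie.input(x)
    ie.output(layers.fc(d, W1))
with ie.false_block():            # processes the rows where cond is False
    d = ie.input(x)
    ie.output(layers.fc(d, W2))
out = ie()                        # merges the partial results of the two branches
```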
## Turing Completeness

In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine. For a programming language, if it provides if-then-else and loops, it is Turing complete. From the above examples, Fluid seems to be Turing complete; however, it is worth noting that there is a slight difference between the `if-then-else` of Fluid and that of a programming language: the former runs both of its branches and splits the input mini-batch into two -- one part for the True condition and the other for the False condition. It hasn't been researched in depth whether this is equivalent to the `if-then-else` that makes programming languages Turing complete. Based on a conversation with [Yuan Yu](https://research.google.com/pubs/104812.html), it seems to be the case, but this needs to be looked into in depth.
## The Execution of a Fluid Program

There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).

There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.

Fluid is also moving towards the direction of a compiler, which is explained in more detail later in this article.
## Backward Compatibility of Fluid

Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so that it would be easier for them to support multiple frameworks all at once and to run a trained model during inference. For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph). Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators. The well-known [ONNX](https://github.com/onnx/onnx) is also a file format for graphs of operators.

For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format. A sketch of such a converter follows.
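This is a minimal sketch, assuming the `ProgramDesc` proto exposes `blocks`, each block exposes `ops`, and each op exposes its `type` and flattened input/output argument names; these accessors and the op-type mapping are assumptions for illustration, and a real converter must also translate parameters, attributes and data types:

```python
import onnx
from onnx import helper

# Hypothetical mapping from Fluid operator types to ONNX operator types.
FLUID_TO_ONNX = {"mul": "MatMul", "elementwise_add": "Add", "relu": "Relu"}

def program_to_onnx(program_desc, graph_name="fluid_graph"):
    """Walk the operators of block 0 and emit one ONNX node per operator."""
    nodes = []
    for op in program_desc.blocks[0].ops:            # assumed proto layout
        nodes.append(helper.make_node(
            FLUID_TO_ONNX[op.type],
            inputs=list(op.input_arg_names),          # assumed accessor
            outputs=list(op.output_arg_names)))       # assumed accessor
    graph = helper.make_graph(nodes, graph_name, inputs=[], outputs=[])
    return helper.make_model(graph)
```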
## Towards a Deep Learning Language and the Compiler
|
||||||
|
|
||||||
|
We can change the `if-then-else` and loop structure a little bit in the above Fluid example programs, to make it into a new programming language, different than Python.
|
||||||
|
|
||||||
|
Even if we do not invent a new language, as long as we get the `ProgramDesc` message filled in, we can write a transpiler, which translates each invocation to an operator, into a C++ call to a kernel function of that operator. For example, a transpiler that weaves the CUDA kernels outputs an NVIDIA-friendly C++ program, which can be built using `nvcc`. Another transpiler could generate MKL-friendly code that should be built using `icc` from Intel. More interestingly, we can translate a Fluid program into its distributed version of two `ProgramDesc` messages, one for running on the trainer process, and the other one for the parameter server. For more details of the last example, the [concurrent programming design](concurrent_programming.md) document would be a good pointer. The following figure explains the proposed two-stage process:
|
||||||
|
|
||||||
|
![](fluid-compiler.png)
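To make the transpilation idea above concrete, the following toy sketch (not an existing PaddlePaddle tool; the operator list and kernel names are made up) turns a flat list of operator invocations into C++ kernel calls:

```python
# Toy transpiler: one operator invocation -> one C++ kernel call.
OPS = [("mul", ["x", "w"], "tmp"),   # made-up operator list standing in for a ProgramDesc
       ("relu", ["tmp"], "y")]

def transpile_to_cpp(ops):
    lines = ['#include "kernels.h"', "", "void run() {"]
    for op_type, inputs, output in ops:
        args = ", ".join(inputs + [output])
        lines.append("  %s_kernel(%s);" % (op_type, args))  # call the operator's kernel
    lines.append("}")
    return "\n".join(lines)

print(transpile_to_cpp(OPS))
```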