Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_new_backward
commit febb725102
@@ -0,0 +1,64 @@
set -e

# Convert an HH:MM:SS timestamp into seconds.
function clock_to_seconds() {
  hours=`echo $1 | awk -F ':' '{print $1}'`
  mins=`echo $1 | awk -F ':' '{print $2}'`
  secs=`echo $1 | awk -F ':' '{print $3}'`
  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
}

function infer() {
  # Clear MKL/OpenMP threading overrides so they do not affect the OpenBLAS run.
  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
  thread=`nproc`
  # Use at most one thread per sample in the batch.
  if [ $thread -gt $bs ]; then
    thread=$bs
  fi
  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"

  models_in="models/${topology}-${layer_num}/pass-00000/"
  if [ ! -d $models_in ]; then
    echo "Please run ./run_mkl_infer.sh first to save the model."
    exit 0
  fi
  log_period=$((32 / bs))
  paddle train --job=test \
    --config="${topology}.py" \
    --use_mkldnn=False \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=$log_period \
    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
    --init_model_path=$models_in \
    2>&1 | tee ${log}

  # Compute throughput from the last 5 log periods, i.e. 160 (= 32 * 5) samples;
  # the periods before that are treated as warm-up time.
  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  start_sec=`clock_to_seconds $start`
  end_sec=`clock_to_seconds $end`
  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec)" >> ${log}
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}

if [ ! -f "train.list" ]; then
  echo " " > train.list
fi
if [ ! -f "test.list" ]; then
  echo " " > test.list
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi

# inference benchmark
for batchsize in 1 2 4 8 16; do
  infer vgg 19 $batchsize
  infer resnet 50 $batchsize
  infer googlenet v1 $batchsize
  infer alexnet 2 $batchsize
done
@@ -0,0 +1,41 @@
set -e

function train() {
  # Clear MKL/OpenMP threading overrides so they do not affect the OpenBLAS run.
  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
  thread=`nproc`
  # each trainer uses only 1 core to avoid conflicts
  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
  args="batch_size=${bs},layer_num=${layer_num}"
  config="${topology}.py"
  paddle train --job=time \
    --config=$config \
    --use_mkldnn=False \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=3 \
    --test_period=30 \
    --config_args=$args \
    2>&1 | tee ${log}

  # avg_time is the average time per batch in milliseconds,
  # so throughput is batch_size / avg_time * 1000 images/sec.
  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}

if [ ! -f "train.list" ]; then
  echo " " > train.list
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi

# training benchmark
for batchsize in 64 128 256; do
  train vgg 19 $batchsize
  train resnet 50 $batchsize
  train googlenet v1 $batchsize
  train alexnet 2 $batchsize
done
File diff suppressed because it is too large
@@ -1,23 +1,29 @@
# Executor Design Doc

## Motivation

In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it first creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).

The executor runs the `ProgramDesc` like an interpreter: the `ProgramDesc` contains the intrinsics (operators, in this case) and the variables they use, and the executor explicitly executes this stored, precompiled code.

## Overview

An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks, and each block contains the protobuf definition of all the parameters and operators in that block. The `block_id` specifies the entrance block, and the `Scope` is the container of all the variable instances, which is persistent across different runs.

## Executor

The `Executor` explicitly executes all the intrinsics (operators, here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence, one by one.

It is very similar to how pushing a stack frame works when entering a block: the executor cleans up all the temporary variables when a mini-batch is finished. It does not, however, have a stack-frame pop step.

### The interface

```c++
Executor(places);
```

An executor does not own any computing resources; a user can only construct an executor with the specified places.
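Below is a minimal usage sketch of the constructor, assuming a `std::vector<paddle::platform::Place>` argument and the header paths shown; these names are illustrative, not the exact API.

```c++
#include <vector>

#include "paddle/framework/executor.h"  // assumed header path
#include "paddle/platform/place.h"      // assumed header path

int main() {
  // The executor owns no computing resources; it is only told which
  // places (devices) it may run on. Here: a single CPU place.
  std::vector<paddle::platform::Place> places;
  places.push_back(paddle::platform::CPUPlace());
  paddle::framework::Executor executor(places);
  return 0;
}
```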
### Running an Executor

```c++
void Run(ProgramDesc, Scope, block_id, create_local_scope);
```

An `Executor` only provides a unified way to execute a `ProgramDesc`. The `ProgramDesc` is the target to be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block, and `create_local_scope` is a boolean that states whether the temporary variables should be destroyed after the execution finishes.
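To make the calling convention concrete, here is a hedged sketch of a single run over the entrance block. It assumes a `ProgramDesc` built elsewhere, that `Run` takes the `Scope` by pointer, and the header paths from the previous sketch; only the parameter order is taken from the signature above.

```c++
#include "paddle/framework/executor.h"      // assumed header path
#include "paddle/framework/program_desc.h"  // assumed header path
#include "paddle/framework/scope.h"         // assumed header path
#include "paddle/platform/place.h"          // assumed header path

using namespace paddle::framework;

void RunOnce(const ProgramDesc& program) {
  // The scope outlives the run, so parameters persist across mini-batches.
  Scope scope;
  Executor executor({paddle::platform::CPUPlace()});
  // block_id 0 is the entrance block; create_local_scope=true keeps the
  // temporary variables in a child scope that is dropped after the run.
  executor.Run(program, &scope, /*block_id=*/0, /*create_local_scope=*/true);
}
```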
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff.