Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_new_backward
commit febb725102
@@ -0,0 +1,64 @@
set -e

function clock_to_seconds() {
  hours=`echo $1 | awk -F ':' '{print $1}'`
  mins=`echo $1 | awk -F ':' '{print $2}'`
  secs=`echo $1 | awk -F ':' '{print $3}'`
  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
}
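# For reference, a worked example (hypothetical timestamp, not from a real log):
# clock_to_seconds "01:02:03.50" prints 3723.50, i.e. 3.50 + 2 * 60 + 1 * 3600.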

function infer() {
  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
  thread=`nproc`
  if [ $thread -gt $bs ]; then
    thread=$bs
  fi
  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"

  models_in="models/${topology}-${layer_num}/pass-00000/"
  if [ ! -d $models_in ]; then
    echo "Please run ./run_mkl_infer.sh first to save the model."
    exit 0
  fi
  # Log every (32 / bs) batches, so that each log period covers 32 samples.
  log_period=$((32 / bs))
  paddle train --job=test \
    --config="${topology}.py" \
    --use_mkldnn=False \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=$log_period \
    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
    --init_model_path=$models_in \
    2>&1 | tee ${log}

  # Calculate the elapsed time of the last 5 log periods, i.e. 160 (= 32 * 5)
  # samples; the periods before that are treated as warm-up and discarded.
  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  start_sec=`clock_to_seconds $start`
  end_sec=`clock_to_seconds $end`
  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec)" >> ${log}
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
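# For reference, a worked example of the FPS formula above (hypothetical
# numbers): if start_sec=100.00 and end_sec=104.00, fps = 160 / 4.00 = 40.00.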

if [ ! -f "train.list" ]; then
  echo " " > train.list
fi
if [ ! -f "test.list" ]; then
  echo " " > test.list
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi

# inference benchmark
for batchsize in 1 2 4 8 16; do
  infer vgg 19 $batchsize
  infer resnet 50 $batchsize
  infer googlenet v1 $batchsize
  infer alexnet 2 $batchsize
done
@@ -0,0 +1,41 @@
set -e

function train() {
  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
  # Each trainer uses only 1 core, to avoid conflicts.
  thread=`nproc`
  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
  args="batch_size=${bs},layer_num=${layer_num}"
  config="${topology}.py"
  paddle train --job=time \
    --config=$config \
    --use_mkldnn=False \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=3 \
    --test_period=30 \
    --config_args=$args \
    2>&1 | tee ${log}

  # The formula below treats avg_time as milliseconds per batch, hence the
  # factor of 1000 to convert to images per second.
  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
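# For reference, a worked example of the FPS formula above (hypothetical
# numbers): with bs=64 and avg_time=200 ms/batch, fps = 64 / 200 * 1000 = 320.00.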

if [ ! -f "train.list" ]; then
  echo " " > train.list
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi

# training benchmark
for batchsize in 64 128 256; do
  train vgg 19 $batchsize
  train resnet 50 $batchsize
  train googlenet v1 $batchsize
  train alexnet 2 $batchsize
done
File diff suppressed because it is too large.
@@ -1,23 +1,29 @@
# Executor Design Doc

## Motivation

In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).

The executor runs the `ProgramDesc` like an interpreter: the `ProgramDesc` contains the intrinsics (operators, in this case) and the variables that will be used, and the executor explicitly executes this stored, precompiled code.

## Overview

An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks, and each block contains the protobuf definition of all the parameters and operators in that block. The `block_id` specifies the entrance block, and the `Scope` is the container of all the variable instances, which is persistent across different runs.

## Executor

The `Executor` explicitly executes all the intrinsics (operators, here) in the `block_id`-th block of a `ProgramDesc`. Essentially, it instantiates the Variables and Operators, then runs all the operators in sequence, one by one. This is very similar to pushing a stack frame when entering a block: all the temporary variables are cleaned up when a mini-batch is finished. It does not, however, have an explicit stack-frame pop process.

### The interface

```c++
Executor(places);
```

An executor does not own any computing resources; a user can only construct an executor with the specified places.

### Running an Executor

```c++
void Run(ProgramDesc, Scope, block_id, create_local_scope);
```

An `Executor` only provides a unified way to execute a `ProgramDesc`. The `ProgramDesc` is the target to be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block, and `create_local_scope` is a boolean that states whether the temporary variables are destroyed after the execution finishes.
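For illustration, here is a minimal usage sketch that follows the constructor and `Run` interface shown above. The header paths, the `platform::CPUPlace` argument, and the pointer-to-`Scope` parameter are assumptions made for this example, not details fixed by this doc:

```c++
// Hypothetical usage sketch, assuming the interface described above.
#include "paddle/framework/executor.h"  // assumed header locations
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
#include "paddle/platform/place.h"

void RunOnce(const paddle::framework::ProgramDesc& program) {
  paddle::framework::Scope scope;    // persistent container of variable instances
  paddle::platform::CPUPlace place;  // the place the executor computes on
  paddle::framework::Executor executor({place});
  // Evaluate block 0 (the entrance block); with create_local_scope=true the
  // temporary variables live in a child scope that is dropped afterwards.
  executor.Run(program, &scope, /*block_id=*/0, /*create_local_scope=*/true);
}
```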
Binary file not shown (image added: 108 KiB).
Binary file not shown (image added: 33 KiB).
Some files were not shown because too many files have changed in this diff.