Merge branch 'Pdv' into dis_ckpt_fix

revert-12864-feature/process_lod_grad
tangwei12 7 years ago
commit 438de1e03b

@ -136,6 +136,12 @@ else()
set(THIRD_PARTY_BUILD_TYPE Release)
endif()
if(WITH_MKL)
option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF)
if (MKL_SPLIT_GEMM)
add_definitions(-DPADDLE_MKL_SPLIT_GEMM)
endif()
endif()
set(WITH_MKLML ${WITH_MKL})
if (NOT DEFINED WITH_MKLDNN)
if (WITH_MKL AND AVX2_FOUND)
@ -282,7 +288,3 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
if (WITH_CONTRIB)
add_subdirectory(paddle/contrib)
endif()

@ -18,7 +18,21 @@ learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
### Lastest PaddlePaddle Version: [Fluid](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid)
### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0)
### Install Latest Stable Release:
```
# Linux CPU
pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==0.14.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==0.14.0.post85
# For installation on other platform, refer to http://paddlepaddle.org/
```
## Features

@ -1,3 +1,5 @@
#!/bin/bash
set -e
function train() {

@ -1,3 +1,5 @@
#!/bin/bash
set -e
function clock_to_seconds() {

@ -1,3 +1,5 @@
#!/bin/bash
set -e
function train() {

@ -1,3 +1,5 @@
#!/bin/bash
set -e
function clock_to_seconds() {

@ -1,3 +1,5 @@
#!/bin/bash
set -e
function train() {

@ -1,3 +1,5 @@
#!/bin/bash
set -e
function train() {

@ -1,3 +1,5 @@
#!/bin/bash
set -e
function test() {

@ -1,3 +1,5 @@
#!/bin/bash
set -e
function test() {

@ -1,3 +1,5 @@
#!/bin/bash
set -e
function test() {

@ -1,3 +1,5 @@
#!/bin/bash
set -e
function test() {

@ -138,25 +138,24 @@ copy(memory_lib
set(inference_deps paddle_fluid_shared paddle_fluid)
if(WITH_CONTRIB)
message(STATUS "installing contrib")
set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
if (WITH_ANAKIN AND WITH_GPU)
copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin)
list(APPEND inference_deps contrib_anakin_inference_lib)
endif()
copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
DSTS ${contrib_dst_dir} ${contrib_dst_dir})
list(APPEND inference_deps contrib_inference_lib)
set(module "inference/api")
if (WITH_ANAKIN AND WITH_GPU)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
list(APPEND inference_deps anakin_inference_lib)
endif()
copy(inference_api_lib DEPS paddle_inference_api paddle_inference_api_shared
SRCS ${src_dir}/${module}/paddle_inference_api.h
${src_dir}/${module}/demo_ci
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libpaddle_inference_api*
DSTS ${dst_dir}/inference ${dst_dir}/inference ${dst_dir}/inference
)
list(APPEND inference_deps inference_api_lib)
set(module "inference")
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*

@ -4,25 +4,43 @@ set(tmp_version "HEAD")
set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
while ("${PADDLE_VERSION}" STREQUAL "")
# Check current branch name
execute_process(
COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_TAG_NAME
RESULT_VARIABLE GIT_RESULT
OUTPUT_VARIABLE GIT_BRANCH_NAME
RESULT_VARIABLE GIT_BRANCH_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT ${GIT_RESULT})
# Check the tag is a correct version
if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
# if no tag was found, set PADDLE_VERSION to latest
set(PADDLE_VERSION "latest")
elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
else() # otherwise, get the previous git tag name.
set(tmp_version "${GIT_TAG_NAME}~1")
if (NOT ${GIT_BRANCH_RESULT})
execute_process(
COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_TAG_NAME
RESULT_VARIABLE GIT_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT ${GIT_RESULT})
# Check if current branch is release branch
if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}")
# Check the tag is a correct version
if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
# if no tag was found, set PADDLE_VERSION to 0.0.0 to represent latest
set(PADDLE_VERSION "0.0.0")
elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
else() # otherwise, get the previous git tag name.
set(tmp_version "${GIT_TAG_NAME}~1")
endif()
else()
# otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
set(PADDLE_VERSION "0.0.0")
endif()
else()
set(PADDLE_VERSION "0.0.0")
message(WARNING "Cannot add paddle version from git tag")
endif()
else()
set(PADDLE_VERSION "0.0.0")
message(WARNING "Cannot add paddle version from git tag")
message(WARNING "Cannot add paddle version for wrong git branch result")
endif()
endwhile()

@ -1768,3 +1768,11 @@ reverse
.. autofunction:: paddle.fluid.layers.reverse
:noindex:
.. _api_fluid_layers_rank_loss:
rank_loss
-------
.. autofunction:: paddle.fluid.layers.rank_loss
:noindex:

@ -0,0 +1,89 @@
## Motivation
There is a ```gap``` between the ```Program``` defined by
user and the ```Executable``` that can be scheduled
efficiently on heterogeneous hardware, either locally
or distributedly.
Usually, the ```gap``` is bridged by
* A serious transformations with defined order.
* These transformations usually involve
```insert, delete, clustering, split, dependency analysis```.
* Has a simple way to verify and debug each transformation.
* Flexible to add, remove or customize transformations to fit
the requirements of various algorithms (models) and hardware secenarios.
Some other events also push us to a better unified pattern.
* The deep learning framework is built around the concepts of graphs.
To leverage tools such as compilation (e.g. TVM and nGraph) or
cross-framework conversion (e.g. ONNX), we also need a intermediate
representation that can be connected to the rest of the ecosystem.
We need a unified pattern to naturally support the requirements
described above. The pattern should fit both training, inference
and other offline serielized model transformations.
Learned from LLVM and other deep learning framework, we draft the
design below.
## Design
### Major Concepts
#### Node
```Node``` represents an operation that performs some computation or
a variable that is input or output of operation.
```Node```s are connected to other ```Node```s via inputs and outputs.
Other properties (maybe device placement information) can be added
to ```Node``` in the future if it's a
common requirement of many other ```Pass```es. Otherwise, it should live
in a ```Node``` wrapper class that is private to some ```Pass``` or be
a local member of a ```Pass```.
#### Graph
```Graph``` contains a list of ```Node```s, which are connected to
each other via inputs and outputs.
TODO: Better definitions for the graph.
```Graph``` can also contain ```Attribute```s. ```Attribute```s
can be ``any`` thing. For example, it can be a list of "wraper"
nodes. The ```wrapper``` nodes compose ```Node```s and provide
helper method for execution or transformation. ```Attribute```
can also contain other things that describe some properties of
the ```Graph``` or ```Graph``` nodes. ```Attribute``` can be passed
across ```Pass```. However, it should be used with care.
#### Pass
```Pass``` represents a transformation of ```Graph```. Its input
is a ```Graph``` and its output is also a ```Graph```. For example,
a ```Pass``` can simply print out the ```Graph```. A ```Pass```
can also fuse some ```Graph```'s ```Node```s.
#### Optimize
```Optimize``` contains a series of ```Pass``` with defined order.
```Optimize``` transforms a ```Graph``` that only contains raw
modeling logic to a ```Graph``` that can be run efficiently while
maintaining the original modeling logic.
### Optimize Process
* Program is first converted to Graph.
* Graph goes through a series of Pass
* Graph is transformed from raw model logic to a
form that is efficient to execute.
Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor

@ -7,13 +7,13 @@
====================== ========================================
版本说明 C++预测库
====================== ========================================
cpu_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz>`_
cpu_avx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz>`_
cpu_noavx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz>`_
cuda7.5_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda9.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz>`_
cpu_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
cpu_avx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
cpu_noavx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
cuda7.5_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
cuda8.0_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
cuda8.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
cuda9.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
====================== ========================================
从源码编译

@ -1,12 +1,16 @@
PaddlePaddle Fluid
==========================
.. PaddlePaddle Fluid documentation master file, created by
sphinx-quickstart on Thu Jun 7 17:04:53 2018.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
##############
欢迎使用 Fluid
##############
.. toctree::
:maxdepth: 1
:maxdepth: 1
getstarted/index_cn.rst
build_and_install/index_cn.rst
design/index_cn.rst
howto/index_cn.rst
dev/index_cn.rst
faq/index_cn.rst
new_docs/beginners_guide/index.rst
new_docs/user_guides/index.rst
new_docs/advanced_usage/index.rst
new_docs/faq/index_cn.rst

@ -0,0 +1,120 @@
#################
如何进行基准测试
#################
本文介绍如何给深度学习框架做基准测试。基准测试主要包含验证模型的精度和性能两方面,下文包含搭建测试环境,选择基准测试模型,验证测试结果等几方面内容。
验证深度学习框架,可分为训练和测试两个阶段, 验证指标略有不同本文只介绍训练阶段的指标验证。训练阶段关注的是模型训练集上的精度训练集是完备的因此关注大batch\_size下的训练速度,关注吞吐量例如图像模型常用的batch\_size=128, 多卡情况下会加大预测阶段关注的是在测试集上的精度线上服务测试数据不能提前收集因此关注小batch\_size下的预测速度关注延迟例如预测服务常用的batch\_size=1, 4等。
`Fluid <https://github.com/PaddlePaddle/Paddle>`__ 是PaddlePaddle从0.11.0版本开始引入的设计,本文的基准测试在该版本上完成。
环境搭建
""""""""""""
基准测试中模型精度和硬件、框架无关,由模型结构和数据共同决定;性能方面由测试硬件和框架性能决定。框架基准测试为了对比框架之间的差异,控制硬件环境,系统库等版本一致。下文中的对比实验都在相同的硬件条件和系统环境条件下进行.
不同架构的GPU卡性能差异巨大在验证模型在GPU上训练性能时可使用NVIDIA提供的工具:code `nvidia-smi` 检验当前使用的GPU型号如果测试多卡训练性能需确认硬件连接是 `nvlink <https://zh.wikipedia.org/zh/NVLink>`__`PCIe <https://zh.wikipedia.org/zh-hans/PCI_Express>`__ 。 同样地CPU型号会极大影响模型在CPU上的训练性能。可读取`/proc/cpuinfo`中的参数确认当前正在使用的CPU型号。
下载GPU对应的Cuda Tool Kit和 Cudnn或者使用NVIDIA官方发布的nvidia-docker镜像 `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`__, 镜像内包含了Cuda和Cudnn本文采用这种方式。 Cuda Tool Kit包含了GPU代码使用到的基础库影响在此基础上编译出的Fluid二进制运行性能。
准备好Cuda环境后从github上的下载Paddle并源码编译会生成对应的最适合当前GPU的sm\_arch二进制\ `sm\_arch <https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html>`__\ 。另外cudnn对卷积类任务影响巨大在基准测试中需要小版本一致例如Cudnn7.0.2与Cudnn7.1.4在Resnet上有5%以上差异。
选择基准模型
""""""""""""
对框架做基准测试需要覆盖不同训练任务和不同大小的模型本文中选取了图像和NLP的最为常用的5个模型。
============ ============ ================= ============
任务种类 模型名称 网络结构 数据集
============ ============ ================= ============
图像分类 mnist Lenet mnist
图像分类 VGG VGG-16 Flowers102
图像分类 Resnet Resnet-50 Flowers102
文本分类 Stacked-LSTM Stacked-LSTM IMDB
机器翻译 seq-seq Stacked-LSTM wmt14
============ ============ ================= ============
其中mnist, VGG, Resnet属于CNN模型, stacked-lstm, seq2seq代表RNN模型。
`benchmark <https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/fluid>`__
基准模型测试脚本中均跳过了前几个batch的训练过程原因是加载数据和分配显存受系统当前运行情况影响会导致统计性能不准确。运行完若干个轮次后统计对应指标。
基准模型的数据的选择方面数据量大且验证效果多的公开数据集为首选。图像模型VGG和resnet, 本文选择了 `flowers102 <http://www.robots.ox.ac.uk/~vgg/data/flowers/102/>`__ 图像大小预处理为和Imagenet相同大小因此性能可直接对比
NLP模型的公开且影响力大数据集较少seq2seq模型选择了wmt14数据stacked-lstm模型中选择了 `imdb <https://www.imdb.com/interfaces/>`__ 数据。
注意图像模型每条样本大小相同图像经过变换后大小一致因此经过的计算路径基本相同计算速度和显存占用波动较小可以从若干个batch的数据中采样得到当前的训练性能数据。而NLP模型由于样本长度不定计算路径和显存占用也不相同因此只能完整运行若干个轮次后统计速度和显存消耗。
显存分配是特别耗时的操作因此Fluid默认会占用所有可用显存空间形成显存池用以加速计算过程中的显存分配。如果需要统计模型真实显存消耗可设置环境变量`FLAGS_fraction_of_gpu_memory_to_use=0.0`,观察最大显存开销。
测试过程
""""""""""""
- CPU 单机单线程测试
测试CPU上单线程的性能先设置CUDA的环境变量为空``CUDA_VISIBLE_DEVICES=``并通过环境变量关闭OpenMP和MKL的多线程 ``OMP_NUM_THREADS=1`` ``MKL_NUM_THREADS=1;``
然后代码中设置为使用CPUPlace如果使用Paddle代码库中的脚本只需要命令行参数传入 use_gpu=False即可。
.. code-block:: python
>>> import paddle.fluid as fluid
>>> place = fluid.CPUPlace()
.. code:: bash
docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark paddlepaddle/paddle:latest-dev /bin/bash
- GPU 单机单卡测试
本教程使用了Cuda8, Cudnn7.0.1。来源为:code `nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04`
.. code:: bash
nvidia-docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark -v /usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu paddlepaddle/paddle:latest-dev /bin/bash
在单卡上测试设置CUDA的环境变量使用一块GPU``CUDA_VISIBLE_DEVICES=0``
然后代码中设置为使用CUDAPlace如果使用Paddle代码库中的脚本只需要命令行参数传入 use_gpu=True即可。
.. code-block:: python
>>> import paddle.fluid as fluid
>>> place = fluid.CUDAPlace(0) // 0 指第0块GPU
测试结果
""""""""""""
本教程对比相同环境下的Fluid0.12.0和TensorFlow1.4.0的性能表现。
硬件环境为 CPU: Intel(R) Xeon(R) CPU E5-2660 v4 @ 2.00GHz, GPU: TITAN X(Pascal) 12G x 1, Nvidia-Driver 384.90。
系统环境为Ubuntu 16.04.3 LTS, 本文中采用了docker环境系统版本为nvidia-docker17.05.0-ce。
测试的Fluid版本为\ `v.0.12.0 <https://github.com/PaddlePaddle/Paddle/releases/tag/v.0.12.0>`__
TensorFlow版本为\ `v.1.4.0-rc1 <https://github.com/tensorflow/tensorflow/tree/v1.4.0-rc1>`__
使用的脚本和配置见\ `benchmark <https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/fluid>`__
图表中统计单位为samples/秒。
- CPU 单机单线程测试结果
================ ==================== ===================
Speed Fluid CPU TensorFlow CPU
================ ==================== ===================
mnist 1298.75 samples/s 637.57 samples/s
VGG-16 0.4147 images/s 0.1229 images/s
Resnet-50 1.6935 images/s 0.3657 images/s
Stacked-LSTM 472.3225 words/s 48.2293words/s
Seq2Seq 217.1655 words/s 28.6164 words/s
================ ==================== ===================
- GPU 单机单卡测试结果
=============== ===================== =================
Speed Fluid GPU TensorFlow GPU
=============== ===================== =================
mnist 19710.90 samples/s 15576.3 samples/s
VGG-16 59.83327 images/s 40.9967 images/s
Resnet-50 105.84412 97.8923 images/s
Stacked-LSTM 1319.99315 1608.2526 words/s
Seq2Seq 7147.89081 6845.1161 words/s
=============== ===================== =================

@ -0,0 +1,56 @@
# Anakin ARM 性能测试
## 测试环境和参数:
+ 测试模型Mobilenetv1, mobilenetv2, mobilenet-ssd
+ 采用android ndk交叉编译gcc 4.9enable neon ABI armveabi-v7a with neon -mfloat-abi=softfp
+ 测试平台
- 荣耀v9(root): 处理器:麒麟960, 4 big cores in 2.36GHz, 4 little cores in 1.8GHz
- nubia z17:处理器:高通835, 4 big cores in 2.36GHz, 4 little cores in 1.9GHz
- 360 N5:处理器:高通653, 4 big cores in 1.8GHz, 4 little cores in 1.4GHz
+ 多线程openmp
+ 时间warmup10次运行10次取均值
+ ncnn版本来源于github的master branch中commits ID307a77f04be29875f40d337cfff6df747df09de6msg:convert LogisticRegressionOutput)版本
+ TFlite版本来源于github的master branch中commits ID65c05bc2ac19f51f7027e66350bc71652662125cmsg:Removed unneeded file copy that was causing failure in Pi builds)版本
在BenchMark中本文将使用**`ncnn`**、**`TFlite`**和**`Anakin`**进行性能对比分析
## BenchMark model
> 注意在性能测试之前请先将测试model通过[External Converter](#10003)转换为Anakin model
> 对这些model本文在ARM上进行多线程的单batch size测试。
- [Mobilenet v1](#11) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载*
- [Mobilenet v2](#22) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载*
- [mobilenet-ssd](#33) *caffe model 可以在[这儿](https://github.com/chuanqi305/MobileNet-SSD)下载*
### <span id = '11'> mobilenetv1 </span>
|platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
|:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
|麒麟960|107.7ms|61.1ms|38.2ms|152.8ms|85.2ms|51.9ms|152.6ms|nan|nan|
|高通835|105.7ms|63.1ms|~~46.8ms~~|152.7ms|87.0ms|~~92.7ms~~|146.9ms|nan|nan|
|高通653|120.3ms|64.2ms|46.6ms|202.5ms|117.6ms|84.8ms|158.6ms|nan|nan|
### <span id = '22'> mobilenetv2 </span>
|platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
|:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
|麒麟960|93.1ms|53.9ms|34.8ms|144.4ms|84.3ms|55.3ms|100.6ms|nan|nan|
|高通835|93.0ms|55.6ms|41.1ms|139.1ms|88.4ms|58.1ms|95.2ms|nan|nan|
|高通653|106.6ms|64.2ms|48.0ms|199.9ms|125.1ms|98.9ms|108.5ms|nan|nan|
### <span id = '33'> mobilenet-ssd </span>
|platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
|:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
|麒麟960|213.9ms|120.5ms|74.5ms|307.9ms|166.5ms|104.2ms|nan|nan|nan|
|高通835|213.0ms|125.7ms|~~98.4ms~~|292.9ms|177.9ms|~~167.8ms~~|nan|nan|nan|
|高通653|236.0ms|129.6ms|96.0ms|377.7ms|228.9ms|165.0ms|nan|nan|nan
## How to run those Benchmark models?
1. 首先, 使用[External Converter](../docs/Manual/Converter_en.md)对caffe model 进行转换
2. 然后将转换后的Anakin model和编译好的benchmark_arm 二进制文件通过'adb push'命令上传至测试机
3. 接着在测试机含有Anakin model的目录中运行'./benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1' 命令
4. 最后,终端显示器上将会打印该模型的运行时间
5. 其中运行命令的参数个数和含义可以通过运行'./benchmark_arm'看到

@ -0,0 +1,28 @@
# Example
Anakin目前只支持NCHW的格式
示例文件在test/framework/net下
## 在NV的GPU上运行CNN模型
示例文件为打开example_nv_cnn_net.cpp,整体流程如下:
- 将模型的的path设置为anakin模型的路径初始化NV平台的图对象。 anakin模型可以通过转换器转化caffe或fluid的模型得到
- 根据模型设置网络图的输入尺寸,进行图优化
- 根据优化后的网络图初始化网络执行器
- 取出网络的输入tensor将数据拷贝到输入tensor
- 运行推导
- 取出网络的输出tensor
以NV平台为例演示Anakin框架的使用方法注意编译时需要打开GPU编译开关
## 在X86上运行RNN模型
示例文件为example_x86_rnn_net.cpp
整体流程与在NV的GPU上运行CNN模型相似不同之处如下:
- 使用X86标识初始化图对象和网络执行器对象
- rnn模型的输入尺寸是可变的初始化图时的输入维度是维度的最大值,输入维度N代表总的词的个数。还需要设置输入tensor的seq_offset来标示这些词是如何划分为句子的,如{0,5,12}表示共有12个词其中第0到第4个词是第一句话第5到第11个词是第二句话
X86平台为例演示Anakin框架的使用方法注意编译时需要打开X86编译开关
## 在NV的GPU上使用Anakin的线程池运行CNN模型
示例文件为example_nv_cnn_net_multi_thread.cpp 示例使用worker的同步预测接口
整体流程与在NV的GPU上运行CNN模型相似不同之处如下:
- 用模型地址和线程池大小初始化worker对象
- 将输入tensor注入任务队列,获得输出tensor

@ -0,0 +1,170 @@
# Anakin GPU Benchmark
## Machine:
> CPU: `12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz`
> GPU: `Tesla P4`
> cuDNN: `v7`
## Counterpart of anakin :
The counterpart of **`Anakin`** is the acknowledged high performance inference engine **`NVIDIA TensorRT 3`** , The models which TensorRT 3 doesn't support we use the custom plugins to support.
## Benchmark Model
The following convolutional neural networks are tested with both `Anakin` and `TenorRT3`.
You can use pretrained caffe model or the model trained by youself.
> Please note that you should transform caffe model or others into anakin model with the help of [`external converter ->`](../docs/Manual/Converter_en.md)
- [Vgg16](#1) *caffe model can be found [here->](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)*
- [Yolo](#2) *caffe model can be found [here->](https://github.com/hojel/caffe-yolo-model)*
- [Resnet50](#3) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
- [Resnet101](#4) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
- [Mobilenet v1](#5) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
- [Mobilenet v2](#6) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
- [RNN](#7) *not support yet*
We tested them on single-GPU with single-thread.
### <span id = '1'>VGG16 </span>
- Latency (`ms`) of different batch
| BatchSize | TensorRT | Anakin |
| --- | --- | --- |
| 1 | 8.8690 | 8.2815 |
| 2 | 15.5344 | 13.9116 |
| 4 | 26.6000 | 21.8747 |
| 8 | 49.8279 | 40.4076 |
| 32 | 188.6270 | 163.7660 |
- GPU Memory Used (`MB`)
| BatchSize | TensorRT | Anakin |
| --- | --- | --- |
| 1 | 963 | 997 |
| 2 | 965 | 1039 |
| 4 | 991 | 1115 |
| 8 | 1067 | 1269 |
| 32 | 1715 | 2193 |
### <span id = '2'>Yolo </span>
- Latency (`ms`) of different batch
| BatchSize | TensorRT | Anakin |
| --- | --- | --- |
| 1 | 16.4596| 15.2124 |
| 2 | 26.6347| 25.0442 |
| 4 | 43.3695| 43.5017 |
| 8 | 80.9139 | 80.9880 |
| 32 | 293.8080| 310.8810 |
- GPU Memory Used (`MB`)
| BatchSize | TensorRT | Anakin |
| --- | --- | --- |
| 1 | 1569 | 1775 |
| 2 | 1649 | 1815 |
| 4 | 1709 | 1887 |
| 8 | 1731 | 2031 |
| 32 | 2253 | 2907 |
### <span id = '3'> Resnet50 </span>
- Latency (`ms`) of different batch
| BatchSize | TensorRT | Anakin |
| --- | --- | --- |
| 1 | 4.2459 | 4.1061 |
| 2 | 6.2627 | 6.5159 |
| 4 | 10.1277 | 11.3327 |
| 8 | 17.8209 | 20.6680 |
| 32 | 65.8582 | 77.8858 |
- GPU Memory Used (`MB`)
| BatchSize | TensorRT | Anakin |
| --- | --- | --- |
| 1 | 531 | 503 |
| 2 | 543 | 517 |
| 4 | 583 | 541 |
| 8 | 611 | 589 |
| 32 | 809 | 879 |
### <span id = '4'> Resnet101 </span>
- Latency (`ms`) of different batch
| BatchSize | TensorRT | Anakin |
| --- | --- | --- |
| 1 | 7.5562 | 7.0837 |
| 2 | 11.6023 | 11.4079 |
| 4 | 18.3650 | 20.0493 |
| 8 | 32.7632 | 36.0648 |
| 32 | 123.2550 | 135.4880 |
- GPU Memory Used (`MB)`
| BatchSize | TensorRT | Anakin |
| --- | --- | --- |
| 1 | 701 | 683 |
| 2 | 713 | 697 |
| 4 | 793 | 721 |
| 8 | 819 | 769 |
| 32 | 1043 | 1059 |
### <span id = '5'> MobileNet V1 </span>
- Latency (`ms`) of different batch
| BatchSize | TensorRT | Anakin |
| --- | --- | --- |
| 1 | 45.5156 | 1.3947 |
| 2 | 46.5585 | 2.5483 |
| 4 | 48.4242 | 4.3404 |
| 8 | 52.7957 | 8.1513 |
| 32 | 83.2519 | 31.3178 |
- GPU Memory Used (`MB`)
| BatchSize | TensorRT | Anakin |
| --- | --- | --- |
| 1 | 329 | 283 |
| 2 | 345 | 289 |
| 4 | 371 | 299 |
| 8 | 393 | 319 |
| 32 | 531 | 433 |
### <span id = '6'> MobileNet V2</span>
- Latency (`ms`) of different batch
| BatchSize | TensorRT | Anakin |
| --- | --- | --- |
| 1 | 65.6861 | 2.9842 |
| 2 | 66.6814 | 4.7472 |
| 4 | 69.7114 | 7.4163 |
| 8 | 76.1092 | 12.8779 |
| 32 | 124.9810 | 47.2142 |
- GPU Memory Used (`MB`)
| BatchSize | TensorRT | Anakin |
| --- | --- | --- |
| 1 | 341 | 293 |
| 2 | 353 | 301 |
| 4 | 385 | 319 |
| 8 | 421 | 351 |
| 32 | 637 | 551 |
## How to run those Benchmark models?
> 1. At first, you should parse the caffe model with [`external converter`](https://github.com/PaddlePaddle/Anakin/blob/b95f31e19993a192e7428b4fcf852b9fe9860e5f/docs/Manual/Converter_en.md).
> 2. Switch to *source_root/benchmark/CNN* directory. Use 'mkdir ./models' to create ./models and put anakin models into this file.
> 3. Use command 'sh run.sh', we will create files in logs to save model log with different batch size. Finally, model latency summary will be displayed on the screen.
> 4. If you want to get more detailed information with op time, you can modify CMakeLists.txt with setting `ENABLE_OP_TIMER` to `YES`, then recompile and run. You will find detailed information in model log file.

File diff suppressed because it is too large Load Diff

@ -0,0 +1,99 @@
.. _install_or_build_cpp_inference_lib:
安装与编译C++预测库
===========================
直接下载安装
-------------
====================== ========================================
版本说明 C++预测库
====================== ========================================
cpu_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz>`_
cpu_avx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz>`_
cpu_noavx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz>`_
cuda7.5_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda9.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz>`_
====================== ========================================
从源码编译
----------
用户也可以从 PaddlePaddle 核心代码编译C++预测库,只需在编译时配制下面这些编译选项:
================= =========
选项 值
================= =========
CMAKE_BUILD_TYPE Release
FLUID_INSTALL_DIR 安装路径
WITH_FLUID_ONLY ON推荐
WITH_SWIG_PY OFF推荐
WITH_PYTHON OFF推荐
WITH_GPU ON/OFF
WITH_MKL ON/OFF
================= =========
建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。
下面的代码片段从github拉取最新代码配制编译选项需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径
.. code-block:: bash
pip install paddlepaddle-gpu
PADDLE_ROOT=/path/of/capi
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir build
cd build
cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_FLUID_ONLY=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_PYTHON=OFF \
-DWITH_MKL=OFF \
-DWITH_GPU=OFF \
..
make
make inference_lib_dist
成功编译后使用C++预测库所需的依赖包括1编译出的PaddlePaddle预测库和头文件2第三方链接库和头文件3版本信息与编译选项信息
均会存放于PADDLE_ROOT目录中。目录结构如下
.. code-block:: text
PaddleRoot/
├── CMakeCache.txt
├── paddle
│   └── fluid
│   ├── framework
│   ├── inference
│   ├── memory
│   ├── platform
│   ├── pybind
│   └── string
├── third_party
│   ├── boost
│   │   └── boost
│   ├── eigen3
│   │   ├── Eigen
│   │   └── unsupported
│   └── install
│   ├── gflags
│   ├── glog
│   ├── mklml
│   ├── protobuf
│   ├── snappy
│   ├── snappystream
│   └── zlib
└── version.txt
version.txt 中记录了该预测库的版本信息包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号
.. code-block:: text
GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
WITH_MKL: ON
WITH_GPU: ON
CUDA version: 8.0
CUDNN version: v5

@ -0,0 +1,73 @@
# 模型转换指南
Anakin 支持不同框架的模型预测。但由于格式的差别Anakin 需要您预先转换模型。本文档介绍如何转换模型。
## 简介
Anakin 模型转换器输入支持 Caffe 和 Fluid 两种格式的预测模型模型包含网络结构model 或 prototxt和权重参数param 或 caffemodel
模型转换的输出是一个 bin 文件,它作为 Anakin 框架的 graph 参数导入。
您还可以使用模型转换器的 launch board 功能生成网络结构的 HTML 预览。
## 系统要求
- python 2.7+
- pyyaml
- flask
- protobuf 3.5+
## 用法
### 1、环境
转换器所需的依赖标注于 *系统要求* 一节。
### 2、配置
您需要对 *config.yaml* 文件进行修改以告知您的需求。工程中给出了 *config.yaml* 示例,下面作进一步说明。
#### config.yaml
```bash
OPTIONS:
Framework: CAFFE # 依框架类型填写 CAFFE 或 FLUID
SavePath: ./output # 转换结束后模型的保存位置
ResultName: googlenet # 输出模型的名字
Config:
LaunchBoard: ON # 是否生成网络结构预览页面
Server:
ip: 0.0.0.0
port: 8888 # 从一个可用端口访问预览页面
OptimizedGraph: # 当您使用了 Anakin 框架的 Optimized 功能时,才应该打开此项
enable: OFF
path: /path/to/anakin_optimized_anakin_model/googlenet.anakin.bin.saved
LOGGER:
LogToPath: ./log/ # 生成日志的路径
WithColor: ON
TARGET:
CAFFE:
# 当 Framework 为 CAFFE 时需填写
ProtoPaths:
- /path/to/caffe/src/caffe/proto/caffe.proto
PrototxtPath: /path/to/your/googlenet.prototxt
ModelPath: /path/to/your/googlenet.caffemodel
FLUID:
# 当 Framework 为 FLUID 时需填写
Debug: NULL
ProtoPaths:
- /
PrototxtPath: /path/to/fluid/inference_model
ModelPath: /path/to/fluid/inference_model
# ...
```
### 3、转换
在完成配置文件的修改后,您只需执行 ```python converter.py``` 就可以进行模型转换了。
### 4、预览
最后一步,就是在浏览器中查看令人振奋的转换结果!网址是在 *config.yaml* 中配置的,例如 http://0.0.0.0:8888 。
> 注意:若您使用了默认的 IP 地址 0.0.0.0,请在预览时使用真实的服务器地址 real_ip:port 替代它。

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save