Merge branch 'develop' of github.com:PaddlePaddle/Paddle into dist_recordio

wangkuiyi-patch-1
Yancey1989 7 years ago
commit 8855d4a755

@ -4,6 +4,7 @@
| backyes | Yan-Fei Wang | | backyes | Yan-Fei Wang |
| baiyfbupt | Yi-Fan Bai | | baiyfbupt | Yi-Fan Bai |
| beckett1124 | Bin Qi | | beckett1124 | Bin Qi |
| ChengduoZH | Cheng-Duo Zhao |
| chengxiaohua1105 | Xiao-Hua Cheng | | chengxiaohua1105 | Xiao-Hua Cheng |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| cxysteven | Xing-Yi Cheng | | cxysteven | Xing-Yi Cheng |

@ -29,7 +29,7 @@ RUN apt-get update && \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \ curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \ python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format swig doxygen cmake \ automake locales clang-format swig cmake \
liblapack-dev liblapacke-dev \ liblapack-dev liblapacke-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools libtool ccache && \ net-tools libtool ccache && \

@ -0,0 +1,22 @@
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
RUN pip install -U pip
RUN pip install -U kubernetes opencv-python paddlepaddle
# IMPORTANT:
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
RUN pip uninstall -y paddlepaddle && mkdir /workspace
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py dataset.py models/ /workspace/

@ -44,11 +44,25 @@ Currently supported `--model` argument include:
## Run Distributed Benchmark on Kubernetes Cluster ## Run Distributed Benchmark on Kubernetes Cluster
You may need to build a Docker image before submitting a cluster job onto Kubernetes; otherwise you would
have to start all those processes manually on each node, which is not recommended.
To build the Docker image, you need to choose a PaddlePaddle "whl" package to run with. You can either
download it from
http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
build it yourself. Once you have the "whl" package, put it in the current directory and run:
```bash
docker build -t [your docker image name]:[your docker image tag] .
```
Then push the image to a Docker registry that your Kubernetes cluster can reach.
We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
distributed benchmark jobs to your cluster. To generate a job yaml, just run: distributed benchmark jobs to your cluster. To generate a job yaml, just run:
```bash ```bash
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
``` ```
Then the yaml files are generated under directory `myjob`, you can run: Then the yaml files are generated under directory `myjob`, you can run:

@ -49,7 +49,7 @@ def parse_args():
parser.add_argument( parser.add_argument(
'--fluid', default=1, type=int, help='whether is fluid job') '--fluid', default=1, type=int, help='whether is fluid job')
parser.add_argument( parser.add_argument(
'--rdma', action='store_ture', help='whether mount rdma libs') '--rdma', action='store_true', help='whether mount rdma libs')
parser.add_argument( parser.add_argument(
'--disttype', '--disttype',
default="pserver", default="pserver",

@ -37,7 +37,8 @@ nohup stdbuf -oL nvidia-smi \
-l 1 & -l 1 &
# mnist # mnist
# mnist gpu mnist 128 # mnist gpu mnist 128
FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=mnist \
--device=GPU \ --device=GPU \
--batch_size=128 \ --batch_size=128 \
--skip_batch_num=5 \ --skip_batch_num=5 \
@ -46,7 +47,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
# vgg16 # vgg16
# gpu cifar10 128 # gpu cifar10 128
FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=vgg16 \
--device=GPU \ --device=GPU \
--batch_size=128 \ --batch_size=128 \
--skip_batch_num=5 \ --skip_batch_num=5 \
@ -54,7 +56,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
2>&1 | tee -a vgg16_gpu_128.log 2>&1 | tee -a vgg16_gpu_128.log
# flowers gpu 128 # flowers gpu 128
FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=vgg16 \
--device=GPU \ --device=GPU \
--batch_size=32 \ --batch_size=32 \
--data_set=flowers \ --data_set=flowers \
@ -64,40 +67,39 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
# resnet50 # resnet50
# resnet50 gpu cifar10 128 # resnet50 gpu cifar10 128
FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=resnet50 \
--device=GPU \ --device=GPU \
--batch_size=128 \ --batch_size=128 \
--data_set=cifar10 \ --data_set=cifar10 \
--model=resnet_cifar10 \
--skip_batch_num=5 \ --skip_batch_num=5 \
--iterations=30 \ --iterations=30 \
2>&1 | tee -a resnet50_gpu_128.log 2>&1 | tee -a resnet50_gpu_128.log
# resnet50 gpu flowers 64 # resnet50 gpu flowers 64
FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=resnet50 \
--device=GPU \ --device=GPU \
--batch_size=64 \ --batch_size=64 \
--data_set=flowers \ --data_set=flowers \
--model=resnet_imagenet \
--skip_batch_num=5 \ --skip_batch_num=5 \
--iterations=30 \ --iterations=30 \
2>&1 | tee -a resnet50_gpu_flowers_64.log 2>&1 | tee -a resnet50_gpu_flowers_64.log
# lstm # lstm
# lstm gpu imdb 32 # tensorflow only support batch=32 # lstm gpu imdb 32 # tensorflow only support batch=32
FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=stacked_dynamic_lstm \
--device=GPU \ --device=GPU \
--batch_size=32 \ --batch_size=32 \
--skip_batch_num=5 \ --skip_batch_num=5 \
--iterations=30 \ --iterations=30 \
--hidden_dim=512 \
--emb_dim=512 \
--crop_size=1500 \
2>&1 | tee -a lstm_gpu_32.log 2>&1 | tee -a lstm_gpu_32.log
# seq2seq # seq2seq
# seq2seq gpu wmb 128 # seq2seq gpu wmb 128
FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=machine_translation \
--device=GPU \ --device=GPU \
--batch_size=128 \ --batch_size=128 \
--skip_batch_num=5 \ --skip_batch_num=5 \

@ -1009,3 +1009,9 @@ ____
.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d .. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
:noindex: :noindex:
gather
____
.. autofunction:: paddle.fluid.layers.gather
:noindex:
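As a quick illustration of the newly documented layer, here is a minimal sketch assuming the Fluid Python API of this period; the tensor names are purely illustrative:
```python
import paddle.fluid as fluid

# Build a small graph that gathers rows of `x` selected by `index`.
x = fluid.layers.data(name='x', shape=[3, 4], dtype='float32', append_batch_size=False)
index = fluid.layers.data(name='index', shape=[2], dtype='int32', append_batch_size=False)
out = fluid.layers.gather(x, index)  # expected shape: [2, 4]
```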

@ -86,7 +86,7 @@
<br> <br>
<p align="center"> <p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_compiler.png" width=100%> <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid-compiler.png" width=100%>
</p> </p>
--- ---
@ -123,12 +123,12 @@
<font size=5> <font size=5>
- In scientific computing, the computation graph is a classic way to describe computation. The figure below shows how a complete computation graph is built, starting from the forward graph (blue) and adding backward (red) and optimizer-related (green) operations:
- -
<p align="center"> <p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/graph_construction_example_all.png" width=60%> <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/graph_construction_example_all.png" width=60%>
</p> </p>
- Fluid ==uses a `Program` rather than a computation graph== to describe the model and the optimization process. A `Program` consists of `Block`s, `Operator`s and `Variable`s; these concepts are covered in detail later.
- At compile time, Fluid takes a forward-computation `Program` (for now, think of it as an ordered sequence of computations) and, following the order forward -> backward -> gradient clip -> regularization -> optimization, adds the corresponding `Operator`s and `Variable`s to the `Program` to complete the computation.
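To make the Program/Block/Operator/Variable relationship above concrete, here is a minimal sketch using the Fluid Python API; the tiny network is arbitrary and only for illustration:
```python
import paddle.fluid as fluid

main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')  # adds Variables
    y = fluid.layers.fc(input=x, size=1)                          # adds Operators

block = main_program.global_block()   # Block 0 of the Program
print([op.type for op in block.ops])  # Operators inserted by the layers above
```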
@ -328,7 +328,7 @@
</font> </font>
--- ---
### Compile-time concept ==**[Transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)**==
<font size=5> <font size=5>
@ -402,7 +402,7 @@
- `Scope` - `Scope`
- Computation related
- `Block` - `Block`
- `Kernel`、`OpWithKernel`、`OpWithoutKernel` - `Kernel`、`OpWithKernel`、`OpWithoutKernel`
<table> <table>
@ -439,7 +439,7 @@
</tbody> </tbody>
</table> </table>
- Execution related: `Executor`
</font> </font>
@ -798,7 +798,7 @@ class GPUAllocator : public SystemAllocator {
- step 1: add a Place type, <span style="background-color:#DAB1D5;">implemented by the user and added to the framework</span>
- A Place can be understood as an integer plus an enum: device id + device type
<p align="center"> <p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/place.png" width=40%> <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/place.png" width=40%>
</p> </p>
@ -824,7 +824,7 @@ class GPUAllocator : public SystemAllocator {
1. DataType: the data type used for execution (FP32/FP64/INT32/INT64)
1. Memory layout: how the runtime Tensor is laid out in memory (NCHW, NHWC)
1. The library used
to distinguish Kernels and register multiple Kernels for the same operator.
```cpp ```cpp
@ -876,7 +876,7 @@ step 3: 运行时的 KernelType 推断和Kernel切换<span style="background-
namespace framework { namespace framework {
using LoDTensorArray = std::vector<LoDTensor>; using LoDTensorArray = std::vector<LoDTensor>;
} }
} }
``` ```
- In each iteration, a slice is "cut out" of the original input
- LoDTensorArray is exposed on the Python side; it is one of the basic data structures supported by Fluid and users can create and use it directly
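A small Python sketch of working with the tensor array described above, assuming the `fluid.layers.array_write` / `array_read` interface; the names are illustrative:
```python
import paddle.fluid as fluid

x = fluid.layers.fill_constant(shape=[2, 3], dtype='float32', value=1.0)
i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
arr = fluid.layers.array_write(x, i)          # creates a LoDTensorArray and writes x at slot i
read_back = fluid.layers.array_read(arr, i)   # reads the same slot back
```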
@ -910,7 +910,7 @@ void Run(const framework::Scope &scope,
false /*create_local_scope*/); false /*create_local_scope*/);
} }
} }
``` ```
</font> </font>
@ -951,7 +951,7 @@ void Run(const framework::Scope &scope,
--- ---
#### Memory in dynamicRNN
<font size=5> <font size=5>
@ -961,7 +961,7 @@ void Run(const framework::Scope &scope,
- `memory` performs its forward computation after operator A's forward computation
- The forward computation of `memory` then "points to" A's output LoDTensor
- The output of `memory` can be the input of another operator, forming the "recurrent" connection
</font> </font>
--- ---
@ -1107,7 +1107,7 @@ void Run(const framework::Scope &scope,
<td> <td>
<p align="center"> <p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_1.png" width=60%> <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_1.png" width=60%>
</p> </p>
</td> </td>
<td> <td>
<p align="center"> <p align="center">
@ -1127,13 +1127,13 @@ void Run(const framework::Scope &scope,
<font size=5> <font size=5>
- Design overview
- Refactoring overview [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/refactorization.md)
- fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md) - fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md)
- fluid_compiler [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md) - fluid_compiler [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
- Core concepts
- variable description [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/var_desc.md)
- Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md) - Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md)
- LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
- TensorArray [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) - TensorArray [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md)
- Program [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md) - Program [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md)
- Block [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) - Block [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md)
@ -1152,7 +1152,7 @@ void Run(const framework::Scope &scope,
- Supporting new hardware device libraries [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md)
- Adding a new Operator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_cn.md)
- Adding a new Kernel [->](
https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md) https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md)
</font> </font>
@ -1167,10 +1167,10 @@ https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_
<font size=5> <font size=5>
Building PaddlePaddle from source with Docker: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)
PaddlePaddle images on Docker Hub: [->](
https://hub.docker.com/r/paddlepaddle/paddle/tags/)
1. Pull the PaddlePaddle Docker image
```bash ```bash
docker pull paddlepaddle/paddle:latest-dev docker pull paddlepaddle/paddle:latest-dev
@ -1183,7 +1183,7 @@ PaddlePaddle 在 Dockerhub 地址:[->](
``` ```
1. After entering the docker container, build from source; see the documentation [->]( http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/build_from_source_cn.html)
</font> </font>
--- ---
@ -1196,7 +1196,7 @@ PaddlePaddle 在 Dockerhub 地址:[->](
1. For development, the image tagged `latest-dev` is recommended, since it bundles all build dependencies. `latest` and `latest-gpu` are production images, intended mainly for running PaddlePaddle programs.
2. To run GPU programs in Docker, nvidia-docker is recommended; [otherwise the CUDA libraries and devices must be mounted into the Docker container](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html).
<font size=4> <font size=4>
```bash ```bash
nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
``` ```
@ -1353,9 +1353,9 @@ Op注册实现在`.cc`文件Kernel注册CPU实现在`.cc`文件中CUDA实
} }
}; };
``` ```
</font> </font>
--- ---
###### Implementing an Operator with a Kernel <span style="background-color:#c4e1e1;">step2</span>: define the Operator class
@ -1420,11 +1420,11 @@ class ClipOp : public framework::OperatorWithKernel {
2. Override the InferShape function (see [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.cc#L24))
1. What is a `functor`?
- A class or struct that only overloads `()`; usually a computation function that can be reused by multiple kernels.
<font size=4> <font size=4>
```cpp ```cpp
template <typename T> template <typename T>
class CrossEntropyFunctor<platform::CPUDeviceContext, T> { class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
@ -1438,9 +1438,9 @@ class ClipOp : public framework::OperatorWithKernel {
}; };
``` ```
</font> </font>
- clip_op also shows how a piece of computation is abstracted into a functor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h#L27).
</font> </font>
--- ---
@ -1504,7 +1504,7 @@ class ClipKernel : public framework::OpKernel<T> {
- Note that <span style="background-color:#e1c4c4;">Fluid does not distinguish between cost Ops and intermediate-layer Ops; every Op must correctly handle the gradients it receives</span>
2. Outputs of a backward Op
- gradients with respect to the learnable parameters
- gradients with respect to all inputs
</font> </font>
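From the Python side, the backward Ops described above are appended automatically; a minimal sketch assuming the `fluid.backward.append_backward` interface, with an illustrative regression network:
```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))
# Appends the gradient (backward) Operators to the default Program and
# returns (parameter, gradient) pairs for the learnable parameters.
params_grads = fluid.backward.append_backward(loss)
```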
@ -1520,7 +1520,7 @@ class ClipKernel : public framework::OpKernel<T> {
1. In the `.cc` file, register the forward and backward Op classes and register the CPU Kernel.
<font size=4> <font size=4>
```cpp ```cpp
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad, REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
@ -1530,13 +1530,13 @@ class ClipKernel : public framework::OpKernel<T> {
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>); clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>);
``` ```
- In the code snippet above:
1. `REGISTER_OP` : registers the `ops::ClipOp` class with type name `clip`, whose `ProtoMaker` is `ops::ClipOpMaker`, and registers `ops::ClipOpGrad` with type name `clip_grad`
1. `REGISTER_OP_WITHOUT_GRADIENT` : registers an Op that has no backward pass, e.g. Ops related to optimization algorithms
1. `REGISTER_OP_CPU_KERNEL` : registers the `ops::ClipKernel` class and specializes its template parameters to `paddle::platform::CPUPlace` and `float`; `ops::ClipGradKernel` is registered in the same way
</font> </font>
1. Register the GPU Kernel in the `.cu` file in the same way
- <span style="background-color:#e1c4c4;">If the CUDA Kernel implementation is based on Eigen, the macro `#define EIGEN_USE_GPU` must be added at the beginning of the `.cu` file</span>
@ -1593,7 +1593,7 @@ class ClipKernel : public framework::OpKernel<T> {
```bash ```bash
make test ARGS="-R test_mul_op -V" make test ARGS="-R test_mul_op -V"
``` ```
Or:
``` ```
@ -1613,7 +1613,7 @@ class ClipKernel : public framework::OpKernel<T> {
- If several Ops depend on some shared functions, a file that does not follow the `*_op.*` naming pattern can be created to hold them, e.g. `gather.h`.
</font> </font>
--- ---
### ==10.== Usage-related questions
@ -1735,7 +1735,7 @@ class ClipKernel : public framework::OpKernel<T> {
y_data = np.random.randint(0, 8, [1]).astype("int32") y_data = np.random.randint(0, 8, [1]).astype("int32")
y_tensor = core.Tensor() y_tensor = core.Tensor()
y_tensor.set(y_data, place) y_tensor.set(y_data, place)
x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32") x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32")
x_tensor = core.Tensor() x_tensor = core.Tensor()
x_tensor.set(x_data, place) x_tensor.set(x_data, place)

@ -17,3 +17,4 @@
:maxdepth: 1 :maxdepth: 1
concepts/use_concepts_cn.rst concepts/use_concepts_cn.rst
developer's_guide_to_paddle_fluid.md

@ -16,3 +16,4 @@ Here is an example of linear regression. It introduces workflow of PaddlePaddle,
:maxdepth: 1 :maxdepth: 1
concepts/index_en.rst concepts/index_en.rst
developer's_guide_to_paddle_fluid.md

@ -11,7 +11,7 @@ PaddlePaddle支持使用pip快速安装目前支持CentOS 6以上, Ubuntu 14.
pip install paddlepaddle pip install paddlepaddle
If you need the GPU-enabled version (cuda7.5_cudnn5_avx_openblas), run: If you need the GPU-enabled version (cuda8.0_cudnn5_avx_openblas), run:
.. code-block:: bash .. code-block:: bash
@ -28,18 +28,18 @@ PaddlePaddle支持使用pip快速安装目前支持CentOS 6以上, Ubuntu 14.
import paddle.dataset.uci_housing as uci_housing import paddle.dataset.uci_housing as uci_housing
import paddle.fluid as fluid import paddle.fluid as fluid
with fluid.scope_guard(fluid.core.Scope()): with fluid.scope_guard(fluid.core.Scope()):
# initialize executor with cpu # initialize executor with cpu
exe = fluid.Executor(place=fluid.CPUPlace()) exe = fluid.Executor(place=fluid.CPUPlace())
# load inference model # load inference model
[inference_program, feed_target_names,fetch_targets] = \ [inference_program, feed_target_names,fetch_targets] = \
fluid.io.load_inference_model(uci_housing.fluid_model(), exe) fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
# run inference # run inference
result = exe.run(inference_program, result = exe.run(inference_program,
feed={feed_target_names[0]: uci_housing.predict_reader()}, feed={feed_target_names[0]: uci_housing.predict_reader()},
fetch_list=fetch_targets) fetch_list=fetch_targets)
# print predicted price is $12,273.97 # print predicted price is $12,273.97
print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000) print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
Run :code:`python housing.py` and voila! It should print out a list of predictions for the housing data.

@ -12,7 +12,7 @@ Simply run the following command to install, the version is cpu_avx_openblas:
pip install paddlepaddle pip install paddlepaddle
If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run: If you need to install GPU version (cuda8.0_cudnn5_avx_openblas), run:
.. code-block:: bash .. code-block:: bash
@ -31,18 +31,18 @@ code:
import paddle.dataset.uci_housing as uci_housing import paddle.dataset.uci_housing as uci_housing
import paddle.fluid as fluid import paddle.fluid as fluid
with fluid.scope_guard(fluid.core.Scope()): with fluid.scope_guard(fluid.core.Scope()):
# initialize executor with cpu # initialize executor with cpu
exe = fluid.Executor(place=fluid.CPUPlace()) exe = fluid.Executor(place=fluid.CPUPlace())
# load inference model # load inference model
[inference_program, feed_target_names,fetch_targets] = \ [inference_program, feed_target_names,fetch_targets] = \
fluid.io.load_inference_model(uci_housing.fluid_model(), exe) fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
# run inference # run inference
result = exe.run(inference_program, result = exe.run(inference_program,
feed={feed_target_names[0]: uci_housing.predict_reader()}, feed={feed_target_names[0]: uci_housing.predict_reader()},
fetch_list=fetch_targets) fetch_list=fetch_targets)
# print predicted price is $12,273.97 # print predicted price is $12,273.97
print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000) print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
Run :code:`python housing.py` and voila! It should print out a list of predictions Run :code:`python housing.py` and voila! It should print out a list of predictions

@ -4,5 +4,5 @@
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
inference/index_cn.rst
optimization/index_cn.rst optimization/index_cn.rst
inference/inference_support_in_fluid.md

@ -5,4 +5,3 @@ HOW TO
:maxdepth: 1 :maxdepth: 1
optimization/index_en.rst optimization/index_en.rst
inference/inference_support_in_fluid.md

@ -0,0 +1,96 @@
Install and Build the C++ Inference Library
===========================================
Direct download and installation
--------------------------------
====================== ========================================
Version                C++ inference library
====================== ========================================
cpu_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz>`_
cpu_avx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz>`_
cpu_noavx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz>`_
cuda7.5_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
====================== ========================================
Build from source
-----------------
Users can also build the C++ inference library from the PaddlePaddle core source; just configure the following build options at compile time:
================= =========
Option            Value
================= =========
CMAKE_BUILD_TYPE Release
FLUID_INSTALL_DIR installation path
WITH_FLUID_ONLY   ON (recommended)
WITH_SWIG_PY      OFF (recommended)
WITH_PYTHON       OFF (recommended)
WITH_GPU ON/OFF
WITH_MKL ON/OFF
================= =========
Setting the recommended values is advised to avoid linking unnecessary libraries. Other optional build options can be set as needed.
The snippet below pulls the latest code from GitHub and configures the build options (replace PADDLE_ROOT with the installation path of the PaddlePaddle inference library):
.. code-block:: bash
pip install paddlepaddle-gpu
PADDLE_ROOT=/path/of/capi
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir build
cd build
cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_FLUID_ONLY=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_PYTHON=OFF \
-DWITH_MKL=OFF \
-DWITH_GPU=OFF \
..
make
make inference_lib_dist
After a successful build, the dependencies needed to use the C++ inference library, namely (1) the built PaddlePaddle inference library and headers, (2) the third-party link libraries and headers, and (3) version and build-option information,
are all placed in the PADDLE_ROOT directory, with the following layout:
.. code-block:: text
PaddleRoot/
├── CMakeCache.txt
├── paddle
│   └── fluid
│   ├── framework
│   ├── inference
│   ├── memory
│   ├── platform
│   ├── pybind
│   └── string
├── third_party
│   ├── boost
│   │   └── boost
│   ├── eigen3
│   │   ├── Eigen
│   │   └── unsupported
│   └── install
│   ├── gflags
│   ├── glog
│   ├── mklml
│   ├── protobuf
│   ├── snappy
│   ├── snappystream
│   └── zlib
└── version.txt
version.txt records the version information of the inference library, including the Git commit ID, whether the OpenBLAS or MKL math library is used, and the CUDA/cuDNN version numbers, for example:
.. code-block:: text
GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
WITH_MKL: ON
WITH_GPU: ON
CUDA version: 8.0
CUDNN version: v5

@ -0,0 +1,8 @@
Inference Library
-----------------
.. toctree::
:maxdepth: 1
build_and_install_lib_cn.rst
inference_support_in_fluid_cn.md

@ -1,9 +1,8 @@
# Fluid Inference User Guide # User Guide
## Contents:
- Python Inference API - Python Inference API
- Building the Fluid Inference library
- Inference C++ API - Inference C++ API
- Inference examples
- Inference computation optimization
@ -55,62 +54,6 @@
return [program, feed_target_names, fetch_targets] return [program, feed_target_names, fetch_targets]
``` ```
## Building the Fluid Inference library
- **No extra CMake options are required**
- 1. Configure the CMake command; for more options see [Building PaddlePaddle from source](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html)
```bash
$ git clone https://github.com/PaddlePaddle/Paddle.git
$ cd Paddle
$ mkdir build
$ cd build
$ cmake -DCMAKE_INSTALL_PREFIX=your/path/to/paddle_inference_lib \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_PYTHON=ON \
-DWITH_MKL=OFF \
-DWITH_GPU=OFF \
..
```
- 2. Build PaddlePaddle
```bash
$ make
```
- 3. Deploy. Run the following command to install the PaddlePaddle Fluid Inference library into the `your/path/to/paddle_inference_lib` directory.
```bash
$ make inference_lib_dist
```
- Directory structure
```bash
$ cd your/path/to/paddle_inference_lib
$ tree
.
|-- paddle
| `-- fluid
| |-- framework
| |-- inference
| | |-- io.h
| | `-- libpaddle_fluid.so
| |-- memory
| |-- platform
| `-- string
|-- third_party
| |-- eigen3
| `-- install
| |-- gflags
| |-- glog
| `-- protobuf
`-- ...
```
Assume `PADDLE_ROOT=your/path/to/paddle_inference_lib`.
## Linking against the Fluid Inference library
- Example project ([link](https://github.com/luotao1/fluid_inference_example.git))

@ -17,46 +17,33 @@ if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif(APPLE) endif(APPLE)
function(inference_api_test TARGET_NAME TEST_SRC) function(inference_api_test TARGET_NAME)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs ARGS) set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "") cc_test(test_paddle_inference_${TARGET_NAME}
SRCS test_paddle_inference_${TARGET_NAME}.cc
DEPS paddle_fluid_api paddle_inference_api
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
if(inference_test_ARGS) if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS}) set_tests_properties(test_paddle_inference_${TARGET_NAME}
list(APPEND arg_list "_${arg}") PROPERTIES DEPENDS "${inference_test_ARGS}")
endforeach()
else()
list(APPEND arg_list "_")
endif() endif()
foreach(arg ${arg_list})
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(${TARGET_NAME}
SRCS ${TEST_SRC}
DEPS paddle_fluid_api paddle_inference_api paddle_inference_api_impl
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
# TODO(panyx0178): Figure out how to add word2vec and image_classification
# as deps.
# set_tests_properties(${TARGET_NAME}
# PROPERTIES DEPENDS ${DEP_TEST})
endforeach()
endfunction(inference_api_test) endfunction(inference_api_test)
cc_library(paddle_inference_api cc_library(paddle_inference_api
SRCS paddle_inference_api.cc SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_library(paddle_inference_api_impl if(WITH_TESTING)
SRCS paddle_inference_api_impl.cc cc_test(test_paddle_inference_api
DEPS paddle_inference_api paddle_fluid_api) SRCS test_paddle_inference_api.cc
DEPS paddle_inference_api)
cc_test(test_paddle_inference_api inference_api_test(api_impl
SRCS test_paddle_inference_api.cc ARGS test_word2vec test_image_classification)
DEPS paddle_inference_api) endif()
inference_api_test(test_paddle_inference_api_impl
test_paddle_inference_api_impl.cc)

@ -40,15 +40,24 @@ struct PaddleBuf {
struct PaddleTensor { struct PaddleTensor {
std::string name; // variable name. std::string name; // variable name.
std::vector<int> shape; std::vector<int> shape;
// TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
PaddleBuf data; // blob of data. PaddleBuf data; // blob of data.
PaddleDType dtype; PaddleDType dtype;
}; };
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
// TODO(Superjomn) support following engines latter.
// kAnakin, // Use Anakin for inference.
// kTensorRT, // Use TensorRT for inference.
// kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
// kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
};
/* /*
* A simple Inference API for Paddle. Currently this API might just be used by * A simple Inference API for Paddle. Currently this API can be used by
* non-sequence scenerios. * non-sequence scenerios.
* TODO(Superjomn) Prepare another API for NLP-related usages. */
*/
class PaddlePredictor { class PaddlePredictor {
public: public:
struct Config; struct Config;
@ -66,34 +75,35 @@ class PaddlePredictor {
// be thread-safe. // be thread-safe.
virtual std::unique_ptr<PaddlePredictor> Clone() = 0; virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
virtual bool InitShared() { return false; }
// Destroy the Predictor. // Destroy the Predictor.
virtual ~PaddlePredictor() {} virtual ~PaddlePredictor() {}
friend std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
const PaddlePredictor::Config& config);
// The common configs for all the predictors. // The common configs for all the predictors.
struct Config { struct Config {
enum class EngineKind;
std::string model_dir; // path to the model directory. std::string model_dir; // path to the model directory.
bool enable_engine{false}; // Enable to execute (part of) the model on bool enable_engine{false}; // Enable to execute (part of) the model on
// third-party engines.
EngineKind engine_kind{Config::EngineKind::kNone};
enum class EngineKind {
kNone = -1, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
kTensorRT, // Use TensorRT for inference.
kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
};
}; };
}; };
// A factory to help create difference predictor. struct NativeConfig : public PaddlePredictor::Config {
template <typename ConfigT> // GPU related fields.
bool use_gpu{false};
int device{0};
float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization.
std::string prog_file;
std::string param_file;
};
// A factory to help create different predictors.
//
// FOR EXTENSION DEVELOPER:
// Different predictors are designated by config type and engine kind. Similar
// configs can be merged, but there shouldn't be a huge config containing
// different fields for more than one kind of predictors.
//
// Similarly, each engine kind should map to a unique predictor implementation.
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config); std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
} // namespace paddle } // namespace paddle

@ -54,11 +54,10 @@ std::string num2str(T a) {
} }
} // namespace } // namespace
bool PaddlePredictorImpl::Init() { bool NativePaddlePredictor::Init() {
VLOG(3) << "Predictor::init()"; VLOG(3) << "Predictor::init()";
// TODO(panyx0718): Should CPU vs GPU device be decided by id? if (config_.use_gpu) {
if (config_.device >= 0) {
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
} else { } else {
place_ = paddle::platform::CPUPlace(); place_ = paddle::platform::CPUPlace();
@ -85,19 +84,21 @@ bool PaddlePredictorImpl::Init() {
} }
ctx_ = executor_->Prepare(*inference_program_, 0); ctx_ = executor_->Prepare(*inference_program_, 0);
// Create variables // Create temporary variables first, so that the first batch do not need to
// TODO(panyx0718): Why need to test share_variables here? // create variables in the runtime. This is the logics of the old inference
if (config_.share_variables) { // API.
executor_->CreateVariables(*inference_program_, scope_.get(), 0); // TODO(Superjomn) this should be modified when `Clone` is valid for
} // multi-thread application.
executor_->CreateVariables(*inference_program_, scope_.get(), 0);
// Get the feed_target_names and fetch_target_names // Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames(); feed_target_names_ = inference_program_->GetFeedTargetNames();
fetch_target_names_ = inference_program_->GetFetchTargetNames(); fetch_target_names_ = inference_program_->GetFetchTargetNames();
return true; return true;
} }
bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs, bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) { std::vector<PaddleTensor> *output_data) {
VLOG(3) << "Predictor::predict"; VLOG(3) << "Predictor::predict";
Timer timer; Timer timer;
timer.tic(); timer.tic();
@ -124,7 +125,7 @@ bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs,
scope_.get(), scope_.get(),
&feed_targets, &feed_targets,
&fetch_targets, &fetch_targets,
!config_.share_variables); false /* don't create variable eatch time */);
if (!GetFetch(fetchs, output_data)) { if (!GetFetch(fetchs, output_data)) {
LOG(ERROR) << "fail to get fetchs"; LOG(ERROR) << "fail to get fetchs";
return false; return false;
@ -133,59 +134,20 @@ bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs,
return true; return true;
} }
std::unique_ptr<PaddlePredictor> PaddlePredictorImpl::Clone() { std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
VLOG(3) << "Predictor::clone"; VLOG(3) << "Predictor::clone";
std::unique_ptr<PaddlePredictor> cls(new PaddlePredictorImpl(config_)); std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
if (!cls->InitShared()) {
LOG(ERROR) << "fail to call InitShared"; if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init()) {
LOG(ERROR) << "fail to call Init";
return nullptr; return nullptr;
} }
// fix manylinux compile error. // fix manylinux compile error.
return std::move(cls); return std::move(cls);
} }
// TODO(panyx0718): Consider merge with Init()? bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
bool PaddlePredictorImpl::InitShared() { std::vector<framework::LoDTensor> *feeds) {
VLOG(3) << "Predictor::init_shared";
// 1. Define place, executor, scope
if (this->config_.device >= 0) {
place_ = platform::CUDAPlace();
} else {
place_ = platform::CPUPlace();
}
this->executor_.reset(new framework::Executor(this->place_));
this->scope_.reset(new framework::Scope());
// Initialize the inference program
if (!this->config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
this->inference_program_ = inference::Load(
this->executor_.get(), this->scope_.get(), this->config_.model_dir);
} else if (!this->config_.prog_file.empty() &&
!this->config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
this->inference_program_ = inference::Load(this->executor_.get(),
this->scope_.get(),
this->config_.prog_file,
this->config_.param_file);
}
this->ctx_ = this->executor_->Prepare(*this->inference_program_, 0);
// 3. create variables
// TODO(panyx0718): why test share_variables.
if (config_.share_variables) {
this->executor_->CreateVariables(
*this->inference_program_, this->scope_.get(), 0);
}
// 4. Get the feed_target_names and fetch_target_names
this->feed_target_names_ = this->inference_program_->GetFeedTargetNames();
this->fetch_target_names_ = this->inference_program_->GetFetchTargetNames();
return true;
}
bool PaddlePredictorImpl::SetFeed(const std::vector<PaddleTensor> &inputs,
std::vector<framework::LoDTensor> *feeds) {
VLOG(3) << "Predictor::set_feed"; VLOG(3) << "Predictor::set_feed";
if (inputs.size() != feed_target_names_.size()) { if (inputs.size() != feed_target_names_.size()) {
LOG(ERROR) << "wrong feed input size."; LOG(ERROR) << "wrong feed input size.";
@ -213,7 +175,7 @@ bool PaddlePredictorImpl::SetFeed(const std::vector<PaddleTensor> &inputs,
return true; return true;
} }
bool PaddlePredictorImpl::GetFetch( bool NativePaddlePredictor::GetFetch(
const std::vector<framework::LoDTensor> &fetchs, const std::vector<framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *outputs) { std::vector<PaddleTensor> *outputs) {
VLOG(3) << "Predictor::get_fetch"; VLOG(3) << "Predictor::get_fetch";
@ -280,23 +242,29 @@ bool PaddlePredictorImpl::GetFetch(
} }
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor( std::unique_ptr<PaddlePredictor>
const ConfigImpl &config) { CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
VLOG(3) << "create PaddlePredictorImpl"; const NativeConfig &config) {
// 1. GPU memeroy VLOG(3) << "create NativePaddlePredictor";
std::vector<std::string> flags; if (config.use_gpu) {
if (config.fraction_of_gpu_memory >= 0.0f || // 1. GPU memeroy
config.fraction_of_gpu_memory <= 0.95f) { PADDLE_ENFORCE(
flags.push_back("dummpy"); config.fraction_of_gpu_memory > 0.f,
std::string flag = "--fraction_of_gpu_memory_to_use=" + "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
num2str<float>(config.fraction_of_gpu_memory); std::vector<std::string> flags;
flags.push_back(flag); if (config.fraction_of_gpu_memory >= 0.0f ||
VLOG(3) << "set flag: " << flag; config.fraction_of_gpu_memory <= 0.95f) {
framework::InitGflags(flags); flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
num2str<float>(config.fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
}
} }
std::unique_ptr<PaddlePredictor> predictor(new PaddlePredictorImpl(config)); std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
if (!dynamic_cast<PaddlePredictorImpl *>(predictor.get())->Init()) { if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init()) {
return nullptr; return nullptr;
} }
return std::move(predictor); return std::move(predictor);

@ -29,17 +29,10 @@
namespace paddle { namespace paddle {
struct ConfigImpl : public PaddlePredictor::Config { class NativePaddlePredictor : public PaddlePredictor {
int device;
float fraction_of_gpu_memory;
std::string prog_file;
std::string param_file;
bool share_variables;
};
class PaddlePredictorImpl : public PaddlePredictor {
public: public:
explicit PaddlePredictorImpl(const ConfigImpl &config) : config_(config) {} explicit NativePaddlePredictor(const NativeConfig &config)
: config_(config) {}
bool Init(); bool Init();
@ -48,16 +41,15 @@ class PaddlePredictorImpl : public PaddlePredictor {
std::unique_ptr<PaddlePredictor> Clone() override; std::unique_ptr<PaddlePredictor> Clone() override;
~PaddlePredictorImpl() override{}; ~NativePaddlePredictor() override{};
private: private:
bool InitShared() override;
bool SetFeed(const std::vector<PaddleTensor> &input_datas, bool SetFeed(const std::vector<PaddleTensor> &input_datas,
std::vector<framework::LoDTensor> *feeds); std::vector<framework::LoDTensor> *feeds);
bool GetFetch(const std::vector<framework::LoDTensor> &fetchs, bool GetFetch(const std::vector<framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *output_data); std::vector<PaddleTensor> *output_data);
ConfigImpl config_; NativeConfig config_;
platform::Place place_; platform::Place place_;
std::unique_ptr<framework::Executor> executor_; std::unique_ptr<framework::Executor> executor_;
std::unique_ptr<framework::Scope> scope_; std::unique_ptr<framework::Scope> scope_;

@ -40,19 +40,19 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
return pt; return pt;
} }
ConfigImpl GetConfig() { NativeConfig GetConfig() {
ConfigImpl config; NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model"; config.model_dir = FLAGS_dirname + "word2vec.inference.model";
LOG(INFO) << "dirname " << config.model_dir; LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.15; config.fraction_of_gpu_memory = 0.15;
config.use_gpu = true;
config.device = 0; config.device = 0;
config.share_variables = true;
return config; return config;
} }
TEST(paddle_inference_api_impl, word2vec) { TEST(paddle_inference_api_impl, word2vec) {
ConfigImpl config = GetConfig(); NativeConfig config = GetConfig();
std::unique_ptr<PaddlePredictor> predictor = CreatePaddlePredictor(config); auto predictor = CreatePaddlePredictor<NativeConfig>(config);
framework::LoDTensor first_word, second_word, third_word, fourth_word; framework::LoDTensor first_word, second_word, third_word, fourth_word;
framework::LoD lod{{0, 1}}; framework::LoD lod{{0, 1}};
@ -104,7 +104,7 @@ TEST(paddle_inference_api_impl, image_classification) {
int batch_size = 2; int batch_size = 2;
bool use_mkldnn = false; bool use_mkldnn = false;
bool repeat = false; bool repeat = false;
ConfigImpl config = GetConfig(); NativeConfig config = GetConfig();
config.model_dir = config.model_dir =
FLAGS_dirname + "image_classification_resnet.inference.model"; FLAGS_dirname + "image_classification_resnet.inference.model";
@ -133,7 +133,7 @@ TEST(paddle_inference_api_impl, image_classification) {
is_combined, is_combined,
use_mkldnn); use_mkldnn);
std::unique_ptr<PaddlePredictor> predictor = CreatePaddlePredictor(config); auto predictor = CreatePaddlePredictor(config);
std::vector<PaddleTensor> paddle_tensor_feeds; std::vector<PaddleTensor> paddle_tensor_feeds;
paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&input)); paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&input));
@ -144,8 +144,7 @@ TEST(paddle_inference_api_impl, image_classification) {
float* data = static_cast<float*>(outputs[0].data.data); float* data = static_cast<float*>(outputs[0].data.data);
float* lod_data = output1.data<float>(); float* lod_data = output1.data<float>();
for (size_t j = 0; j < len / sizeof(float); ++j) { for (size_t j = 0; j < len / sizeof(float); ++j) {
EXPECT_LT(lod_data[j] - data[j], 1e-10); EXPECT_NEAR(lod_data[j], data[j], 1e-3);
EXPECT_GT(lod_data[j] - data[j], -1e-10);
} }
free(data); free(data);
} }

@ -200,7 +200,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
vars_[var_desc.name()].reset(new VarDesc(var_desc)); vars_[var_desc.name()].reset(new VarDesc(var_desc));
} }
for (const proto::OpDesc &op_desc : desc_->ops()) { for (const proto::OpDesc &op_desc : desc_->ops()) {
ops_.emplace_back(new OpDesc(op_desc, prog, this)); ops_.emplace_back(new OpDesc(op_desc, this));
} }
} }
@ -209,7 +209,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
: prog_(prog), desc_(desc) { : prog_(prog), desc_(desc) {
need_update_ = true; need_update_ = true;
for (auto &op : other.ops_) { for (auto &op : other.ops_) {
ops_.emplace_back(new OpDesc(*op->Proto(), prog, this)); ops_.emplace_back(new OpDesc(*op, this));
} }
for (auto &it : other.vars_) { for (auto &it : other.vars_) {
auto *var = new VarDesc(*it.second); auto *var = new VarDesc(*it.second);

@ -105,7 +105,7 @@ class BlockDesc {
size_t OpSize() const { return ops_.size(); } size_t OpSize() const { return ops_.size(); }
OpDesc *Op(int idx) { return ops_.at(idx).get(); } OpDesc *Op(int idx) const { return ops_.at(idx).get(); }
void Flush(); void Flush();

@ -11,11 +11,15 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include <algorithm>
#include <fstream> #include <fstream>
#include <string>
#include <utility> #include <utility>
#include <vector>
#include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h" #include "paddle/fluid/framework/details/rpc_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
@ -26,9 +30,6 @@
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#endif #endif
#include <string>
#include <vector>
DEFINE_string(ssa_graph_path, "/tmp/ssa_graph.dot", DEFINE_string(ssa_graph_path, "/tmp/ssa_graph.dot",
"the ssa graph path only print with GLOG_v=10," "the ssa graph path only print with GLOG_v=10,"
"default /tmp/graph.dot"); "default /tmp/graph.dot");
@ -148,9 +149,9 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const { const ProgramDesc &program) const {
std::unordered_map<std::string, proto::VarType::Type> var_types; std::unordered_map<std::string, VarDesc *> all_vars;
for (auto *var : program.Block(0).AllVars()) { for (auto *var : program.Block(0).AllVars()) {
var_types[var->Name()] = var->GetType(); all_vars[var->Name()] = var;
} }
auto graph = new SSAGraph(); auto graph = new SSAGraph();
@ -167,12 +168,28 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
auto send_vars = FindDistTrainSendVars(program); auto send_vars = FindDistTrainSendVars(program);
auto recv_vars = FindDistTrainRecvVars(program); auto recv_vars = FindDistTrainRecvVars(program);
size_t cur_device_id = 0;
std::vector<std::unordered_set<std::string>> var_name_on_devices; std::vector<std::unordered_set<std::string>> var_name_on_devices;
std::vector<std::unordered_set<std::string>> bcast_var_name_set; std::vector<std::unordered_set<std::string>> bcast_var_name_set;
var_name_on_devices.resize(places_.size()); var_name_on_devices.resize(places_.size());
bcast_var_name_set.resize(places_.size()); bcast_var_name_set.resize(places_.size());
size_t cur_device_id = 0;
std::vector<int64_t> balance_grads(places_.size(), 0);
auto get_appropriate_dev = [&](std::string &g_name) -> size_t {
auto var_desc = all_vars.at(g_name);
PADDLE_ENFORCE_NOT_NULL(var_desc);
auto dim = framework::make_ddim(var_desc->GetShape());
int64_t numel = framework::product(dim);
PADDLE_ENFORCE_GE(numel, 0);
auto smallest =
std::min_element(std::begin(balance_grads), std::end(balance_grads));
size_t dev_id =
static_cast<size_t>(std::distance(std::begin(balance_grads), smallest));
balance_grads[dev_id] += numel;
return dev_id;
};
bool is_forwarding = true; bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) { for (auto *op : program.Block(0).AllOps()) {
if (boost::get<int>( if (boost::get<int>(
@ -220,13 +237,13 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
switch (strategy_.reduce_) { switch (strategy_.reduce_) {
case BuildStrategy::ReduceStrategy::kReduce: case BuildStrategy::ReduceStrategy::kReduce:
cur_device_id = get_appropriate_dev(g_name);
CreateReduceOp(&result, g_name, cur_device_id); CreateReduceOp(&result, g_name, cur_device_id);
var_name_on_devices[cur_device_id].emplace(g_name); var_name_on_devices[cur_device_id].emplace(g_name);
bcast_var_name_set[cur_device_id].emplace(p_name); bcast_var_name_set[cur_device_id].emplace(p_name);
cur_device_id = (cur_device_id + 1) % places_.size();
break; break;
case BuildStrategy::ReduceStrategy::kAllReduce: case BuildStrategy::ReduceStrategy::kAllReduce:
if (IsSparseGradient(var_types, g_name)) { if (IsSparseGradient(all_vars, g_name)) {
CreateReduceOp(&result, g_name, 0); CreateReduceOp(&result, g_name, 0);
CreateBroadcastOp(&result, g_name, 0); CreateBroadcastOp(&result, g_name, 0);
} else { } else {
@ -269,10 +286,10 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
} }
bool MultiDevSSAGraphBuilder::IsSparseGradient( bool MultiDevSSAGraphBuilder::IsSparseGradient(
const std::unordered_map<std::string, proto::VarType::Type> &var_types, const std::unordered_map<std::string, VarDesc *> &all_vars,
const std::string &og) const { const std::string &og) const {
PADDLE_ENFORCE(var_types.count(og) != 0); PADDLE_ENFORCE(all_vars.count(og) != 0);
if (var_types.at(og) == proto::VarType::SELECTED_ROWS) { if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
return true; return true;
} }
return false; return false;
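The `get_appropriate_dev` lambda introduced in this hunk replaces round-robin placement of reduce ops with a greedy balance on accumulated gradient size. A rough standalone Python rendering of that heuristic, hypothetical and for illustration only:
```python
def pick_device(balance_grads, numel):
    """Pick the device whose accumulated gradient size is smallest, then charge it."""
    dev_id = min(range(len(balance_grads)), key=lambda i: balance_grads[i])
    balance_grads[dev_id] += numel
    return dev_id

balance = [0, 0, 0, 0]                 # one counter per device
assert pick_device(balance, 1000) == 0
assert pick_device(balance, 10) == 1   # the next gradient goes to a less-loaded device
```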
