Resolve conflicts.

cross_channel_norm
dangqingqing 7 years ago
commit 10622ba3cf

@ -2,8 +2,8 @@
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/) [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/) [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@ -36,7 +36,7 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
examples: examples:
- Optimized math operations through SSE/AVX intrinsics, BLAS libraries - Optimized math operations through SSE/AVX intrinsics, BLAS libraries
(e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels. (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels.
- Highly optimized recurrent networks which can handle **variable-length** - Highly optimized recurrent networks which can handle **variable-length**
sequence without padding. sequence without padding.
- Optimized local and distributed training for models with high dimensional - Optimized local and distributed training for models with high dimensional

@ -3,7 +3,7 @@
# It will search MKLML, atlas, OpenBlas, reference-cblas in order. # It will search MKLML, atlas, OpenBlas, reference-cblas in order.
# #
# If any cblas implementation found, the following variable will be set. # If any cblas implementation found, the following variable will be set.
# CBLAS_PROVIDER # one of MKLML, ATLAS, OPENBLAS, REFERENCE # CBLAS_PROVIDER # one of MKLML, OPENBLAS, REFERENCE
# CBLAS_INC_DIR # the include directory for cblas. # CBLAS_INC_DIR # the include directory for cblas.
# CBLAS_LIBS # a list of libraries should be linked by paddle. # CBLAS_LIBS # a list of libraries should be linked by paddle.
# # Each library should be full path to object file. # # Each library should be full path to object file.
@ -17,7 +17,7 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
set(CBLAS_INC_DIR ${MKLML_INC_DIR}) set(CBLAS_INC_DIR ${MKLML_INC_DIR})
set(CBLAS_LIBRARIES ${MKLML_LIB}) set(CBLAS_LIBRARIES ${MKLML_LIB})
add_definitions(-DPADDLE_USE_MKLML) add_definitions(-DPADDLE_WITH_MKLML)
add_definitions(-DLAPACK_FOUND) add_definitions(-DLAPACK_FOUND)
message(STATUS "Found cblas and lapack in MKLML " message(STATUS "Found cblas and lapack in MKLML "
@ -25,42 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
return() return()
endif() endif()
## Then find atlas.
set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
set(ATLAS_INCLUDE_SEARCH_PATHS
${ATLAS_ROOT}/include
/usr/include
/usr/include/atlas)
set(ATLAS_LIB_SEARCH_PATHS
${ATLAS_ROOT}/lib
/usr/lib
/usr/lib/blas/atlas
/usr/lib/atlas
/usr/lib/atlas-base # special for ubuntu 14.04.
)
find_path(ATLAS_INC_DIR NAMES cblas.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB)
set(CBLAS_FOUND ON)
set(CBLAS_PROVIDER ATLAS)
set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB})
add_definitions(-DPADDLE_USE_ATLAS)
add_definitions(-DLAPACK_FOUND)
message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
return()
endif()
## Then find openblas. ## Then find openblas.
set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas") set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
set(OPENBLAS_INCLUDE_SEARCH_PATHS set(OPENBLAS_INCLUDE_SEARCH_PATHS

@ -67,5 +67,5 @@ ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
add_definitions(-DPADDLE_USE_MKLDNN) add_definitions(-DPADDLE_WITH_MKLDNN)
LIST(APPEND external_project_dependencies mkldnn) LIST(APPEND external_project_dependencies mkldnn)

@ -7,3 +7,4 @@ API
模型配置 <v2/model_configs.rst> 模型配置 <v2/model_configs.rst>
数据访问 <v2/data.rst> 数据访问 <v2/data.rst>
训练与应用 <v2/run_logic.rst> 训练与应用 <v2/run_logic.rst>
v2/fluid.rst

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 121 KiB

@ -0,0 +1,122 @@
# Design Doc: PaddlePaddle Fluid
## Why Fluid
When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe. However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework?
Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution, which describes the "process" of training or inference using the concept of a model. In fact in PyTorch, TensorFlow Eager Execution and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid is currently more extreme in the above mentioned idea than PyTorch and Eager Execution, and we are trying to push Fluid towards the directions of a compiler and a new programming language for deep learning.
## The Evolution of Deep Learning Systems
Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
| Existed since | model as sequence of layers | model as graph of operators | No model |
|--|--|--|--|
| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
## Deep Learning Programming Paradigms
With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
```python
x = layer.data("image")
l = layer.data("label")
f = layer.fc(x, W)
s = layer.softmax(f)
c = layer.mse(l, s)
for i in xrange(1000): # train for 1000 iterations
m = read_minibatch()
forward({input=x, data=m}, minimize=c)
backward(...)
print W # print the trained model parameters.
```
The above program includes two parts:
1. The first part describes the model, and
2. The second part describes the training process (or inference process) for the model.
This paradigm has a well-known problem that limits the productivity of programmers. If the programmer made a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is located blocks away from the actual error prompt.
This problem of being hard to debug and re-iterate fast on a program is the primary reason that programmers, in general, prefer PyTorch over the older systems. Using PyTorch, we would write the above program as following:
```python
W = tensor(...)
for i in xrange(1000): # train for 1000 iterations
m = read_minibatch()
x = m["image"]
l = m["label"]
f = layer.fc(x, W)
s = layer.softmax(f)
c = layer.mse(l, s)
backward()
print W # print the trained model parameters.
```
We can see that the main difference is the moving the model configuration part (the first step) into the training loop. This change would allow the mistakes in model configuration to be reported where they actually appear in the programming block. This change also represents the model better, or its forward pass, by keeping the configuration process in the training loop.
## Describe Arbitrary Models for the Future
Describing the process instead of the model also brings Fluid, the flexibility to define different non-standard models that haven't been invented yet.
As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator. A PyTorch example would look like the following:
```python
for i in xrange(1000):
m = read_minibatch()
x = m["sentence"]
for t in xrange x.len():
h[t] = the_step(x[t])
```
With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following:
```python
train_loop = layers.While(cond)
with train_loop.block():
m = read_minibatch()
x = m["sentence"]
rnn = layers.While(...)
with rnn.block():
h[t] = the_step(input[t])
```
An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44).
From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
## Turing Completeness
In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine. For a programming language, if it provides if-then-else and loop, it is Turing complete. From the above examples, Fluid seems to be Turing complete; however, it is noteworthy to notice that there is a slight difference between the `if-then-else` of Fluid and that of a programming language. The difference being that the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. This hasn't been researched in depth if this is equivalent to the `if-then-else` in programming languages that makes them Turing-complete. Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case but this needs to be looked into in-depth.
## The Execution of a Fluid Program
There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
Fluid is moving towards the direction of a compiler, which is explain in more detail later in this article.
## Backward Compatibility of Fluid
Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference. For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph). Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators. The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators.
For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
## Towards a Deep Learning Language and the Compiler
We can change the `if-then-else` and loop structure a little bit in the above Fluid example programs, to make it into a new programming language, different than Python.
Even if we do not invent a new language, as long as we get the `ProgramDesc` message filled in, we can write a transpiler, which translates each invocation to an operator, into a C++ call to a kernel function of that operator. For example, a transpiler that weaves the CUDA kernels outputs an NVIDIA-friendly C++ program, which can be built using `nvcc`. Another transpiler could generate MKL-friendly code that should be built using `icc` from Intel. More interestingly, we can translate a Fluid program into its distributed version of two `ProgramDesc` messages, one for running on the trainer process, and the other one for the parameter server. For more details of the last example, the [concurrent programming design](concurrent_programming.md) document would be a good pointer. The following figure explains the proposed two-stage process:
![](fluid-compiler.png)

Binary file not shown.

After

Width:  |  Height:  |  Size: 108 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Before

Width:  |  Height:  |  Size: 13 KiB

After

Width:  |  Height:  |  Size: 13 KiB

Before

Width:  |  Height:  |  Size: 22 KiB

After

Width:  |  Height:  |  Size: 22 KiB

Before

Width:  |  Height:  |  Size: 11 KiB

After

Width:  |  Height:  |  Size: 11 KiB

Before

Width:  |  Height:  |  Size: 18 KiB

After

Width:  |  Height:  |  Size: 18 KiB

Before

Width:  |  Height:  |  Size: 10 KiB

After

Width:  |  Height:  |  Size: 10 KiB

@ -0,0 +1,95 @@
# Intel® MKL Packed on PaddlePaddle: Design Doc
## Contents
- [Overview](#overview)
- [Key Points](#key-points)
- [Background](#background)
- [Solution](#solution)
- [Actions](#actions)
- [CMake](#cmake)
- [Layers](#layers)
- [Unit Tests](#unit-tests)
- [Python API](#python-api)
- [Benchmarking](#benchmarking)
## Overview
我们计划将 Intel® MKL 中引入的 GEMM Packed APIs\[[1](#references)\] 集成到 PaddlePaddle 中充分发挥英特尔平台的优势有效提升PaddlePaddle在英特尔架构上的性能。
现阶段的优化主要针对 Recurrent Neural Network以下简称RNN相关层包括`RecurrentLayer`, `GatedRecurrentLayer`和`LstmLayer` 以及 PaddlePaddle V1 API。
## Key Points
### Background
目前PaddlePaddle采用了 Intel® MKL库的[cblas_?gemm](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm)函数,这个函数本身会在计算前将原数据转换为更适合英特尔平台的内部格式。
1. 转换耗时 \
这一数据格式的转换操作Packing在问题本身的计算量比较小的时候显得相对来说较为耗时。例如在DeepSpeech2 \[[2](#references)\] 的Vanilla RNN部分中矩阵大小是`batch_size * 2048`。
2. 转换冗余 \
由于在现有的某些情况下例如RNN多次调用 cblas_?gemm 会使用相同的原数据因此每次调用时对原数据的重复Packing便成为了冗余。
为了最大程度减少多次调用 cblas_?gemm 在Packing上的耗时Intel® MKL 引入了以下四个API:
* cblas_?gemm_alloc
* cblas_?gemm_pack
* cblas_?gemm_compute
* cblas_?gemm_free
通过使用这些API我们可以先完成对原数据的Packing操作再把已转换为Packed格式的数据传递给那些复用同一数据的gemm_compute函数从而避免了Packing冗余。
### Solution
在RNN的情况下同一次前向、后向forward/backward过程中所有时间步time step共享同一个权重weight。当只做推断inference各次前向之间也都使用了相同的权重没有必要在每次前向中每个时间步的计算时对权重进行重复的Packing操作。
我们通过使用新引入的GEMM Packed APIs在层初始化的时候先完成对权重的Packing操作然后在前向后向时复用已经转换过的权重并在每次权重更新后对新的权重进行转换用于下次迭代。
* 优化前对于序列长度sequence length为`T`的网络模型model, `N`次迭代执行的转换次数为:
- `inference` `N * T`
- `training` `2 * N * T`
* 优化后,对于同样设置的网络模型,其转换次数减少至:
- `inference` `1`
- `training` `2 * N`
## Actions
添加的相关文件和目录结构如下:
```txt
PaddlePaddle/Paddle
├── ...
└── paddle/
├── ...
└── gserver/
├── ...
├── layers/
│ ├── ...
│ ├── MKLPackedRecurrentLayer.*
| ├── MKLPackedGatedRecurrentLayer.*
| ├── MKLPackedLstmLayer.*
| └── MKLPackedGemm.h
└── tests/
├── ...
└── test_MKLPacked.cpp
```
### CMake
在对应的`CMakeLists.txt`中根据`WITH_MKL`是否打开来决定是否开启MKL Packed相关功能。
### Layers
所有的`MKLPacked*Layer`都继承于PaddlePaddle的基类`Layer`, 并添加头文件 `MKLPackedGemm.h`该文件对相关GEMM Packed APIs做了封装。
### Unit Tests
我们会添加`test_MKLPacked.cpp`用于MKL Packed优化后layer的测试。
对于每一个新加的RNN layer我们会对比如下2个方面
1. 对比优化后layer自身sequence mode`rnn_use_batch=false`与batch mode(`rnn_use_batch=true`)的结果。
2. 对比优化后layer与相对应的PaddlePaddle原有layer, 在batch mode下的结果。
### Python API
TBD
### Benchmarking
会添加相应的脚本用于测试和对比在使用MKL Packed recurrent layers 前后的网络性能。
## References
1. [Introducing the new Packed APIs for GEMM](https://software.intel.com/en-us/articles/introducing-the-new-packed-apis-for-gemm)
2. [DeepSpeech2 on PaddlePaddle](https://github.com/PaddlePaddle/DeepSpeech#deepspeech2-on-paddlepaddle)

@ -208,4 +208,3 @@ if use_mkldnn
但是在PaddlePaddle中无论是重构前的layer还是重构后的op都不会想要知道next layer/op的信息。 但是在PaddlePaddle中无论是重构前的layer还是重构后的op都不会想要知道next layer/op的信息。
4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`,所以不存在这个问题)。 4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`,所以不存在这个问题)。
所以需要引入一个转换方法并且只需要在必要的时候转换这种格式才能更好的发挥MKL-DNN的性能。 所以需要引入一个转换方法并且只需要在必要的时候转换这种格式才能更好的发挥MKL-DNN的性能。

@ -0,0 +1,65 @@
# Design Doc: NCCL support in Paddle Fluid
## Abstract
This Design Doc refers to the NCCL feature in paddle. We propose an approach to support NCCL library both on a single machine and multiple machines. We wrapper the NCCL primitives `Broadcast`, `Allreduce`, `Reduce` as operators to utilize Multi-GPU powers in one script.
## Motivation
[NCCL](https://developer.nvidia.com/nccl) is a NVIDIA library support Multi-GPU communicating and optimized for NVIDIA GPUs, it provides routines such as all-gather, all-reduce, broadcast, reduce, reduce-scatter, that can achieve high bandwidth over PCIe and NVLink high-speed interconnect. With NCCL library, we can easily accelerate the training in parallel.
- Pros
1. easily plug-in with [NCCL2](https://developer.nvidia.com/nccl) library.
1. high performance in NVIDIA GPUs.
1. MPI like primitives, which have low learning cost for users.
- Cons
1. Only design for NVIDIA GPUs, not a general multi-device solution.
1. Although NCCL1 is opensourced under BSD license, but NCCL2 is not opensourced anymore.
At the beginning of training, the framework needs to distribute the same parameters to every GPU, and merge the gradients at any time user interests.
As a result, during training, we need the operations of peer to peer copy between different GPUs, aggregating gradients/parameters from GPUs, and broadcasting parameters to GPUs. Every GPU only need to run the operator with correct place information.
Besides, it needs interfaces to synchronize model update with each different GPU Cards.
## Implementation
As mentioned above, we wrap the NCCL routines as several kinds of operators. Need to note that NCCL need to create Communicator between gpu at the beginning, so there is a NCCLInit operator created.
### Transpiler
To be compatible with [parameter server design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md), the transpiler compiles the user defined operation graph into sub-graphs to be executed on different devices.
1. The user-defined model will be a single device program
2. Broadcast/Reduce operators between GPUs will be inserted into the program, even for the multi-node, may insert the `Send`, `Recv` operator.
*Broadcast, AllReduce in a single machine. And Broadcast, AllReduce, [Send, Recv](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md#graph-converter) in multiple machines*
<img src="images/multigpu_before_convert.png" width="300"/>
After compiling, the graph as shows
<img src="images/multigpu_allreduce.png" width="1000"/>
Operators are added to the sub-graphs. Every GPU assigned a role of `rank0`, `rank1` etc.
- **Broadcast**. Broadcast operator distribute initialized parameter to all the GPUs from the GPU who owns it. e.g. from`rank0` GPU.
- **AllReduce**. AllReduce operator synchronizes parameters/gradients between GPUs. AllReduce implemented in the Ring-Based communicating method, avoid of the bottle neck in a single GPU.
Need to notice that AllReduce operator force GPUs synchronized at that point. The whole training process in asynchronous or synchronous mode depends on the AllReduce point in the graph.
As it shown in the picture, when each GPU compute the gradient of `W`, followed with a `AllReduce` operator, accumulate the `dW` to full batch of data, then run the optimize process individually and apply the gradient to its `W`.
- **AllReduce**
Need to note that our AllReduce operator is a ring-base AllReduce implementation. If we use the NCCL2 AllReduce primitive, every GPU optimized full batch of data, wasted (n-1) GPU compute resources. In addition, NCCL2 built-in AllReduce will only utilize the communicating resource during synchronization, then update the gradient will be a subsequent phase. In fact, we can amortize the update gradient time cost into the communicating phase. The process is
1. Every parameter has its root card. That card will responsible for aggregating the gradients from GPUs.
2. The whole model's parameter will be hashed to different root card, ensure the load balance between GPUs.
3. Logically neighberhood card will start send parameter to the next one. After one round, the parameter main card will aggregate the full gradients.
4. Then the root card will optimize the parameter.
5. This parameter card will send its optimized result to its neighberhood, then the neighberhood will send parameter to its next one.
6. Finish the sychronization round.
The total time cost will be 2 * (n-1) * per-parameter-send-time, we reach the goal of amortize the upgrade time into communicating phase.

@ -0,0 +1,248 @@
# Design Doc: Supporting new Device/Library
## Background
Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently.
On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example,Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
So, how to support a new Device/Library in Fluid becomes a challenge.
## Basic: Integrate A New Device/Library
For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md).
There are mainly three parts that we have to consider while integrating a new device/library:
- Place and DeviceContext: indicates the device id and manages hardware resources
- Memory and Tensor: malloc/free data on certain device
- Math Functor and OpKernel: implement computing unit on certain devices/libraries
### Place and DeviceContext
#### Place
Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent different devices and computing libraries. There are inheritance relationships between different kinds of `Place`.
```
| CPUPlace --> MKLDNNPlace
Place --| CUDAPlace --> CUDNNPlace
| FPGAPlace
```
And `Place` is defined as follows:
```
typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
```
#### DeviceContext
Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different hardwares, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
```
/-> CPUDeviceContext --> MKLDeviceContext
DeviceContext ----> CUDADeviceContext --> CUDNNDeviceContext
\-> FPGADeviceContext
```
An example of Nvidia GPU is as follows:
- DeviceContext
```
class DeviceContext {
virtual Place GetPlace() const = 0;
};
```
- CUDADeviceContext
```
class CUDADeviceContext : public DeviceContext {
Place GetPlace() const override { return place_; }
private:
CUDAPlace place_;
cudaStream_t stream_;
cublasHandle_t cublas_handle_;
std::unique_ptr<Eigen::GpuDevice> eigen_device_; // binds with stream_
};
```
- CUDNNDeviceContext
```
class CUDNNDeviceContext : public CUDADeviceContext {
private:
cudnnHandle_t cudnn_handle_;
};
```
### Memory and Tensor
#### memory module
Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36):
```
template <typename Place>
void* Alloc(Place place, size_t size);
template <typename Place>
void Free(Place place, void* ptr);
template <typename Place>
size_t Used(Place place);
```
To implementing these interfaces, we have to implement MemoryAllocator for different Devices
#### Tensor
[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place.
```cpp
class Tensor {
public:
/*! Return a pointer to mutable memory block. */
template <typename T>
inline T* data();
/**
* @brief Return a pointer to mutable memory block.
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(platform::Place place);
/**
* @brief Return a pointer to mutable memory block.
*
* @param[in] dims The dimensions of the memory block.
* @param[in] place The place of the memory block.
*
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(DDim dims, platform::Place place);
/*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims);
/*! Return the dimensions of the memory block. */
inline const DDim& dims() const;
private:
/*! holds the memory block if allocated. */
std::shared_ptr<Placeholder> holder_;
/*! points to dimensions of memory block. */
DDim dim_;
};
```
`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory.
```cpp
paddle::framework::Tensor t;
paddle::platform::CPUPlace place;
// set size first
t.Resize({2, 3});
// allocate memory on CPU later
t.mutable_data(place);
```
### Math Functor and OpKernel
Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors.
Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
The interface is defined in header file.
```
template <typename DeviceContext, typename T>
class MaxOutFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
framework::Tensor* output, int groups);
};
```
CPU implemention is in .cc file
```
template <typename T>
class MaxOutFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* output,
int groups) {
...
}
};
```
CUDA implemention is in .cu file
```
template <typename T>
class MaxOutFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* output,
int groups) {
...
}
};
```
We get computing handle from a concrete DeviceContext, and make compution on tensors.
The implemention of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
Fluid provides different register interfaces in op_registry.h
Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example:
In .cc file:
```
REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
REGISTER_OP_CPU_KERNEL(
crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
```
In .cu file:
```
REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
REGISTER_OP_CUDA_KERNEL(
crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
```
## Advanced topics: How to switch between different Device/Library
Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
We will discuss how to implement an efficient OpKernel switch policy.
- TBD

@ -14,7 +14,7 @@
$ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
$ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
$ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/doc_cn/build_and_install/install/docker_install.html>`_ 更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/doc_cn/build_and_install/install/docker_install.html>`_

@ -114,7 +114,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
.. code-block:: bash .. code-block:: bash
nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
**注: 如果没有安装nvidia-docker可以尝试以下的方法将CUDA库和Linux设备挂载到Docker容器内** **注: 如果没有安装nvidia-docker可以尝试以下的方法将CUDA库和Linux设备挂载到Docker容器内**
@ -122,7 +122,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
**关于AVX** **关于AVX**

@ -122,7 +122,7 @@ GPU driver installed before move on.
.. code-block:: bash .. code-block:: bash
nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.** **NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
@ -130,7 +130,7 @@ GPU driver installed before move on.
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
**About AVX:** **About AVX:**

@ -0,0 +1,18 @@
import paddle.v2 as paddle
import numpy as np
paddle.init(use_gpu=False)
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
# loading the model which generated by training
with open('params_pass_90.tar', 'r') as f:
parameters = paddle.parameters.Parameters.from_tar(f)
# Input multiple sets of dataOutput the infer result in a array.
i = [[[1, 2]], [[3, 4]], [[5, 6]]]
print paddle.infer(output_layer=y_predict, parameters=parameters, input=i)
# Will print:
# [[ -3.24491572]
# [ -6.94668722]
# [-10.64845848]]

@ -26,6 +26,11 @@ def event_handler(event):
if event.batch_id % 1 == 0: if event.batch_id % 1 == 0:
print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id, print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id,
event.cost) event.cost)
# product model every 10 pass
if isinstance(event, paddle.event.EndPass):
if event.pass_id % 10 == 0:
with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
trainer.save_parameter_to_tar(f)
# define training dataset reader # define training dataset reader

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save