Merge remote-tracking branch 'upstream/develop' into merge

revert-3824-remove_grad_op_type
tensor-tang 8 years ago
commit 2efac83aaa

@ -22,7 +22,7 @@
- id: clang-format-with-version-check
name: clang-format
description: Format files with ClangFormat.
entry: ./.clang_format.hook -i
entry: bash ./.clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang

@ -55,6 +55,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
@ -137,9 +138,9 @@ set(EXTERNAL_LIBS
)
if(WITH_GPU)
list(APPEND EXTERNAL_LIB ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIB ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
endif(NOT WITH_DSO)
endif(WITH_GPU)

@ -10,13 +10,11 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
ARG WITH_GPU
ARG WITH_AVX
ARG WITH_DOC
ARG WITH_STYLE_CHECK
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-OFF}
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
ENV WITH_DOC=${WITH_DOC:-OFF}
ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
ENV HOME /root
# Add bash enhancements

@ -28,6 +28,10 @@ if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER)
if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)

@ -2,7 +2,7 @@ if(NOT WITH_GPU)
return()
endif()
set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT")
set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT")
find_path(CUDNN_INCLUDE_DIR cudnn.h
PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
$ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}

@ -257,6 +257,11 @@ seq_concat
.. autoclass:: paddle.v2.layer.seq_concat
:noindex:
seq_slice
---------
.. autoclass:: paddle.v2.layer.seq_slice
:noindex:
kmax_sequence_score
-------------------
.. autoclass:: paddle.v2.layer.kmax_sequence_score
@ -362,6 +367,11 @@ trans
.. autoclass:: paddle.v2.layer.trans
:noindex:
scale_shift
-----------
.. autoclass:: paddle.v2.layer.scale_shift
:noindex:
Sampling Layers
===============

@ -54,17 +54,18 @@ The life cycle of a single task is illustrated below:
<img src="src/paddle-task-states.png"/>
1. When a new pass of training starts, all tasks will be placed in the todo queue.
1. The master server will dispatch few tasks to each trainer at a time, puts them in the pending queue and waits for completion.
1. The trainer will work on its tasks and tell the master server once a task is completed. The master server will dispatch a new task to that trainer.
1. If a task timeout. the master server will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded.
1. Upon trainer requests for new task, the master server will dispatch a task from todo queue to it, put the task in the pending queue and wait for completion.
1. The trainer will work on its task and tell the master server once the task is completed and ask for new task. The master server will dispatch a new task to that trainer.
1. If a task fails for any reason in trainer, or takes longer than a specific period of time, the master server will move the task back to the todo queue. The timeout count for that task will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, then it will be discarded.
1. The master server will move completed task to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to todo queue and reset the timeout counter of all tasks to zero.
### Trainer Process
The trainer process will:
- Receive tasks from the master.
- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
- Request tasks from the master.
- Work on the tasks
- Upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
### Parameter Server Process
@ -119,8 +120,8 @@ When the master is started by the Kubernetes, it executes the following steps at
1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations.
1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them.
1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update.
1. Write its ip address to */master/addr* so that trainers can discover it.
1. Listens to trainers' request of task, dispatch one upon request, and updates task queue using an etcd transaction to ensure lock is held during the update.
When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes.
@ -128,13 +129,11 @@ When the master server process is dead for any reason, Kubernetes will restart i
When the trainer is started by the Kubernetes, it executes the following steps at startup:
1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count.
1. Generates a unique ID, and sets key `/trainer/<unique ID>` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline.
1. Waits for tasks from the master to start training.
1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count */ps_desired*.
1. Finds and watches */master/addr* to get master's address.
1. Requests for tasks from the master to start training.
If trainer's etcd lease expires, it will try set key `/trainer/<unique ID>` again so that the master server can discover the trainer again.
When a trainer fails, Kuberentes would try to restart it. The recovered trainer would fetch tasks from the TODO queue and go on training.
When a trainer fails, Kuberentes would try to restart it. The recovered trainer would fetch tasks from master and go on training.
### Parameter Server Process

Binary file not shown.

Before

Width:  |  Height:  |  Size: 55 KiB

After

Width:  |  Height:  |  Size: 49 KiB

@ -0,0 +1,124 @@
# 编译PaddlePaddle和运行单元测试
## 需要的软硬件
为了开发PaddlePaddle我们需要
1. 一台电脑,可以装的是 Linux, BSD, Windows 或者 MacOS 操作系统,以及
1. Docker。
不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker image 里。
## 总体流程
1. 获取源码
```bash
git clone https://github.com/paddlepaddle/paddle
```
2. 安装开发工具到 Docker image 里
```bash
cd paddle; docker build -t paddle:dev .
```
请注意这个命令结尾处的 `.`;它表示 `docker build` 应该读取当前目录下的 [`Dockerfile`文件](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile),按照其内容创建一个名为 `paddle:dev` 的 Docker image并且把各种开发工具安装进去。
3. 编译
以下命令启动一个 Docker container 来执行 `paddle:dev` 这个 Docker image同时把当前目录源码树根目录映射为 container 里的 `/paddle` 目录,并且运行 `Dockerfile` 描述的默认入口程序 [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `cmake``make` 来编译 `/paddle` 里的源码,结果输出到 `/paddle/build`,也就是本地的源码树根目录里的 `build` 子目录。
```bash
docker run --rm -v $PWD:/paddle paddle:dev
```
上述命令编译出一个 CUDA-enabled 版本。如果我们只需要编译一个只支持 CPU 的版本,可以用
```bash
docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
```
4. 运行单元测试
用本机的第一个 GPU 来运行包括 GPU 单元测试在内的所有单元测试:
```bash
NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
```
如果编译的时候我们用了 `WITH_GPU=OFF` 选项,那么编译过程只会产生 CPU-based 单元测试,那么我们也就不需要 nvidia-docker 来运行单元测试了。我们只需要:
```bash
docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
```
有时候我们只想运行一个特定的单元测试,比如 `memory_test`,我们可以
```bash
nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
```
5. 清理
有时候我们会希望清理掉已经下载的第三方依赖以及已经编译的二进制文件。此时只需要:
```bash
rm -rf build
```
## 为什么要 Docker 呀?
- 什么是 Docker?
如果您没有听说 Docker可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。
- Docker 还是虚拟机?
有人用虚拟机来类比 Docker。需要强调的是Docker 不会虚拟任何硬件Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
- 为什么用 Docker?
把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。
另外对于习惯使用Windows和MacOS的开发者来说使用Docker就不用配置交叉编译环境了。
- 我可以选择不用Docker吗
当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。
- 学习 Docker 有多难?
理解 Docker 并不难,大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
- 我可以用 IDE 吗?
当然可以因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
```emacs
(global-set-key "\C-cc" 'compile)
(setq compile-command
"docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
```
就可以按 `Ctrl-C``c` 键来启动编译了。
- 可以并行编译吗?
是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
## 可能碰到的问题
- Docker 需要 sudo
如果用自己的电脑开发自然也就有管理员权限sudo了。如果用公用的电脑开发需要请管理员安装和配置好 Docker。此外PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术,比如 rkt。
- 在 Windows/MacOS 上编译很慢
Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
- 磁盘不够
本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。

@ -0,0 +1,124 @@
# Build PaddlePaddle from Source Code and Run Unit Test
## What Developers Need
To contribute to PaddlePaddle, you need
1. A computer -- Linux, BSD, Windows, MacOS, and
1. Docker.
Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. We run all the tools by running this image.
## General Process
1. Retrieve source code.
```bash
git clone https://github.com/paddlepaddle/paddle
```
2. Install build tools into a Docker image.
```bash
cd paddle; docker build -t paddle:dev .
```
Please be aware of the `.` at the end of the command, which refers to the [`./Dockerfile` file](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile). `docker build` follows instructions in this file to create a Docker image named `paddle:dev`, and installs building tools into it.
3. Build from source.
This following command starts a Docker container that executes the Docker image `paddle:dev`, mapping the current directory to `/paddle/` in the container, and runs the default entry-point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockefile. `build.sh` invokes `cmake` and `make` to build PaddlePaddle source code, which had been mapped to `/paddle`, and writes outputs to `/paddle/build`, which maps to `build` in the current source directory on the computer.
```bash
docker run -v $PWD:/paddle paddle:dev
```
Above command builds a CUDA-enabled version. If we want to build a CPU-only version, we can type
```bash
docker run -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
```
4. Run unit tests.
To run all unit tests using the first GPU of a node:
```bash
NV_GPU=0 nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
```
If we used `WITH_GPU=OFF` at build time, it generates only CPU-based unit tests, and we don't need nvidia-docker to run them. We can just run
```bash
docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
```
Sometimes we want to run a specific unit test, say `memory_test`, we can run
```bash
nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
```
5. Clean Build.
Sometimes, we might want to clean all thirt-party dependents and built binaries. To do so, just
```bash
rm -rf build
```
## Docker, Or Not?
- What is Docker?
If you haven't heard of it, consider it something like Python's virtualenv.
- Docker or virtual machine?
Some people compare Docker with VMs, but Docker doesn't virtualize any hardware nor running a guest OS, which means there is no compromise on the performance.
- Why Docker?
Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
- Can I choose not to use Docker?
Sure, you don't have to install build tools into a Docker image; instead, you can install them in your local computer. This document exists because Docker would make the development way easier.
- How difficult is it to learn Docker?
It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools. Not even to mention the time saved when other people trying to reproduce the issue you have.
- Can I use my favorite IDE?
Yes, of course. The source code resides on your local computer, and you can edit it using whatever editor you like.
Many PaddlePaddle developers are using Emacs. They add the following few lines into their `~/.emacs` configure file:
```emacs
(global-set-key "\C-cc" 'compile)
(setq compile-command
"docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
```
so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
- Does Docker do parallel building?
Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
## Some Gotchas
- Docker requires sudo
An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly. If you use a shared computer for development, please ask the administrator to install and configure Docker. We will do our best to support rkt, another container technology that doesn't require sudo.
- Docker on Windows/MacOS builds slowly
On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so to make the building efficient. Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
- Not enough disk space
Examples in this article uses option `--rm` with the `docker run` command. This option ensures that stopped containers do not exist on hard disks. We can use `docker ps -a` to list all containers, including stopped. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).

@ -19,6 +19,7 @@
.. toctree::
:maxdepth: 1
dev/build_cn.rst
dev/write_docs_cn.rst
dev/contribute_to_paddle_cn.md

@ -18,6 +18,7 @@ Development
.. toctree::
:maxdepth: 1
dev/build_en.rst
dev/new_layer_en.rst
dev/contribute_to_paddle_en.md

@ -63,13 +63,24 @@ func WithAddr(addr string) func(c *Client) error {
// WithEtcd sets the client to use etcd for master discovery.
func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
return func(c *Client) error {
cli, err := clientv3.New(clientv3.Config{
Endpoints: endpoints,
DialTimeout: timeout,
})
if err != nil {
var cli *clientv3.Client
f := func() error {
var err error
cli, err = clientv3.New(clientv3.Config{
Endpoints: endpoints,
DialTimeout: timeout,
})
return err
}
for {
err := f()
if err != nil {
log.Warningln(err)
} else {
break
}
time.Sleep(time.Second)
}
ch := make(chan string, 1)
a, err := GetKey(cli, DefaultAddrPath, timeout)
@ -101,9 +112,6 @@ func NewClient(opts ...func(*Client) error) (*Client, error) {
}
}
c.ch = make(chan record, c.bufSize)
// FIXME: connection is created asyncrosly in monitorMaster go routine,
// ensure the connection is ready for use before calling c.addClient.
time.Sleep(time.Second)
return c, nil
}

@ -15,6 +15,7 @@ if(Boost_FOUND)
add_subdirectory(platform)
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(pybind)
endif()
if(WITH_C_API)

@ -53,7 +53,10 @@ add_custom_target(paddle_capi_whole ALL
set_target_properties(paddle_capi_whole
PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library})
set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
# TODO: merge mkl into paddle_capi_shared
add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
set_target_properties(paddle_capi_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
link_paddle_exe(paddle_capi_shared)

@ -0,0 +1,6 @@
{
global:
paddle_*;
local:
*;
};

@ -214,7 +214,8 @@ extern void hl_conv_workspace(hl_tensor_descriptor input,
int* convBwdDataAlgo,
size_t* bwdDataLimitBytes,
int* convBwdFilterAlgo,
size_t* bwdFilterLimitBytes);
size_t* bwdFilterLimitBytes,
bool useDilation);
/**
* @brief destroy filter descriptor.
@ -242,7 +243,9 @@ extern void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width);
int stride_width,
int dilation_h = 1,
int dilation_w = 1);
/**
* @brief reset convolution descriptor.
@ -262,7 +265,9 @@ extern void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width);
int stride_width,
int dilation_h = 1,
int dilation_w = 1);
/**
* @brief destroy convolution descriptor.

@ -78,7 +78,9 @@ inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width) {}
int stride_width,
int dilation_h,
int dilation_w) {}
inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
hl_tensor_descriptor image,
@ -86,7 +88,9 @@ inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width) {}
int stride_width,
int dilation_h,
int dilation_w) {}
inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
@ -99,7 +103,8 @@ inline void hl_conv_workspace(hl_tensor_descriptor input,
int* convBwdDataAlgo,
size_t* bwdDataLimitBytes,
int* convBwdFilterAlgo,
size_t* bwdFilterLimitBytes) {}
size_t* bwdFilterLimitBytes,
bool useDilation) {}
inline void hl_convolution_forward(hl_tensor_descriptor input,
real* input_data,

@ -201,7 +201,8 @@ void hl_conv_workspace(hl_tensor_descriptor input,
int* convBwdDataAlgo,
size_t* bwdDataLimitBytes,
int* convBwdFilterAlgo,
size_t* bwdFilterLimitBytes) {
size_t* bwdFilterLimitBytes,
bool useDilation) {
#if CUDNN_VERSION >= 4000
CHECK_NOTNULL(input);
@ -213,21 +214,60 @@ void hl_conv_workspace(hl_tensor_descriptor input,
size_t memoryLimitBytes =
(1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
// For dilation
int algo = 0;
// cudnn convolution forward configuration
cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
// cudnn convolution backward data configuration
cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnConvolutionDescriptor_t bwd_data_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
// cudnn convolution backward filter configuration
cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
t_resource.cudnn_handle,
fwd_src_desc,
fwd_filter_desc,
fwd_conv_desc,
fwd_dest_desc,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
if (useDilation) {
convFwdAlgo = &algo;
convBwdDataAlgo = &algo;
convBwdFilterAlgo = &algo;
} else {
CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
t_resource.cudnn_handle,
fwd_src_desc,
fwd_filter_desc,
fwd_conv_desc,
fwd_dest_desc,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
t_resource.cudnn_handle,
bwd_data_filter_desc,
bwd_data_diff_desc,
bwd_data_conv_desc,
bwd_data_grad_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
t_resource.cudnn_handle,
bwd_filter_src_desc,
bwd_filter_diff_desc,
bwd_filter_conv_desc,
bwd_filter_grad_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
}
CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
t_resource.cudnn_handle,
@ -238,23 +278,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
fwdLimitBytes));
// cudnn convolution backward data configuration
cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnConvolutionDescriptor_t bwd_data_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
t_resource.cudnn_handle,
bwd_data_filter_desc,
bwd_data_diff_desc,
bwd_data_conv_desc,
bwd_data_grad_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
t_resource.cudnn_handle,
bwd_data_filter_desc,
@ -264,23 +287,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
bwdDataLimitBytes));
// cudnn convolution backward filter configuration
cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
t_resource.cudnn_handle,
bwd_filter_src_desc,
bwd_filter_diff_desc,
bwd_filter_conv_desc,
bwd_filter_grad_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
t_resource.cudnn_handle,
bwd_filter_src_desc,
@ -603,7 +609,9 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width) {
int stride_width,
int dilation_h,
int dilation_w) {
CHECK_NOTNULL(conv);
cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
@ -625,18 +633,24 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
padding_width,
stride_height,
stride_width,
1,
1,
dilation_h,
dilation_w,
mode,
data_type));
#else
if (dilation_h > 1 || dilation_w > 1) {
LOG(FATAL)
<< "Current cuDNN version does't support for dilation convolution. "
<< "The dilation convolution requires cuDNN >= v6.0.";
}
CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
padding_height,
padding_width,
stride_height,
stride_width,
1,
1,
dilation_h,
dilation_w,
mode));
#endif
@ -659,7 +673,9 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width) {
int stride_width,
int dilation_h,
int dilation_w) {
CHECK_NOTNULL(conv);
CHECK_NOTNULL(image);
CHECK_NOTNULL(filter);
@ -678,8 +694,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
padding_width,
stride_height,
stride_width,
1,
1,
dilation_h,
dilation_w,
mode,
data_type));
#else
@ -688,8 +704,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
padding_width,
stride_height,
stride_width,
1,
1,
dilation_h,
dilation_w,
mode));
#endif

@ -18,8 +18,8 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
proto_library(framework_proto SRCS framework.proto)
cc_library(attribute SRCS attribute.cc DEPS framework_proto)
cc_library(operator SRCS operator.cc DEPS framework_proto device_context tensor scope attribute)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
@ -39,21 +39,3 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
if(WITH_PYTHON)
cc_library(paddle_pybind SHARED
SRCS pybind.cc
DEPS pybind python backward
sgd_op
add_op
mul_op
rowwise_add_op
sigmoid_op
softmax_op
mean_op
cross_entropy_op
recurrent_op
uniform_random_op
gaussian_random_op
fill_zeros_like_op)
endif(WITH_PYTHON)

@ -110,7 +110,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
dup_output_ops[out].emplace_back(local_op_id);
return false;
});
net->AddOp(std::move(bwd));
net->AppendOp(std::move(bwd));
}
// Get unique ID for this method.
auto uid = uniq_id++;
@ -163,8 +163,9 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
// If part of input gradient of that operator is not calculated, fill
// zero variables to that input gradient.
net->AddOp(OpRegistry::CreateOp("fill_zeros_like", {{"Src", {prefix}}},
{{"Dst", {grad_input}}}, {}));
net->AppendOp(OpRegistry::CreateOp("fill_zeros_like",
{{"Src", {prefix}}},
{{"Dst", {grad_input}}}, {}));
}
return false;
});
@ -195,7 +196,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
if (net->ops_.empty()) { // Current no aux op is added to network
return grad_op;
}
net->AddOp(std::move(grad_op));
net->AppendOp(std::move(grad_op));
}
net->SetType("@GENERATED_BACKWARD@");
net->CompleteAddOp();

@ -21,18 +21,32 @@ grad_op_builder(fengjiayi)
given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`.
1. bla bla bla (yuyang)
1. Op
when the input forward network is a Op, return its gradient Operator Immediately.
2. NetOp
when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively and ensure them done. During the process, we need to collect the `OutputGradients` name.
when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to forward NetOp.
**shared variable**. As illustrated in the pictures, two operator's `Output` `Gradient` will overwirte their shared input variable.
<p align="center">
<img src="./images/duplicate_op.png" width="70%" ><br/>
1. shared variable in two operators.
</p>
Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator replace the overwirte links.
<p align="center">
<img src="images/duplicate_op2.png" width="90%" ><br/>
We share variable in the same scope, as a result, duplicate operator `OutputGradients` will overwirte then duplicate variable.
2. replace shared variable gradient with `Add` Operator
![./images/duplicate_op]()
</p>
Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator instead.
![./images/duplicate_op2]()
Then collect the sub graph OutputGradients/InputGradients as the NetOp's and return it.
Then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it.

@ -72,16 +72,16 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker {
class FcOp : public operators::NetOp {
public:
FcOp(const std::string &type, const VarNameMap &inputs,
const VarNameMap &outputs, const AttributeMap &attrs)
FcOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
AddOp(OpRegistry::CreateOp("mul",
{{"X", {Input("X")}}, {"Y", {Input("W")}}},
{{"Out", {Output("mul_result")}}}, {}));
AppendOp(OpRegistry::CreateOp("mul",
{{"X", {Input("X")}}, {"Y", {Input("W")}}},
{{"Out", {Output("mul_result")}}}, {}));
auto input_b = Inputs("b");
std::string before_act = "mul_result";
if (input_b.size() != 0) {
AddOp(OpRegistry::CreateOp(
AppendOp(OpRegistry::CreateOp(
"rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}},
{{"Out", {Output("add_result")}}}, {}));
before_act = "add_result";
@ -92,8 +92,8 @@ class FcOp : public operators::NetOp {
}
}
AddOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
{{"Out", {Output("Out")}}}, {}));
AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
{{"Out", {Output("Out")}}}, {}));
CompleteAddOp(false);
}
};
@ -234,13 +234,13 @@ TEST(Backward, net_fc_backward_not_have_b) {
TEST(Backward, net_input_of_network_not_need_grad) {
ops::NetOp net;
net.AddOp(f::OpRegistry::CreateOp(
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}},
{{"mul_result", {"mul_tmp_0"}},
{"add_result", {"add_tmp_0"}},
{"Out", {"hidden0"}}},
{}));
net.AddOp(f::OpRegistry::CreateOp(
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}},
{{"mul_result", {"mul_tmp_1"}},
{"add_result", {"add_tmp_1"}},
@ -273,10 +273,10 @@ TEST(Backward, net_input_of_network_not_need_grad) {
TEST(Backward, net_shared_weight) {
ops::NetOp net;
net.AddOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
{{"Out", {"out"}}}, {}));
net.AddOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
{{"Out", {"FinalOut"}}}, {}));
net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
{{"Out", {"out"}}}, {}));
net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
{{"Out", {"FinalOut"}}}, {}));
net.CompleteAddOp();
auto bwd = f::Backward(net, {});
@ -357,19 +357,19 @@ TEST(Backward, op_part_of_input_are_not_need) {
TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
ops::NetOp net;
net.AddOp(f::OpRegistry::CreateOp(
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}},
{{"mul_result", {"mul_out1"}},
{"add_result", {"add_out1"}},
{"Out", {"out1"}}},
{}));
net.AddOp(f::OpRegistry::CreateOp(
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}},
{{"mul_result", {"mul_out2"}},
{"add_result", {"tmp_out2"}},
{"Out", {"out2"}}},
{}));
net.AddOp(f::OpRegistry::CreateOp(
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}},
{{"mul_result", {"mul_out3"}},
{"add_result", {"tmp_out3"}},

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save