Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into add-weight-normalization

emailweixu-patch-1
guosheng 7 years ago
commit a422f34607

@ -9,7 +9,7 @@ import subprocess
import platform
COPYRIGHT = '''
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

@ -31,9 +31,6 @@ if(NOT CMAKE_CROSSCOMPILING)
endif(NOT CMAKE_CROSSCOMPILING)
find_package(Git REQUIRED)
find_package(Threads REQUIRED)
if(NOT ANDROID AND NOT IOS)
find_package(Boost QUIET)
endif()
include(simd)
@ -140,6 +137,7 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc
include(external/boost) # download, build, install boost
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
@ -164,7 +162,6 @@ include_directories("${PADDLE_SOURCE_DIR}")
include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
include_directories(${Boost_INCLUDE_DIRS})
set(EXTERNAL_LIBS
${GFLAGS_LIBRARIES}

@ -27,7 +27,7 @@ RUN apt-get update && \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \
liblapack-dev liblapacke-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools libtool && \
apt-get clean -y

@ -0,0 +1,51 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include(ExternalProject)
set(BOOST_PROJECT "extern_boost")
set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0")
set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOOST_VER}/${BOOST_TAR}.tar.gz")
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
include_directories(${BOOST_INCLUDE_DIR})
ExternalProject_Add(
${BOOST_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
&& tar zxf ${BOOST_TAR}.tar.gz
DOWNLOAD_NO_PROGRESS 1
PREFIX ${BOOST_SOURCES_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
UPDATE_COMMAND ""
)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
add_library(boost STATIC ${dummyfile})
else()
add_library(boost INTERFACE)
endif()
add_dependencies(boost ${BOOST_PROJECT})
list(APPEND external_project_dependencies boost)
set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})

@ -18,6 +18,11 @@ dynamic_lstm
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
:noindex:
dynamic_gru
-----------
.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
:noindex:
data
----
.. autofunction:: paddle.v2.fluid.layers.data
@ -500,6 +505,11 @@ swish
.. autofunction:: paddle.v2.fluid.layers.swish
:noindex:
im2sequence
-----------
.. autofunction:: paddle.v2.fluid.layers.im2sequence
:noindex:
edit_distance
---------------
.. autofunction:: paddle.v2.fluid.layers.edit_distance_error
@ -519,3 +529,13 @@ sequence_reshape
----------------
.. autofunction:: paddle.v2.fluid.layers.sequence_reshape
:noindex:
row_conv
--------
.. autofunction:: paddle.v2.fluid.layers.row_conv
:noindex:
multiplex
---------
.. autofunction:: paddle.v2.fluid.layers.multiplex
:noindex:

@ -26,8 +26,8 @@ glu
:noindex:
dot_product_attention
---------------------
.. autofunction:: paddle.v2.fluid.nets.dot_product_attention
scaled_dot_product_attention
----------------------------
.. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
:noindex:

@ -0,0 +1,96 @@
# Design Doc: CSP in PaddlePaddle Fluid
## Motivation
Concurrent programming is important for deep learning. A few example applications are:
1. The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
2. The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
Most DL systems, including TensorFlow, Caffe2, and MXNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
## Concurrent Programming Models
There are many concurrent programming models, implemented in various forms:
| concurrent programming model | implementation |
|-----|-----|
| mutex | types and functions in standard libraries |
| semaphore | types and functions in standard libraries |
| communicating sequential processes (CSP) | Go programming language |
| actor model | Erlang programming language |
| message passing | MPI |
| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
### CSP vs. Actor Model
A well-known implementation of the Actor Model is the Erlang programming language. In the Actor Model, *processes* can send messages to another process and receive messages from another process given the process IDs. We can find the three ingredients, process with ID, send, and recv, in MPI too. Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code. Our concern with the Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating system's responsibility to manage processes, and that of libraries like MPI to handle send/recv.
## CSP in Fluid
Fluid has two fundamental control-flows: *if-else* and *while*. If we are to implement CSP, we need the following:
1. a new data type: *channel* and operators *send* and *recv*,
1. *goroutine* or thread, and
1. a new control-flow: select.
We also need Python wrappers for the above components.
The type *channel* is conceptually a blocking queue. In Go, its implementation is a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
The `select` operation has been in OS kernels long before the Go language. All Unix kernels implement the system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. BSD systems have a similar system call, *kqueue*. Go's Linux implementation uses epoll.
It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
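To make the O(N) behavior concrete, here is a minimal plain-Python sketch (not Fluid code; `naive_select` and its handler callbacks are made up for illustration) that scans every case until some channel is ready:

```python
import queue
import random
import time

def naive_select(cases):
    """O(N) select: scan every (channel, handler) case until one is ready."""
    while True:
        random.shuffle(cases)  # randomize order, as Go's select does, to avoid starvation
        for ch, handler in cases:
            try:
                value = ch.get_nowait()  # non-blocking probe of one channel
            except queue.Empty:
                continue
            return handler(value)
        time.sleep(0.001)  # nothing ready; back off, then rescan all N cases
```

An epoll-style implementation would instead sleep until some channel becomes ready, rather than rescanning all N cases.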
### Type Channel
Fluid supports many data types:
1. Tensor,
1. Row-sparse Tensor,
1. LoD Tensor,
1. Tensor array, etc.
Each data type is registered in [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value. To add the new type *channel*, we need to add a new enum value.
To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file. [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example of how we expose the C++ class `LoDTensor`.
## Syntax Design
### Create Channel
In Go, we create a channel by specifying the element type and buffer size:
```go
ch := make(chan int) // a channel without buffer
ch1 := make(chan int, 100) // a channel that can buffer 100 ints.
```
In Fluid, we should be able to do the same:
```python
ch = fluid.make_chan(dtype=INT)
ch1 = fluid.make_chan(dtype=INT, capacity=100)  # a channel that can buffer 100 ints (keyword name assumed)
```
In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
```python
ch = fluid.make_chan(dtype=Tensor, etype=float16)
```
or Tensors of Tensors of float16, etc.
The point here is that we need a consistent way to compose types, just as in C++ we can have `Tensor<Tensor<...<float16>...> >`.
### Send and Recv
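A purely hypothetical sketch of what the Python wrappers might look like, mirroring Go's `ch <- 1` and `x := <-ch` (the `fluid.send`/`fluid.recv` names are assumptions, not a settled API):

```python
ch = fluid.make_chan(dtype=INT)

# hypothetical wrappers mirroring Go's `ch <- 1` and `x := <-ch`
fluid.send(ch, 1)   # blocks until a receiver is ready (unbuffered channel)
x = fluid.recv(ch)  # blocks until a value arrives
```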
### Select
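Likewise hypothetical, a select could take (channel, handler) cases and run whichever becomes ready first (`fluid.select` and the handlers are assumptions):

```python
# hypothetical: block until one recv case is ready, then run its handler
fluid.select(
    (ch,  lambda v: compute_on(v)),   # compute_on: placeholder handler
    (ch1, lambda v: log_value(v)),    # log_value: placeholder handler
)
```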
## Example Programs
### 1. RPC between Trainers and Parameter Servers
### 2. Concurrent Minibatch Loading
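For the flavor of this example, here is the minibatch-loading pattern in plain Python, with a bounded `queue.Queue` standing in for a buffered channel (`read_batches` and `train_step` are placeholders):

```python
import queue
import threading

minibatches = queue.Queue(maxsize=2)  # bounded buffer, like a buffered channel

def loader():
    for batch in read_batches():   # read_batches(): placeholder reader
        minibatches.put(batch)     # blocks while the buffer is full
    minibatches.put(None)          # sentinel: no more data

threading.Thread(target=loader, daemon=True).start()
while True:
    batch = minibatches.get()      # blocks until the loader has produced a batch
    if batch is None:
        break
    train_step(batch)              # train_step(): placeholder consumer
```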

@ -152,12 +152,12 @@ for data in train_reader():
The `JobDesc` object describes the distributed job resource specification to run in the
cluster environment.
<img src="src/remote_executor.png"/>
<img src="src/remote_executor.png" width="500" align="center" />
`RemoteExecutor.run` sends the `ProgramDesc` and
[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
to start the final Kubernetes Jobs to run the different role of `ProgramDesc`.
to start the final Kubernetes Jobs that run the different roles of `ProgramDesc` from `ConfigMap`.
### Placement Algorithm

@ -9,16 +9,16 @@ different purposes.
## Background
The previous implementations of the parameter server does not run a
The previous implementations of the parameter server do not run a
fluid sub-program. Parameter initialization, optimizer computation, network
communication and checkpointing are implemented twice on both the
trainer and the parameter server.
trainer as well as the parameter server.
It would be great if we can write code once and use them on both the
trainer and the parameter server: reduces code duplication and
improves extensibility. Given that after the current refactor, we are
representing everything as a computing graph on the
trainer. Representing everything as a computing graph on the parameter
It would be great if we could write code once and use it on both the
trainer and the parameter server, since this reduces code duplication and
improves extensibility. Given that, after the current refactoring, we are
representing everything as a computation graph on the
trainer, representing everything as a computation graph on the parameter
server becomes a natural extension.
## Design
@ -30,9 +30,9 @@ into sub-programs to be scheduled on different nodes with the following
steps:
1. OP placement: the OPs will be placed on different nodes according
to heuristic that minimizes estimated total computation
to a heuristic that minimizes the estimated total computation
time. Currently we will use a simple heuristic that puts parameter
varable on parameter server workers and everything else on trainer
variable on parameter server workers and everything else on trainer
workers.
1. Add communication OPs to enable the communication between nodes.
@ -47,22 +47,22 @@ After converting:
<img src="src/dist-graph.png" width="700"/>
1. The parameter variable W and it's optimizer program are placed on the parameter server.
1. The parameter variable W and its optimizer program are placed on the parameter server.
1. Operators are added to the program.
- *Send* sends data to the connected *Recv* operator. The
scheduler on the receive node will only schedule the *Recv* operator
to run when the *Send* operator has run (the *Send* OP will mark
the *Recv* OP runnable automatically).
- *Enueue* enqueues the input variable, it can block until space
- *Enqueue* enqueues the input variable; it can block until space
becomes available in the queue.
- *Dequeue* outputs a configurable number of tensors from the
queue. It will block until the queue have the required number of
queue. It will block until the queue has the required number of
tensors.
### Benefits
- Model parallelism become easier to implement: it's an extension to
- Model parallelism becomes easier to implement: it is an extension to
the trainer - parameter server approach. We can have several "Transpilers"
to achieve different goals.
- A user-defined optimizer is easier to add - the user can now express it as
@ -72,22 +72,22 @@ After converting:
### Challenges
- It's important to balance the parameter shards of on multiple
parameter server. If a single parameter is very big (some
- It is important to balance the parameter shards on multiple
parameter servers. If a single parameter is very big (for example, some
word-embedding, fully connected, or softmax layer), we need to
automatically partition the single parameter onto different
parameter servers when possible (only element-wise optimizer depends
on the parameter variable).
- In the "Aync SGD" figure, the "W" variable on the parameter server
could be read and wrote concurrently. See
- In the "Async SGD" figure, the "W" variable on the parameter server
could be read and written concurrently. See
[here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
details about concurrent program in fluid.
details about concurrent programs in Fluid.
### Discussion
- Can the Enqueue OP be implemented under our current tensor design
(puts the input tensor into the queue tensor)?
- *Dequeue* OP will have variable numbers of output (depends on the
(put the input tensor into the queue tensor)?
- The *Dequeue* OP will have a variable number of outputs (depending on the
`min_count` attribute); does our current design support it? (similar
question for the *Add* OP)


@ -22,7 +22,7 @@ The current `LoDTensor` is designed to store levels of variable-length sequences
The integers in each level represent the begin and end (not inclusive) offsets of a sequence **in the underlying tensor**;
let's call this format the **absolute-offset LoD** for clarity.
The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
```python
[[0, 3, 9],
 [0, 2, 3, 3, 3, 9]]
```
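With absolute offsets, retrieving any sequence is a constant-time lookup into the level's offset list; a plain-Python illustration:

```python
lod = [[0, 3, 9],
       [0, 2, 3, 3, 3, 9]]

# top-level sequence i occupies rows lod[0][i]:lod[0][i+1] of the underlying tensor
i = 1
begin, end = lod[0][i], lod[0][i + 1]  # rows 3..9, found in O(1)
# seq = tensor[begin:end]              # tensor: the underlying data (not shown here)
```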
@ -119,7 +119,7 @@ def generate():
encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
decoder_input = pd.fc(
act=pd.activation.Linear(),
input=[target_word, encoder_ctx],
input=[target_word, encoder_ctx_expanded],
size=3 * decoder_dim)
gru_out, cur_mem = pd.gru_step(
decoder_input, mem=decoder_mem, size=decoder_dim)

@ -2,9 +2,9 @@
## Background
Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently.
Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.
On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example,Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
@ -17,7 +17,7 @@ For a general overview of fluid, please refer to the [overview doc](https://gith
There are mainly three parts that we have to consider while integrating a new device/library:
- Place and DeviceContext: indicates the device id and manages hardware resources
- Place and DeviceContext: indicate the device id and manage hardware resources
- Memory and Tensor: malloc/free data on certain device
@ -25,10 +25,10 @@ There are mainly three parts that we have to consider while integrating a new de
### Place and DeviceContext
Please remind that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
Please note that devices and computing libraries do not correspond one to one. A device can support many computing libraries, and a computing library can also support several devices.
#### Place
Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add corresponding `DevicePlace`.
Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
```
| CPUPlace
```
@ -144,7 +144,7 @@ class Tensor {
};
```
`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory.
`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, use `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
```cpp
paddle::framework::Tensor t;
```
@ -163,7 +163,7 @@ Fluid implements computing units based on different DeviceContexts. Some computi
Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
The interface is defined in header file.
The interface is defined in the header file.
```
template <typename DeviceContext, typename T>
```
@ -203,7 +203,7 @@ class MaxOutFunctor<platform::CUDADeviceContext, T> {
We get computing handle from a concrete DeviceContext, and make compution on tensors.
We first obtain the computing handle from a concrete DeviceContext, and then compute on tensors.
The implementation of `OpKernel` is similar to the math functors; the extra thing we need to do is to register the OpKernel in a global map.
@ -231,7 +231,7 @@ REGISTER_OP_CUDA_KERNEL(
## Advanced topics: How to switch between different Device/Library
Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
Generally, we will implement an OpKernel for every Device/Library of an Operator. We can easily train a Convolutional Neural Network on GPU. However, some OpKernels are not suitable for a specific Device. For example, the crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstances, we have to switch between different Device/Library combinations.
For more details, please refer to the following docs:

@ -39,6 +39,7 @@ PaddlePaddle can be installed using common Python package-management tools
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"

@ -42,6 +42,7 @@ If a login form shows up when you follow the links below, just click "Log in as guest" to start the download:
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"

@ -60,8 +60,7 @@ each column is as follows:
| column | meaning |
| --- | --- |
| ncalls | the number of calls into a function |
| tottime | the total execution time of the function, not including the
execution time of other functions called by the function |
| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
| percall | tottime divided by ncalls |
| cumtime | the total execution time of the function, including the execution time of other functions being called |
| percall | cumtime divided by ncalls |
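These are the standard columns of Python's `cProfile`/`pstats` report; for reference, a table like this can be produced with (assuming `main()` is the entry point being profiled):

```python
import cProfile
import pstats

cProfile.run("main()", "profile.out")  # run main() under the profiler, save raw stats
stats = pstats.Stats("profile.out")
# prints ncalls, tottime, percall, cumtime, percall for the top 10 functions
stats.sort_stats("cumulative").print_stats(10)
```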

@ -16,6 +16,12 @@ PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes
PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html).
In addition to the above, the `cmake` command should be run with the option `WITH_DISTRIBUTE` set to ON. A bare-minimum example `cmake` command would look as follows:
``` bash
cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
```
### Update the training script
#### Non-cluster training script
@ -119,7 +125,14 @@ for pass_id in range(100):
### E2E demo
Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py). In parameter server node run the following in the command line:
Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
First `cd` into the folder that contains the `python` files. In this case:
```bash
cd /paddle/python/paddle/v2/fluid/tests/book_distribute
```
On the parameter server node, run the following in the command line:
``` bash
PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py
```

@ -18,7 +18,7 @@ else()
add_subdirectory(capi)
endif()
if(Boost_FOUND)
if(NOT ANDROID AND NOT IOS)
add_subdirectory(memory)
add_subdirectory(platform)
add_subdirectory(framework)

@ -1,7 +1,7 @@
# ddim lib
proto_library(framework_proto SRCS framework.proto)
cc_library(ddim SRCS ddim.cc DEPS eigen3)
cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
@ -45,7 +45,7 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_
cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
cc_library(attribute SRCS attribute.cc DEPS framework_proto)
cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
device_context)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
@ -74,7 +74,10 @@ cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table profiler feed_fetch_method)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

@ -61,6 +61,9 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
}
return val;
}
case proto::AttrType::LONG: {
return attr_desc.l();
}
default:
PADDLE_THROW("Unsupport attr type %d", attr_desc.type());
}

@ -168,6 +168,32 @@ struct ExtractAttribute<bool> {
const std::string& attr_name_;
};
template <>
struct ExtractAttribute<int64_t> {
explicit ExtractAttribute(const std::string& attr_name)
: attr_name_(attr_name) {}
int64_t* operator()(Attribute& attr) const {
if (attr.type() == typeid(int)) { // NOLINT
int val = boost::get<int>(attr);
attr = static_cast<int64_t>(val);
} else if (attr.type() == typeid(float)) { // NOLINT
float val = boost::get<float>(attr);
attr = static_cast<int64_t>(val);
}
int64_t* attr_value = nullptr;
try {
attr_value = &boost::get<int64_t>(attr);
} catch (boost::bad_get& bad_get) {
PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
attr_name_, attr.type().name());
}
return attr_value;
}
const std::string& attr_name_;
};
// check whether a certain attribute fit its limits
// an attribute can have more than one limits
template <typename T>

@ -75,7 +75,7 @@ std::vector<VarDesc *> BlockDesc::AllVars() const {
OpDesc *BlockDesc::AppendOp() {
need_update_ = true;
ops_.emplace_back(new OpDesc());
ops_.emplace_back(new OpDesc(this));
return ops_.back().get();
}
@ -86,7 +86,7 @@ void BlockDesc::AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {
OpDesc *BlockDesc::PrependOp() {
need_update_ = true;
ops_.emplace_front(new OpDesc());
ops_.emplace_front(new OpDesc(this));
return ops_.front().get();
}
@ -153,7 +153,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
vars_[var_desc.name()].reset(new VarDesc(var_desc));
}
for (const proto::OpDesc &op_desc : desc_->ops()) {
ops_.emplace_back(new OpDesc(op_desc, prog));
ops_.emplace_back(new OpDesc(op_desc, prog, this));
}
}
@ -162,7 +162,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
: prog_(prog), desc_(desc) {
need_update_ = true;
for (auto &op : other.ops_) {
ops_.emplace_back(new OpDesc(*op));
ops_.emplace_back(new OpDesc(*op, this));
}
for (auto &it : other.vars_) {

@ -0,0 +1,87 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <condition_variable>
#include <mutex>
#include <deque>
namespace paddle {
namespace framework {
template <typename T>
class Channel {
public:
explicit Channel(std::size_t capacity) : capacity_(capacity) {}
void Send(T* channel_element) {
std::unique_lock<std::mutex> lock(mu_);
if (IsBounded()) {
full_cond_var_.wait(lock, [this]() {
bool capacity_valid = capacity_ > 0 ? !IsCapacityFull() : true;
return capacity_valid;
});
}
channel_.push_back(std::move(*channel_element));
lock.unlock();
empty_cond_var_.notify_one();
}
T Receive() {
std::unique_lock<std::mutex> lock(mu_);
empty_cond_var_.wait(lock, [this]() { return !channel_.empty(); });
// front() yields a T&, so move the element out by value rather than via T*.
T channel_element = std::move(channel_.front());
channel_.pop_front();
NotifyAllSenders(&lock);
return channel_element;
}
size_t Size() {
std::unique_lock<std::mutex> lock(mu_);
return channel_.size();
}
void Clear() {
std::unique_lock<std::mutex> lock(mu_);
channel_.clear();
NotifyAllSenders(&lock);
}
private:
std::size_t capacity_;
std::mutex mu_;
std::condition_variable empty_cond_var_;
std::condition_variable full_cond_var_;
std::deque<T> channel_;
private:
void NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
if (IsBounded()) {
lock->unlock();
full_cond_var_.notify_one();
}
}
bool IsBounded() const { return capacity_ > 0; }
bool IsCapacityFull() const { return channel_.size() >= capacity_; }
};
} // namespace framework
} // namespace paddle

@ -17,11 +17,13 @@ limitations under the License. */
#include <set>
#include "gflags/gflags.h"
#include "paddle/framework/feed_fetch_method.h"
#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
DECLARE_bool(do_memory_benchmark);
DEFINE_bool(check_nan_inf, false,
@ -116,8 +118,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(3) << op->DebugStringEx(local_scope);
VLOG(4) << op->DebugStringEx(local_scope);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(op->Type(), pool.Get(place_));
op->Run(*local_scope, place_);
VLOG(3) << op->DebugStringEx(local_scope);
if (FLAGS_do_memory_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
@ -143,5 +150,164 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
}
}
// Check whether the block already has feed operators and feed_holder.
// Return false if the block does not have any feed operators.
// If some feed operators have been prepended to the block, check that
// the info contained in these feed operators matches the feed_targets
// and feed_holder_name. Raise exception when any mismatch is found.
// Return true if the block has feed operators and holder of matching info.
static bool has_feed_operators(
BlockDesc* block, std::map<std::string, const LoDTensor*>& feed_targets,
const std::string& feed_holder_name) {
size_t feed_count = 0;
for (auto* op : block->AllOps()) {
if (op->Type() == kFeedOpType) {
feed_count++;
PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
"Input to feed op should be '%s'", feed_holder_name);
std::string feed_target_name = op->Output("Out")[0];
PADDLE_ENFORCE(
feed_targets.find(feed_target_name) != feed_targets.end(),
"Feed operator output name '%s' cannot be found in 'feed_targets'",
feed_target_name);
}
}
if (feed_count > 0) {
PADDLE_ENFORCE_EQ(
feed_count, feed_targets.size(),
"The number of feed operators should match 'feed_targets'");
// When feed operators are present, so should be the feed_holder
auto var = block->FindVar(feed_holder_name);
PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
feed_holder_name);
PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FEED_MINIBATCH,
"'%s' variable should be 'FEED_MINIBATCH' type",
feed_holder_name);
}
return feed_count > 0;
}
// Check whether the block already has fetch operators and fetch_holder.
// Return false if the block does not have any fetch operators.
// If some fetch operators have been appended to the block, check that
// the info contained in these fetch operators matches the fetch_targets
// and fetch_holder_name. Raise exception when any mismatch is found.
// Return true if the block has fetch operators and holder of matching info.
static bool has_fetch_operators(
BlockDesc* block, std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& fetch_holder_name) {
size_t fetch_count = 0;
for (auto* op : block->AllOps()) {
if (op->Type() == kFetchOpType) {
fetch_count++;
PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
"Output of fetch op should be '%s'", fetch_holder_name);
std::string fetch_target_name = op->Input("X")[0];
PADDLE_ENFORCE(
fetch_targets.find(fetch_target_name) != fetch_targets.end(),
"Fetch operator input name '%s' cannot be found in 'fetch_targets'",
fetch_target_name);
}
}
if (fetch_count > 0) {
PADDLE_ENFORCE_EQ(
fetch_count, fetch_targets.size(),
"The number of fetch operators should match 'fetch_targets'");
// When fetch operators are present, so should be the fetch_holder
auto var = block->FindVar(fetch_holder_name);
PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
fetch_holder_name);
PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FETCH_LIST,
"'%s' variable should be 'FETCH_LIST' type",
fetch_holder_name);
}
return fetch_count > 0;
}
void Executor::Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& feed_holder_name,
const std::string& fetch_holder_name) {
auto* copy_program = new ProgramDesc(program);
auto* global_block = copy_program->MutableBlock(0);
if (!has_feed_operators(global_block, feed_targets, feed_holder_name)) {
// create feed_holder variable
auto* feed_holder = global_block->Var(feed_holder_name);
feed_holder->SetType(proto::VarDesc::FEED_MINIBATCH);
feed_holder->SetPersistable(true);
int i = 0;
for (auto& feed_target : feed_targets) {
std::string var_name = feed_target.first;
VLOG(3) << "feed target's name: " << var_name;
// prepend feed op
auto* op = global_block->PrependOp();
op->SetType(kFeedOpType);
op->SetInput("X", {feed_holder_name});
op->SetOutput("Out", {var_name});
op->SetAttr("col", {static_cast<int>(i)});
op->CheckAttrs();
i++;
}
}
// map the data of feed_targets to feed_holder
for (auto* op : global_block->AllOps()) {
if (op->Type() == kFeedOpType) {
std::string feed_target_name = op->Output("Out")[0];
int idx = boost::get<int>(op->GetAttr("col"));
SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
idx);
}
}
if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) {
// create fetch_holder variable
auto* fetch_holder = global_block->Var(fetch_holder_name);
fetch_holder->SetType(proto::VarDesc::FETCH_LIST);
fetch_holder->SetPersistable(true);
int i = 0;
for (auto& fetch_target : fetch_targets) {
std::string var_name = fetch_target.first;
VLOG(3) << "fetch target's name: " << var_name;
// append fetch op
auto* op = global_block->AppendOp();
op->SetType(kFetchOpType);
op->SetInput("X", {var_name});
op->SetOutput("Out", {fetch_holder_name});
op->SetAttr("col", {static_cast<int>(i)});
op->CheckAttrs();
i++;
}
}
Run(*copy_program, scope, 0, true, true);
// obtain the data of fetch_targets from fetch_holder
for (auto* op : global_block->AllOps()) {
if (op->Type() == kFetchOpType) {
std::string fetch_target_name = op->Input("X")[0];
int idx = boost::get<int>(op->GetAttr("col"));
*fetch_targets[fetch_target_name] =
GetFetchVariable(*scope, fetch_holder_name, idx);
}
}
delete copy_program;
}
} // namespace framework
} // namespace paddle

@ -41,6 +41,12 @@ class Executor {
void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true,
bool create_vars = true);
void Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch");
private:
const platform::Place place_;
};
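On the Python side, this overload corresponds roughly to passing `feed` and `fetch_list` to the fluid executor, which prepends the feed ops and appends the fetch ops as shown above; a sketch against the v2 fluid API of the time (the names `"x"` and `avg_loss` are assumed to come from a model definition not shown):

```python
import numpy as np
import paddle.v2.fluid as fluid

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

x_data = np.random.rand(8, 13).astype("float32")
# feed maps variable names to input data; fetch_list selects output variables
loss_value, = exe.run(fluid.default_main_program(),
                      feed={"x": x_data},
                      fetch_list=[avg_loss])
```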
