Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_avg
commit
d1a7b47e04
@ -0,0 +1,30 @@
INCLUDE(ExternalProject)

SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)
SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)
INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})

# ThreadPool is a header-only library, so no configure/build/install steps
# are needed; the ExternalProject only clones the sources.
ExternalProject_Add(
    extern_threadpool
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY    "https://github.com/progschj/ThreadPool.git"
    GIT_TAG           9a42ec1329f259a5f4881a291db1dcb8f2ad9040
    PREFIX            ${THREADPOOL_SOURCE_DIR}
    UPDATE_COMMAND    ""
    CONFIGURE_COMMAND ""
    BUILD_COMMAND     ""
    INSTALL_COMMAND   ""
    TEST_COMMAND      ""
)

# add_dependencies() on INTERFACE libraries requires CMake >= 3.3, so older
# CMake versions get a dummy STATIC library instead.
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/threadpool_dummy.c)
    file(WRITE ${dummyfile} "const char *dummy_threadpool = \"${dummyfile}\";")
    add_library(simple_threadpool STATIC ${dummyfile})
else()
    add_library(simple_threadpool INTERFACE)
endif()

add_dependencies(simple_threadpool extern_threadpool)

LIST(APPEND external_project_dependencies simple_threadpool)

@ -0,0 +1,83 @@
digraph G {
    subgraph cluster_init {
        label="Initialization"
        startup_program [label="startup", shape=box]
        node_w_g0 [label="W\nGPU0"]
        startup_program -> node_w_g0 [label="Initialize"]
        node_w_g1 [label="W\nGPU1"]
        node_w_g0 -> node_w_g1 [label="broadcast"]
    }

    subgraph cluster_train {
        label="forward_backward"

        subgraph cluster_gpu0 {
            label="GPU0"
            fc_0 [label="fc\nGPU0", shape=box]
            hidden_0 [label="hidden\nGPU0"]
            node_w_g0 -> fc_0
            fc_0 -> hidden_0
            loss0 [label="loss\nGPU0"]
            hidden_0 -> loss0 [label="many ops omitted"]
            scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box]
            loss_g0 [label="loss_grad\nGPU0"]
            scale_loss_0 -> loss_g0

            fc_g_0 [label="w_grad\nGPU0", shape=box]
            loss0 -> fc_g_0
            loss_g0 -> fc_g_0
            hidden_0 -> fc_g_0
        }

        subgraph cluster_gpu1 {
            label="GPU1"
            fc_1 [label="fc\nGPU1", shape=box]
            hidden_1 [label="hidden\nGPU1"]
            node_w_g1 -> fc_1
            fc_1 -> hidden_1
            loss1 [label="loss\nGPU1"]
            hidden_1 -> loss1 [label="many ops omitted"]
            scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box]
            loss_g1 [label="loss_grad\nGPU1"]
            scale_loss_1 -> loss_g1

            fc_g_1 [label="w_grad\nGPU1", shape=box]
            loss1 -> fc_g_1
            loss_g1 -> fc_g_1
            hidden_1 -> fc_g_1
        }
    }

    all_reduce_w [label="Merge Gradients(AllReduce)", shape=box]
    fc_g_0 -> all_reduce_w
    fc_g_1 -> all_reduce_w

    fc_g_0_merged [label="w_grad\nMerged\nGPU0"]
    fc_g_1_merged [label="w_grad\nMerged\nGPU1"]
    all_reduce_w -> fc_g_0_merged
    all_reduce_w -> fc_g_1_merged

    subgraph cluster_optimization {
        label="Optimization"
        subgraph cluster_opt_gpu0 {
            label="GPU0"
            sgd_0 [label="SGD Op\nGPU0", shape=box]

            fc_g_0_merged -> sgd_0
            node_w_g0 -> sgd_0
            optimized_w_0 [label="Optimized W\nGPU0"]
            sgd_0 -> optimized_w_0
        }
        subgraph cluster_opt_gpu1 {
            label="GPU1"
            sgd_1 [label="SGD Op\nGPU1", shape=box]

            fc_g_1_merged -> sgd_1
            node_w_g1 -> sgd_1
            optimized_w_1 [label="Optimized W\nGPU1"]
            sgd_1 -> optimized_w_1
        }
    }
}

(binary image added: 175 KiB)
@ -0,0 +1,104 @@
# ParallelExecutor

## Background

Neural network models are defined as a `ProgramDesc` in Fluid. The `ProgramDesc` can be executed by an interpreter (i.e., the `executor` concept in Fluid). The instructions or operators in a `Program` will be executed, and the results will be fetched on the Python side.

The executor is a very naive interpreter: it runs operators one by one. We can use `Parallel.Do` to support data parallelism; however, since `ProgramDesc` lacks device information, it is not possible to optimize the performance of `Parallel.Do`.

We want a `ProgramDesc` that can run on different nodes, so it is better not to embed device information in `ProgramDesc`. However, we can write a high-performance interpreter that holds an alternative intermediate representation of `ProgramDesc` and takes full advantage of multiple GPUs.

ParallelExecutor is an interpreter of `ProgramDesc` which will [execute the `Program` out of order](https://en.wikipedia.org/wiki/Out-of-order_execution) in data-parallel mode and maximize the utilization of multiple GPUs.

## Overview of Multi-GPU Logic

ParallelExecutor takes the startup program and the main program as inputs. The parameters are initialized on `GPU0` by the startup program and then broadcast to the other GPUs. The main program is duplicated onto each GPU. The gradients are merged during each iteration, and each device then optimizes its parameters independently. Since the gradients on each device are merged before parameter optimization, the parameters stay identical on every device, and there is no need to broadcast them again.

![Parallel executor overview](images/parallel_executor_overview.png)

There are several optimizations for this logic.

1. We use an alternative representation in ParallelExecutor because device information is critical for performance optimization.
2. The execution is out of order, i.e., an operator is executed whenever its inputs are ready.
    * A GPU is a high-performance device; a single CPU thread cannot keep one GPU busy, so a thread pool is used to execute operators (see the sketch after this list).
    * Out-of-order execution also helps transpilers generate `ProgramDesc`: a transpiler does not need to worry about the best execution order for performance.
3. Computation, gradient merging, and data fetching use different streams.
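
As a reference for the thread-pool bullet above: the `extern_threadpool` rule earlier in this diff fetches the progschj `ThreadPool` header, and a minimal usage sketch (an illustration, not Paddle code) looks like this:

```c++
#include <future>
#include <iostream>
#include <vector>

#include "ThreadPool.h"  // header-only library fetched by extern_threadpool

int main() {
  ThreadPool pool(4);  // four worker threads sharing one task queue

  // enqueue() returns a std::future; an executor could submit each
  // operator this way as soon as the operator's inputs are ready.
  std::vector<std::future<int>> results;
  for (int i = 0; i < 8; ++i) {
    results.emplace_back(pool.enqueue([i] { return i * i; }));
  }
  for (auto &r : results) std::cout << r.get() << " ";
  std::cout << "\n";
}
```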

The performance of `ResNeXt152` on `TitanX` with `batch_size=12` is shown below.

| Number of GPUs | 1 | 2 | 3 | 4 |
| --- | --- | --- | --- | --- |
| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 |
| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 |

## Static Single Assignment Graph

[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form) (`SSA` for short) is a common form for compiler optimization. To implement concurrent execution, we use an `SSA` graph as an intermediate representation of `ProgramDesc`.

In a `Program`, a variable can be assigned multiple times. We enforce that each variable is assigned only once by adding a version number to variables, and we parse the `Program` into an `SSA` graph, which is a directed acyclic graph. Also, ParallelExecutor duplicates the `Program` onto multiple devices, so we add a device number to variables and insert `NCCLAllReduce` operators into the graph.

The data structure of the `SSA` graph is:

```c++
struct VarHandleBase {
  // The operator that generates this variable; nullptr if the variable
  // is not generated by any operator.
  OpHandleBase* generated_op_;
  // The operators that consume this variable as an input.
  vector<OpHandleBase*> pending_ops_;

  string name;
  Place place;
  size_t version;
};

struct OpHandleBase {
  // The variable handles this operator reads and writes.
  vector<VarHandleBase*> inputs_;
  vector<VarHandleBase*> outputs_;
};

struct SSAGraph {
  // Variables on each device:
  // * each map in the vector holds the variables of one device;
  // * each map maps a variable name to variable handles with
  //   different versions.
  vector<std::unordered_map<string, vector<VarHandleBase>>> vars_;

  // All operators.
  vector<OpHandleBase> ops_;
};
```

Variable handles are wrappers of `Variable`s. Operator handles are wrappers of `OperatorBase`. Some `OpHandle`s do not wrap an `OperatorBase`; for example, `NCCLAllReduceOpHandle` is not one, because it uses its own device contexts.

When the `ProgramDesc` is converted into an `SSA` graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem also needs to be taken care of. Dummy variables, which represent dependencies between operators, are manually inserted into the SSA graph to resolve the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem.

## Execute SSA Graph

The SSA graph can be executed out of order by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is:

1. Maintain a map from each operator to the number of its inputs that are not yet ready.
2. If a variable is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the pending-input count of each of its pending operators.
3. Whenever an operator's pending-input count drops to zero, run that operator.
4. After an operator runs, mark its output variables as generated and repeat step 2 until all variables are generated.
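
Below is a minimal, single-threaded C++ sketch of this counting scheme. The `VarHandle`/`OpHandle` shapes are simplified from the data structures in the previous section, and `Run()` stands in for the real kernel launch; in the actual executor, each ready operator would be submitted to the thread pool instead of run inline.

```c++
#include <cstddef>
#include <queue>
#include <unordered_map>
#include <vector>

struct OpHandle;

struct VarHandle {
  OpHandle* generated_op = nullptr;    // nullptr: ready from the start
  std::vector<OpHandle*> pending_ops;  // operators consuming this variable
};

struct OpHandle {
  std::vector<VarHandle*> inputs;
  std::vector<VarHandle*> outputs;
  void Run() { /* stand-in for the real kernel launch */ }
};

// Approximate topological execution: run each operator as soon as all
// of its inputs have been generated.
void RunSSAGraph(const std::vector<OpHandle*>& ops) {
  std::unordered_map<OpHandle*, size_t> pending_inputs;
  std::queue<OpHandle*> ready;

  // Step 1: count the not-yet-generated inputs of every operator.
  // Inputs with generated_op == nullptr are ready from the start (step 2).
  for (OpHandle* op : ops) {
    size_t n = 0;
    for (VarHandle* in : op->inputs) {
      if (in->generated_op != nullptr) ++n;
    }
    pending_inputs[op] = n;
    if (n == 0) ready.push(op);  // all inputs are already available
  }

  // Steps 3-4: run ready operators; each finished operator marks its
  // outputs generated, possibly unblocking its consumers.
  while (!ready.empty()) {
    OpHandle* op = ready.front();
    ready.pop();
    op->Run();
    for (VarHandle* out : op->outputs) {
      for (OpHandle* consumer : out->pending_ops) {
        if (--pending_inputs[consumer] == 0) ready.push(consumer);
      }
    }
  }
}
```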

In the real implementation, running an operator is asynchronous: there is a thread pool to execute the `SSA` graph.

## Synchronize GPU Kernels

The GPU is a non-blocking device, so different streams need to be synchronized when execution switches between streams. In the current implementation, the synchronization is based on the following algorithm:

1. Each `OpHandle` records the `DeviceContext` that it uses.
2. In `OpHandle::Run`, if the `DeviceContext` of the current operator is different from the `DeviceContext` of any input variable, wait for the operator that generated that input variable to complete.

The wait is implemented by one of two strategies:

1. Invoke `DeviceContext->Wait()`, which blocks until all operators on that device context have completed.
2. Use `cudaStreamWaitEvent` to enqueue an event wait into the stream. This is a non-blocking host call; the waiting is performed on the GPU (see the sketch after this list).
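
For the second strategy, the host-side pattern looks roughly like this (a generic CUDA-runtime illustration, not Paddle's implementation; `StreamWait` is a hypothetical helper name):

```c++
#include <cuda_runtime.h>

// Make work later enqueued on `consumer` wait for everything already
// enqueued on `producer`, without blocking the calling CPU thread.
void StreamWait(cudaStream_t producer, cudaStream_t consumer) {
  cudaEvent_t event;
  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
  cudaEventRecord(event, producer);         // marks the producer's current tail
  cudaStreamWaitEvent(consumer, event, 0);  // GPU-side wait; host returns now
  cudaEventDestroy(event);  // safe: destruction is deferred until completion
}
```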

Generally, `cudaStreamWaitEvent` has better performance; however, the `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed at runtime.

## What's next?

* Merging gradients of dense parameters has been done. However, the merging of sparse parameters has not.
* The CPU version of ParallelExecutor has not been implemented. The out-of-order logic will make CPU computation faster, too.
* A better strategy to merge gradients can be introduced, e.g., shrinking the gradients from `float32` to `int8` or `int4` while merging. This would significantly speed up multi-GPU training without much loss of precision.
* Combine with the multi-node implementation. Thanks to out-of-order execution, the send and receive operators can be blocking operators, and the transpiler does not need to worry about their best position.

@ -0,0 +1 @@
../../v2/build_and_install/build_from_source_cn.rst
@ -0,0 +1 @@
../../v2/build_and_install/build_from_source_en.rst
@ -0,0 +1 @@
../../v2/build_and_install/docker_install_cn.rst
@ -0,0 +1 @@
../../v2/build_and_install/docker_install_en.rst
@ -1,2 +0,0 @@
安装与使用
------------
@ -0,0 +1 @@
../../v2/build_and_install/index_cn.rst
@ -1,2 +0,0 @@
Build and Install
------------
@ -0,0 +1 @@
../../v2/build_and_install/index_en.rst
@ -0,0 +1 @@
../../v2/build_and_install/pip_install_cn.rst
@ -0,0 +1 @@
../../v2/build_and_install/pip_install_en.rst
@ -0,0 +1,7 @@
梯度更新算法
------------

.. toctree::
    :maxdepth: 1

    parameter_average.md
@ -0,0 +1,7 @@
Gradient Update Algorithm
--------------------------------------

.. toctree::
    :maxdepth: 1

    parameter_average.md
@ -0,0 +1,18 @@
核心概念
-------------

.. toctree::
    :maxdepth: 1

    README.md
    cpp_data_feeding.md
    functions_operators_layers.md
    program.md
    variable.md
    var_desc.md
    tensor.md
    tensor_array.md
    lod_tensor.md
    block.md
    scope.md
    executor.md
@ -0,0 +1,18 @@
Core Concepts
--------------------------------------

.. toctree::
    :maxdepth: 1

    README.md
    cpp_data_feeding.md
    functions_operators_layers.md
    program.md
    variable.md
    var_desc.md
    tensor.md
    tensor_array.md
    lod_tensor.md
    block.md
    scope.md
    executor.md
@ -0,0 +1,139 @@
# Channel Design

## Introduction

A Channel is a data structure that allows for synchronous interprocess
communication via message passing. It is a fundamental component of CSP
(communicating sequential processes), and it allows users to pass data
between threads without having to worry about synchronization.

## How to use it

Paddle offers Python APIs to open and close channels, along with sending
and receiving data to/from a channel.

### Create a channel

Creates a new channel that takes in variables of a specific dtype.

- **fluid.make_channel(dtype, capacity=0)**
  - **dtype**: The data type of variables being sent/received through the channel
  - **capacity**: The capacity of the channel. A capacity of 0 represents
    an unbuffered channel; a capacity > 0 represents a buffered channel.

```
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=10)
```

### Close a channel

Closes a channel. Any pending senders and receivers will be awoken during
this time. Receivers can still receive from a closed channel, but senders
are not allowed to send any additional data to the channel (Paddle will
raise an exception if users try to send to a closed channel).

- **fluid.channel_close(channel)**

```
fluid.channel_close(ch)
```

### Send data to a channel

Sends a variable to a channel. Currently, variables of dtype `LoDTensor`,
`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
`ChannelHolder` are supported.

By default, the data of the Variable is moved from the sender to the receiver;
however, the user can optionally copy the data before performing the send.

- **channel_send(channel, variable, is_copy=False)**
  - **channel**: The channel to send the variable to
  - **variable**: The variable to send to the channel
  - **is_copy**: If set to True, channel_send will perform a variable assign
    to copy the source variable to a new variable to be sent.

```
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
var = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=100)
fluid.channel_send(ch, var, True)
```

### Receive data from a channel

Receives a variable from a channel. The data of the variable is moved to the
receiving variable.

- **channel_recv(channel, return_variable)**
  - **channel**: The channel to receive the variable from
  - **return_variable**: The destination variable used to store the data of the
    variable received from the channel

```
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
var = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
fluid.channel_recv(ch, var)
```

## How it Works

Channels provide a simple interface for different threads to share data.
To support the synchronization requirements, channels utilize a series of
internal queues, locks, and condition variables.

### QueueMessage

QueueMessage encapsulates the state of a channel send/receive operation to be
put in the **sendq/recvq**. It contains a condition variable used to block the
thread (when there are no available sends/receives). In addition, it contains
a callback function to notify a thread when the QueueMessage is being
processed by the channel.

### Queues

- **buff_**: This queue holds the data buffer in a buffered channel. Its
  capacity is set to the capacity of the channel. This data buffer is not
  used in an unbuffered channel.

- **sendq**: This queue holds the QueueMessage of any pending senders of a
  channel. When a thread performs a channel_send operation on the channel, the
  channel_send operation will put a new QueueMessage on the sendq and block the
  current thread under two conditions:
  1. The channel is buffered and is full
  2. The channel is unbuffered and does not have a receiver

- **recvq**: This queue holds the QueueMessage of any pending receivers of a
  channel. When a thread performs a channel_recv operation on the channel, the
  channel_recv operation will put a new QueueMessage on the recvq and block the
  current thread under two conditions:
  1. The channel is buffered and there is no data in the buff_
  2. The channel is unbuffered and does not have a sender

A simplified sketch of this queue-and-condition-variable structure is shown
below.
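
The C++ sketch below illustrates the buffered case with one mutex, a data
buffer (`buff_`), and two condition variables standing in for the blocked
senders and receivers of the sendq/recvq. It is an illustration of the
mechanism only, not Paddle's actual Channel implementation; the unbuffered
rendezvous and the QueueMessage callbacks are omitted.

```
#include <condition_variable>
#include <cstddef>
#include <deque>
#include <mutex>
#include <stdexcept>

template <typename T>
class SimpleChannel {
 public:
  explicit SimpleChannel(size_t capacity) : capacity_(capacity) {}

  // Blocks while the buffer is full (the "pending sender" case).
  void Send(T value) {
    std::unique_lock<std::mutex> lock(mu_);
    send_cv_.wait(lock, [&] { return closed_ || buff_.size() < capacity_; });
    if (closed_) throw std::runtime_error("send on closed channel");
    buff_.push_back(std::move(value));
    recv_cv_.notify_one();  // wake one pending receiver
  }

  // Blocks while the buffer is empty (the "pending receiver" case).
  // Returns false once the channel is closed and drained.
  bool Recv(T* out) {
    std::unique_lock<std::mutex> lock(mu_);
    recv_cv_.wait(lock, [&] { return closed_ || !buff_.empty(); });
    if (buff_.empty()) return false;
    *out = std::move(buff_.front());
    buff_.pop_front();
    send_cv_.notify_one();  // wake one pending sender
    return true;
  }

  // Wakes every blocked sender and receiver, as described above.
  void Close() {
    std::lock_guard<std::mutex> lock(mu_);
    closed_ = true;
    send_cv_.notify_all();
    recv_cv_.notify_all();
  }

 private:
  const size_t capacity_;
  std::mutex mu_;
  std::condition_variable send_cv_, recv_cv_;
  std::deque<T> buff_;  // the buffered data, as in buff_ above
  bool closed_ = false;
};
```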

### State diagram

#### Channel Send

<p align="center">
<img src="./images/channel_send.png"/><br/>
</p>

#### Channel Receive

<p align="center">
<img src="./images/channel_recv.png"/><br/>
</p>

## Limitations and Considerations

### Variable Copy

In golang, variables in channels are copied from the sender to the receiver.
In Paddle, the data from our variables are **moved** from sender to receiver.
As a result, these variables should not be used after they are sent. We
provide a flag in the channel_send method to allow users to copy the variable
to be sent before it is sent.

Please note that this is achieved by adding an **assign** operator and creating
a temporary variable that is sent in place of the original variable. Also note
that the **assign** operator has limited support for only certain variable
datatypes.