Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into cpp_parallel_executor
@@ -0,0 +1,43 @@
if(NOT WITH_AMD_GPU)
  return()
endif()

include_directories("/opt/rocm/include")
include_directories("/opt/rocm/hipblas/include")
include_directories("/opt/rocm/hiprand/include")
include_directories("/opt/rocm/rocrand/include")
include_directories("/opt/rocm/rccl/include")
include_directories("/opt/rocm/thrust")

list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")

set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14")

if(WITH_DSO)
  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
endif(WITH_DSO)

if(WITH_DOUBLE)
  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
endif(WITH_DOUBLE)

if(WITH_TESTING)
  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
endif(WITH_TESTING)

if(CMAKE_BUILD_TYPE STREQUAL "Debug")
  list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
  list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
  list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()

if("x${HCC_HOME}" STREQUAL "x")
  set(HCC_HOME "/opt/rocm/hcc")
endif()

set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
@@ -1 +1,2 @@
add_subdirectory(v2)
add_subdirectory(fluid)
@@ -0,0 +1,49 @@
if(NOT DEFINED SPHINX_THEME)
  set(SPHINX_THEME default)
endif()

if(NOT DEFINED SPHINX_THEME_DIR)
  set(SPHINX_THEME_DIR)
endif()

# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")

# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")

# HTML output directory
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")

configure_file(
  "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
  "${BINARY_BUILD_DIR_EN}/conf.py"
  @ONLY)

sphinx_add_target(paddle_fluid_docs
  html
  ${BINARY_BUILD_DIR_EN}
  ${SPHINX_CACHE_DIR_EN}
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${SPHINX_HTML_DIR_EN})

# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")

# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")

# HTML output directory
set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")

configure_file(
  "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
  "${BINARY_BUILD_DIR_CN}/conf.py"
  @ONLY)

sphinx_add_target(paddle_fluid_docs_cn
  html
  ${BINARY_BUILD_DIR_CN}
  ${SPHINX_CACHE_DIR_CN}
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${SPHINX_HTML_DIR_CN})
@@ -0,0 +1,2 @@
Installation and Usage
----------------------
@@ -0,0 +1,2 @@
Build and Install
-----------------
@@ -0,0 +1,128 @@
## Design Doc: Distributed Lookup Table Operator

A lookup table operator in PaddlePaddle where the table could be out
of the memory of a computer.

## Background

A lookup table operator is widely used in deep learning for learning
the representation, or the
[*embedding*](http://www.cs.toronto.edu/~fritz/absps/ieee-lre.pdf), of
symbols.

### The Forward Algorithm

The forward algorithm of the lookup table is a multiplication of the
input vector x and the lookup table matrix W:

$$y = x * W$$

When x is a sparse vector of symbols, the above multiplication
simplifies into looking up the rows in W that correspond to the symbols
in x, denoted by W(x). Please be aware that W could be huge and out of
memory, so we'd need a distributed storage service which supports the
lookup of rows.

The following figure illustrates the multiplication of x with two
non-zero elements, or say, two symbols, and a lookup table W:

![lookup table](./src/lookup_table.png)
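To make the row-lookup view concrete, here is a minimal NumPy sketch
(illustrative only, not PaddlePaddle's actual kernel); `lookup_forward`,
`W`, and `ids` are hypothetical names:

```python
import numpy as np

def lookup_forward(W, ids):
    """Gather the rows of the table W selected by the symbol ids.

    Equivalent to y = x * W when x is the sparse one-hot encoding of ids.
    """
    return W[ids]  # W(x): one embedding row per input symbol

# Example: a table of 5 symbols with 3-dimensional embeddings.
W = np.random.rand(5, 3)
ids = np.array([1, 4])      # x has two non-zero elements (two symbols)
y = lookup_forward(W, ids)  # shape (2, 3): the rows W[1] and W[4]
```
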
### The Backward Algorithm

The backward algorithm computes W'(x) using W(x). W'(x) has the same
size as W(x) and is much smaller than W.

To optimize W given W', we can do a simple SGD update:

$$W = f(W') = \lambda * W'$$

or use some more sophisticated algorithm that relies on both W' and W:

$$W = f(W, W')$$

The following figure illustrates the backward pass of the lookup
operator: ![lookup table training](./src/lookup_table_training.png)
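Reading the $W = f(W') = \lambda * W'$ form above as the usual sparse SGD
step $W(x) \leftarrow W(x) - \lambda W'(x)$, a hedged NumPy sketch of the
update could look like the following; `lookup_backward_sgd` and its
arguments are illustrative names only:

```python
import numpy as np

def lookup_backward_sgd(W, ids, grad_rows, lam=0.01):
    """Scatter the upstream gradient W'(x) back into the table W.

    Only the rows selected by ids are touched; the rest of the
    (potentially huge) table W stays untouched.
    """
    np.add.at(W, ids, -lam * grad_rows)  # handles repeated ids correctly
    return W
```
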
## Distributed Storage Service

The forward algorithm requires a distributed storage service for W.
The backward algorithm prefers that the storage system can apply the
optimization algorithm on W. The following two sections describe two
solutions -- the former doesn't require that the storage service can
do optimization, the latter does.

### Storage Service Doesn't Optimize

In this design, we use a highly-optimized distributed storage service,
e.g., memcached, as the storage service, and we run the optimization
algorithm on the parameter servers of PaddlePaddle. The following
figure illustrates the training process.

<!--
Note: please update the following URL when updating this digraph.
<img src='https://g.gravizo.com/svg?
digraph G {
rankdir="LR";
subgraph cluster1 {
P1 [label="pserver 1"];
P2 [label="pserver 2"];
T1 [label="trainer 1"];
T2 [label="trainer 2"];
T3 [label="trainer 3"];
}
KV [label="memcached"];
T1 -> P1;
T1 -> P2;
T2 -> P1;
T2 -> P2;
T3 -> P1;
T3 -> P2;
P1 -> KV [color=gray, weight=0.1];
KV -> P1 [color=gray, weight=0.1];
P2 -> KV [color=gray, weight=0.1];
KV -> P2 [color=gray, weight=0.1];
KV -> T1 [color=gray, weight=0.1];
KV -> T2 [color=gray, weight=0.1];
KV -> T3 [color=gray, weight=0.1];
}
)
'/>
-->

<img src='https://g.gravizo.com/svg?%20digraph%20G%20{%20rankdir=%22LR%22;%20subgraph%20cluster1%20{%20P1%20[label=%22pserver%201%22];%20P2%20[label=%22pserver%202%22];%20T1%20[label=%22trainer%201%22];%20T2%20[label=%22trainer%202%22];%20T3%20[label=%22trainer%203%22];%20}%20KV%20[label=%22memcached%22];%20T1%20-%3E%20P1;%20T1%20-%3E%20P2;%20T2%20-%3E%20P1;%20T2%20-%3E%20P2;%20T3%20-%3E%20P1;%20T3%20-%3E%20P2;%20P1%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P1%20[color=gray,%20weight=0.1];%20P2%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T1%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T3%20[color=gray,%20weight=0.1];%20}'/>

Each trainer runs the forward and backward passes using its local
data:

1. In the forward pass, when a trainer runs the forward algorithm of a
   lookup operator, it retrieves W(x) from the storage service.
1. The trainer computes W'(x) in the backward pass using W(x). (A
   sketch of both steps follows this list.)
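A minimal sketch of the trainer side, assuming a key-value client
`kv_get` for the storage service and a placeholder `send_to_pserver`
channel (neither is PaddlePaddle's real API; the network layer and most
of the model are elided):

```python
import numpy as np

def trainer_lookup_step(kv_get, send_to_pserver, ids):
    """One trainer-side step of the distributed lookup operator."""
    # Forward pass: retrieve only the rows W(x) from the storage service.
    w_x = np.stack([kv_get(f"W:{int(i)}") for i in ids])

    # ... the rest of the forward and backward passes run here, eventually
    # producing the gradient with respect to the looked-up rows ...
    grad_w_x = np.zeros_like(w_x)  # W'(x), same size as W(x)

    # Upload W'(x) to the parameter servers for the global update.
    send_to_pserver({int(i): g for i, g in zip(ids, grad_w_x)})
```
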
During the global update process:

1. Each trainer uploads its W'(x) to the parameter servers.
1. The parameter server runs the optimization algorithm, e.g., the
   Adam optimization algorithm, which requires that
   1. the parameter server retrieves W(x) from memcached, and
   1. the parameter server pushes $\Delta W(x)=f(W(x), \lambda \sum_j
      W'_j(x))$ to memcached, where $f$ denotes the optimization
      algorithm and $W'_j(x)$ is the gradient uploaded by trainer $j$
      (a parameter-server-side sketch follows this list).
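A hedged sketch of that update on the parameter-server side, with
memcached accessed through hypothetical `kv_get`/`kv_put` helpers and
plain SGD standing in for Adam as the optimizer $f$:

```python
def pserver_update(kv_get, kv_put, grads_per_trainer, lam=0.01):
    """Apply Delta W(x) = f(W(x), lam * sum_j W'_j(x)) row by row."""
    # Aggregate the sparse gradients W'_j(x) uploaded by all trainers.
    summed = {}
    for grads in grads_per_trainer:       # one dict {row_id: grad_row} per trainer
        for row_id, g in grads.items():
            summed[row_id] = summed.get(row_id, 0) + g

    for row_id, g_sum in summed.items():
        w_row = kv_get(f"W:{row_id}")     # retrieve W(x) from memcached
        w_row = w_row - lam * g_sum       # f: plain SGD in place of Adam
        kv_put(f"W:{row_id}", w_row)      # push the updated row back
```
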
### Storage Service Does Optimize

This design is very similar to the one above, except that the
optimization algorithm $f$ runs on the storage service.

- Pro: the parameter servers do not retrieve W(x) from the storage
  service, thus saving half of the network communication (see the
  sketch below).
- Con: the storage service needs to be able to run the optimization
  algorithm.
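For contrast, a sketch of the same update when the store applies $f$
itself; `kv_apply` is a hypothetical RPC that the storage service would
have to expose:

```python
def pserver_update_in_store(kv_apply, grads_per_trainer, lam=0.01):
    """Forward the aggregated W'(x); the storage service runs f internally."""
    summed = {}
    for grads in grads_per_trainer:
        for row_id, g in grads.items():
            summed[row_id] = summed.get(row_id, 0) + g
    for row_id, g_sum in summed.items():
        # No kv_get round trip: the store applies W = f(W, lam * g_sum) itself.
        kv_apply(f"W:{row_id}", lam * g_sum)
```
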
## Conclusion

Let us do the "storage service does not optimize" solution first, as a
baseline at least, because it is easier to use a well-optimized
distributed storage service like memcached. We can do the "storage
service does optimize" solution later or at the same time, which, if
implemented carefully, should have better performance than the former.
@@ -0,0 +1,2 @@
Design Philosophy
-----------------
@@ -0,0 +1,2 @@
Design
------------
@@ -0,0 +1,2 @@
Development Standards
---------------------
@@ -0,0 +1,4 @@
Development
------------

This is the Development page.