Merge pull request #8 from reyoung/parallel_do

Parallel do
cross_channel_norm
Yang Yang(Tony) 7 years ago committed by GitHub
commit 01560660a7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -20,8 +20,10 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
include(system)
project(paddle CXX C Go)
message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION})
message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION})
message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
"${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
find_package(Sphinx)
if(NOT CMAKE_CROSSCOMPILING)

@ -19,7 +19,7 @@ ExternalProject_Add(
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_eigen3 = \"${dummyfile}\";")
file(WRITE ${dummyfile} "const char *dummy_eigen3 = \"${dummyfile}\";")
add_library(eigen3 STATIC ${dummyfile})
else()
add_library(eigen3 INTERFACE)

@ -30,23 +30,21 @@ IF(NOT ${CBLAS_FOUND})
CACHE FILEPATH "openblas library." FORCE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
SET(OPENBLAS_COMMIT "v0.2.20")
IF(CMAKE_CROSSCOMPILING)
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
IF(ANDROID)
# arm_soft_fp_abi branch of OpenBLAS to support softfp
# https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
# use softfp
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
ENDIF()
ELSEIF(IOS)
IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
@ -56,14 +54,12 @@ IF(NOT ${CBLAS_FOUND})
ENDIF()
ELSEIF(RPI)
# use hardfp
SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
ENDIF()
ELSE()
IF(APPLE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
ENDIF()
SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS "")
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
@ -113,7 +109,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
# FIXME(gangliao): generate cblas target to track all high performance
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
ADD_LIBRARY(cblas STATIC ${dummyfile})
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})

@ -120,7 +120,7 @@ function(merge_static_libs TARGET_NAME)
DEPENDS ${libs})
# Generate dummy staic lib
file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
@ -160,7 +160,7 @@ function(merge_static_libs TARGET_NAME)
DEPENDS ${libs} ${target_OBJS})
# Generate dummy staic lib
file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
@ -324,7 +324,7 @@ function(go_library TARGET_NAME)
)
# Add dummy code to support `make target_name` under Terminal Command
file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";")
if (go_library_SHARED OR go_library_shared)
add_library(${TARGET_NAME} SHARED ${dummyfile})
else()

@ -307,6 +307,12 @@ sequence_expand
:noindex:
gru_unit
--------
.. autofunction:: paddle.v2.fluid.layers.gru_unit
:noindex:
lstm_unit
---------
.. autofunction:: paddle.v2.fluid.layers.lstm_unit

Binary file not shown.

After

Width:  |  Height:  |  Size: 83 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

@ -0,0 +1,217 @@
# Memory Optimization
## Problem
In a lecture from Andrew Ng, he attributes the recent sucess of AI due to a combination of these:
- availability of Big Data
- supercomputing power to process this Big Data over very large neural networks
- modern algorithms
Following graph shows the details:
![](images/deep_learning.png)
Larger model usually brings better performance. However, GPU memory is certain limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large model, we have to take care of memory using. Besides, memory optimization is also necessary in both online/mobile inference.
## Solution
### Basic Strategy
There are some basic strategies to make memory optimization, including in-place operation and memory sharing.
#### In-place Operation
In a relu activation operator
$y = \max(x, 0)$
If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x are the same. In-place operation will save 50% memory occupancy immediately.
#### Memory Sharing
Not all operators support in-place operations. Memory sharing is a more general strategy.
Following is an example:
```
a = op1(b, c);
d = op2(a)
e = op3(d, f)
```
In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finished, we can put the memory of variable a to a memory pool. Then, variable e can share the memory of variable a from the pool.
### Live Variable Analysis
It's not enough to only have some basic strategies. The prerequisite of memory optimization is to know if a variable is still "live" after an operation.
In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation.
In compilers, the front end of the compilers translates programs into an intermediate language with an unbounded number of temporaries. This program must run on a machine with a bounded number of registers. Two temporaries a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporaries can fit in few registers; if they don't all fit, the excess temporaries can be kept in memory.
Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporaries are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis.
We can leran these techniques from compilers. There are mainly two stages to make live variable analysis:
- construct a control flow graph
- solve the dataflow equations
#### Control Flow Graph
To preform analyses on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statemment x can be followed by statement y, there is an egde from x to y.
Following is the flow graph for a simple loop.
![](images/control_flow_graph.png)
#### Dataflow Analysis
liveness of variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
- Flow Graph Terminology
A flow graph node has out-edges that lead to sucessor nodes, and in-edges that come from presucessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of sucessors.
In former control flow graph, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}.
- Uses and Defs
An assignmemt to a variable or temporary defines that variable. An occurence of a variable on the right-hand side of an assginment(or in other expressions) uses the variable. We can speak the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and the similarly for the *use* of a variable or graph node. In former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}.
- Liveness
A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node.
The calcution of liveness can be solved by iteration until a fixed pointer is reached. Following is the recursive formula:
![](images/dataflow_equations.png)
### Memory optimization transpiler
At last, we take basic strategy and liveness analysis techniques learning from compilers to implement our memory optimization transpiler.
#### add in-place attribute
In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator.
#### contruct control flow graph
Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book/test_machine_translation.py) example.
- Block0:
```
lookup_table
mul
...
while(sub-block idx 1)
...
array_to_lod_tensor
cross_entropy
...
while_grad(sub-block idx 2)
read_from_array
array_to_lod_tensor
...
```
- Block1
```
read_from_array
read_from_array
...
write_to_array
increment
write_to_array
less_than
```
- Block2
```
read_from_array
increment
...
write_to_array
write_to_array
```
We can transfer all the operators and variables in ProgramDesc to build a control flow graph.
```python
class ControlFlowGraph(object):
def __init__(self, Program):
self._sucessors = defaultdict(set)
self._presucessors = defaultdict(set)
self._uses = defaultdict(set)
self._defs = defaultdict(set)
self._live_in = defaultdict(set)
self._live_out = defaultdict(set)
self._program = Program
def build(self):
pass
def dataflow_analysis(self):
pass
def memory_optimization(self):
pass
def get_program(self):
return self._program
```
#### make dataflow analysis
We follow guide from compilers and try to solve the dataflow equation to get liveness of every variable. If the live-in of an operator node is different from the live-out, then we can make memory sharing.
For example:
```
a = op1(b, c);
d = op2(a)
e = op3(d, f)
```
The dataflow analysis result is:
```
live_in(op1) = {b, c, f}
live_out(op1) = {a, f}
live_in(op2) = {a, f}
live_out(op2) = {d, f}
live_in(op3) = {d, f}
live_out(op3) = {}
```
After op1, we can process variable b and variable c; After op2, we can process variable a. After op3, we can process variable d and variable f.
#### memory sharing policy
A memory pool will be mantained in the stage of memory optimization. Each operator node will be scanned to determine memory optimization is done or not. If an operator satifies the requirement, following policy will be taken to handle input/output variables.
```
if op.support_inplace():
i --> pool
pool --> o
else:
pool --> o
i --> pool
```
## Reference
- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5)
- Modern compiler implementation in ML, by Andrew W. Appel
- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html)

@ -48,8 +48,8 @@ Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/dev
```
/-> CPUDeviceContext --> MKLDeviceContext
DeviceContext ----> CUDADeviceContext --> CUDNNDeviceContext
/-> CPUDeviceContext
DeviceContext ----> CUDADeviceContext
\-> FPGADeviceContext
```
@ -79,16 +79,6 @@ private:
};
```
- CUDNNDeviceContext
```
class CUDNNDeviceContext : public CUDADeviceContext {
private:
cudnnHandle_t cudnn_handle_;
};
```
### Memory and Tensor

@ -1,8 +1,9 @@
# Android平台编译指南
用户可通过如下两种方式交叉编译Android平台上适用的PaddlePaddle库
- 基于Docker容器的编译方式
- 基于Linux交叉编译环境的编译方式
- [基于Docker容器的编译方式](#基于docker容器的编译方式)
- [基于Linux交叉编译环境的编译方式](#基于linux交叉编译环境的编译方式)
## 基于Docker容器的编译方式
Docker能在所有主要操作系统包括LinuxMac OS X和Windows上运行因此使用基于Docker容器的编译方式用户可在自己熟悉的开发平台上编译Android平台上适用的PaddlePaddle库。
@ -16,6 +17,12 @@ $ cd Paddle
$ docker build -t username/paddle-android:dev . -f Dockerfile.android
```
用户也可以使用PaddlePaddle提供的官方开发镜像
```bash
$ docker pull paddlepaddle/paddle:latest-dev-android
```
### 编译PaddlePaddle C-API库
构建好开发镜像后即可使用开发镜像来编译Android版PaddlePaddle C-API库。
Android的Docker开发镜像向用户提供两个可配置的参数
@ -41,23 +48,25 @@ Android的Docker开发镜像向用户提供两个可配置的参数
</tr>
<tr class="row-odd">
<td>ANDROID_API</td>
<td>>= 21</td>
<td>>= 16</td>
<td>21</td>
</tr>
</tbody>
</table>
- 编译`armeabi-v7a``Android API 21`的PaddlePaddle库
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
```
- 编译`arm64-v8a``Android API 21`的PaddlePaddle库
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
```
执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a``ANDROID_API<21`Docker使`Android API 21`****DockerPaddlePaddleC-API`$PWD/install_android``$PWD/install_android/third_party`
执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a``ANDROID_API<21`Docker使`Android API 21`[](#)DockerPaddlePaddleC-API`$PWD/install_android``$PWD/install_android/third_party`
## 基于Linux交叉编译环境的编译方式
本文档将以Linux x86-64平台为例介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。
@ -83,6 +92,7 @@ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain
此命令将在`your/path/to/arm_standalone_toolchain`目录生成一套独立编译工具链面向架构为32位ARM架构支持的最小的Android API级别为21支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。
- 构建`arm64-v8a`、 `Android API 21`的独立工具链:
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
@ -90,14 +100,12 @@ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain
此命令将在`your/path/to/arm64_standalone_toolchain`目录生成一套独立编译工具链面向架构为64位ARM64架构支持的最小Android API级别为21支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。
注意:**PaddlePaddle要求使用的编译工具链所支持的Android API级别不小于21**。
### 配置交叉编译参数
CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake)以提供一些默认的编译器和编译参数相关配置。注意从CMake 3.7版本开始CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时将会将用户传进来的配置参数传递CMake系统交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。
交叉编译Android版本的PaddlePaddle库时有一些必须配置的参数
- `CMAKE_SYSTEM_NAME`CMake编译的目标平台必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及Android所需`arm_soft_fp_abi`分支的目标机版OpenBLAS库。此外还会强制设置一些PaddlePaddle参数的值`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。
- `CMAKE_SYSTEM_NAME`CMake编译的目标平台必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本并自动编译PaddlePaddle所需的所有第三方库。此外还会强制设置一些PaddlePaddle参数的值`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`、`WITH_MKL=OFF`、`WITH_GOLANG=OFF`)。
- `WITH_C_API`,必须设置为`ON`。在Android平台上只支持使用C-API来预测。
- `WITH_SWIG_PY`,必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。
@ -119,7 +127,7 @@ Android平台可选配置参数
其他配置参数:
- `USE_EIGEN_FOR_BLAS`是否使用Eigen库进行矩阵计算。可设置`ON/OFF`,默认值为`OFF`。
- `HOST_C/CXX_COMPILER`宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。
- `HOST_C/CXX_COMPILER`宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC/CXX`的值;若环境变量`CC/CXX`没有设置,则设置成`cc/c++`编译器。
常用的cmake配置如下
@ -147,9 +155,10 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
..
```
用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。
用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。
**性能TIPS**为了达到最快的计算速度在CMake参数配置上有以下建议
- 设置`CMAKE_BUILD_TYPE`为`Release`
- 使用`clang`编译工具链
- `armeabi-v7a`时,设置`USE_EIGEN_BLAS=ON`使用Eigen进行矩阵计算`arm64-v8a`时,设置`USE_EIGEN_FOR_BLAS=OFF`使用OpenBLAS进行矩阵计算

@ -1,6 +1,9 @@
# Build PaddlePaddle for Android
There are two approaches to build PaddlePaddle for Android: using Docker and on Linux without Docker.
There are two approaches to build PaddlePaddle for Android:
- [Cross-Compiling Using Docker](#cross-compiling-using-docker)
- [Cross-Compiling on Linux](#cross-compiling-on-linux)
## Cross-Compiling Using Docker
@ -16,6 +19,12 @@ $ cd Paddle
$ docker build -t paddle:dev-android . -f Dockerfile.android
```
Users can directly use the published Docker image.
```bash
$ docker pull paddlepaddle/paddle:latest-dev-android
```
### Build the Inference Library
We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
@ -47,7 +56,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
</tr>
<tr class="row-odd">
<td>ANDROID_API</td>
<td>>= 21</td>
<td>>= 16</td>
<td>21</td>
</tr>
</tbody>
@ -93,15 +102,13 @@ Android NDK includes everything we need to build the [*standalone toolchain*](ht
The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.
**Please be aware that the minimum level of Android API required by PaddlePaddle is 21.**
### Cross-Compiling Arguments
CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which support Android cross-compiling. PaddlePaddle detects the CMake version, for those newer than 3.7, it uses [the official version](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling).
Some other CMake arguments you need to know:
- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, and `WITH_RDMA=OFF`.
- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF` and `WITH_GOLANG=OFF`.
- `WITH_C_API` must be `ON`, to build the C-based inference library for Android.
- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API.
@ -123,7 +130,7 @@ Some Android-specific arguments:
Other useful arguments:
- `USE_EIGEN_FOR_BLAS`: indicates if using Eigen. Could be `ON` or `OFF`, defaults to `OFF`.
- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC`, or `cc`.
- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC/C++`, or `cc/c++`.
Some frequent configurations for your reference:
@ -158,6 +165,7 @@ There are some other arguments you might want to configure.
- `CMAKE_BUILD_TYPE-Release` optimizes the runtime performance.
Our own tip for performance optimization to use clang and Eigen or OpenBLAS:
- `CMAKE_BUILD_TYPE=Release`
- `ANDROID_TOOLCHAIN=clang`
- `USE_EIGEN_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`.

@ -1,4 +1,4 @@
# PaddlePaddle Compiling Guide for iOS
# Build PaddlePaddle for iOS
This tutorial will walk you through cross compiling the PaddlePaddle library for iOS from the source in MacOS.
@ -98,7 +98,7 @@ You can set other compiling parameters for your own need. I.E. if you are trying
- set `CMAKE_BUILD_TYPE` with `Release`
- set `IOS_USE_VECLIB_FOR_BLAS` with `ON`
## Compile and install
## Build and install
After CMake, run following commands, PaddlePaddle will download the compile 3rd party dependencies, compile and install PaddlePaddle inference library.
@ -109,7 +109,7 @@ $ make install
Please Note: if you compiled PaddlePaddle in the source directory for other platforms, do remove `third_party` and `build` directory within the source with `rm -rf` to ensure that all the 3rd party libraries dependencies and PaddlePaddle is newly compiled with current CMake configuration.
`your/path/to/install` directory will have following directories after `compile` and `install`:
`your/path/to/install` directory will have following directories after `make install`:
- `include`, contains all the C-API header files.
- `lib`, contains PaddlePaddle C-API static library.

@ -24,6 +24,7 @@ else()
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(pybind)
add_subdirectory(inference)
endif()
if(WITH_SWIG_PY)

@ -26,7 +26,10 @@ nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_test(variable_test SRCS variable_test.cc)
cc_library(scope SRCS scope.cc DEPS glog)
cc_library(threadpool SRCS threadpool.cc)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(scope SRCS scope.cc DEPS glog threadpool)
cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto)
@ -70,8 +73,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
cc_library(threadpool SRCS threadpool.cc)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece)
cc_test(init_test SRCS init_test.cc DEPS init)

@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <functional>
#include "paddle/framework/data_transform.h"
#include "paddle/framework/lod_tensor.h"
@ -74,26 +75,28 @@ void TransDataType(const platform::DeviceContext* ctx,
}
}
void TransDataLayout(const platform::DeviceContext* ctx,
void TransDataLayout(const std::vector<int>& axis,
const platform::DeviceContext* ctx,
const KernelTypePair& kernel_pair, const Variable& in,
Variable* out) {
PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
PADDLE_ENFORCE(in.IsType<Tensor>(), "Only support Tensor transform!.");
PADDLE_ENFORCE(
platform::places_are_same_class(kernel_pair.first.place_,
kernel_pair.second.place_),
"TransDataType Only Support DataType transform on same place!");
"TransDataLayout only support DataLayout transform on same place!");
PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_,
"TransDataLayout only support Datatype are same!");
auto src = in.Get<Tensor>();
auto* dst = out->GetMutable<Tensor>();
PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!");
auto src_dim = src.dims();
dst->Resize(src_dim);
auto place = kernel_pair.second.place_;
CopyFrom(src, place, *ctx, dst);
const std::vector<int> axis = {0, 2, 3, 1};
auto src_dim = src.dims();
std::vector<int64_t> dst_dim;
dst_dim.resize(axis.size());
for (size_t i = 0; i < axis.size(); i++) {
dst_dim[i] = src_dim[axis[i]];
@ -102,7 +105,7 @@ void TransDataLayout(const platform::DeviceContext* ctx,
dst->Resize(make_ddim(dst_dim));
auto src_type = kernel_pair.first.data_type_;
framework::VisitDataType(src_type, CastDataLayout(src, dst, ctx, axis));
framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst));
dst->set_layout(kernel_pair.second.data_layout_);
}
@ -111,5 +114,22 @@ void TransDataLayout(const platform::DeviceContext* ctx,
} // namespace paddle
namespace f = paddle::framework;
namespace {
std::vector<int> NHWC2NCHW = {0, 3, 1, 2};
std::vector<int> NCHW2NHWC = {0, 2, 3, 1};
}
REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType);
REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW, f::TransDataLayout);
REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW,
std::bind(f::TransDataLayout, NHWC2NCHW,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4));
REGISTER_DATA_TRANSFORM_FN(f::KernelNCHW, f::KernelNHWC,
std::bind(f::TransDataLayout, NCHW2NHWC,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4));

@ -73,6 +73,7 @@ struct CastDataType {
auto numel = in_.numel();
auto* in_end = in_begin + numel;
auto* out_begin = out_->mutable_data<OutType>(place);
if (platform::is_cpu_place(place)) {
platform::Transform<platform::CPUDeviceContext> trans;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
@ -86,9 +87,9 @@ struct CastDataType {
};
struct CastDataLayout {
CastDataLayout(const framework::Tensor& in, framework::Tensor* out,
const platform::DeviceContext* ctx,
const std::vector<int>& axis)
CastDataLayout(const platform::DeviceContext* ctx,
const std::vector<int>& axis, const framework::Tensor& in,
framework::Tensor* out)
: in_(in), out_(out), ctx_(ctx), axis_(axis) {}
const framework::Tensor in_;
framework::Tensor* out_;
@ -98,6 +99,7 @@ struct CastDataLayout {
template <typename T>
void operator()() {
auto place = ctx_->GetPlace();
if (platform::is_cpu_place(place)) {
operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);

@ -106,7 +106,7 @@ TEST(DataTransform, Register) {
ASSERT_EQ(test_value, 2);
}
TEST(DataTransform, Layout) {
TEST(DataTransform, DataLayout) {
using namespace paddle::framework;
using namespace paddle::platform;
@ -127,7 +127,19 @@ TEST(DataTransform, Layout) {
}
Tensor dst = out.Get<Tensor>();
EXPECT_TRUE(dst.layout() != src->layout());
EXPECT_TRUE(dst.layout() == DataLayout::kNCHW);
EXPECT_TRUE(dst.dims() == make_ddim({2, 2, 3, 1}));
{
auto kernel1 = GenFromBit({1, 0, 1, 0});
auto kernel2 = GenFromBit({1, 0, 0, 0});
auto pair0 = std::make_pair(kernel1, kernel2);
instance.Get(pair0)(ctx, pair0, out, &in);
}
EXPECT_TRUE(src->layout() == DataLayout::kNHWC);
EXPECT_TRUE(src->dims() == make_ddim({2, 3, 1, 2}));
}
TEST(DataTransform, DataType) {

@ -75,5 +75,10 @@ bool InitDevices(const std::vector<std::string> &devices) {
return true;
}
void InitGLOG(const std::string &prog_name) {
google::InitGoogleLogging(prog_name.c_str());
google::InstallFailureSignalHandler();
}
} // namespace framework
} // namespace paddle

@ -22,6 +22,8 @@ namespace framework {
void InitGflags(std::vector<std::string> &argv);
void InitGLOG(const std::string &prog_name);
bool InitDevices(const std::vector<std::string> &devices);
} // namespace framework

@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cctype>
namespace paddle {
namespace framework {
@ -41,6 +42,9 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) {
inline LibraryType StringToLibraryType(const char* ctype) {
std::string s(ctype);
for (size_t i = 0; i < s.size(); ++i) {
s[i] = toupper(s[i]);
}
if (s == std::string("PLAIN")) {
return LibraryType::kPlain;
} else if (s == std::string("MKLDNN")) {

@ -193,6 +193,9 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {
lod->empty() || lod->size() == lod_length.size(),
"The lod_length should has the same size with the appended lod.");
if (lod->empty()) {
for (size_t i = 0; i < lod_length.size(); ++i) {
lod->emplace_back(1, 0); // size = 1, value = 0;
}
*lod = LoD(lod_length.size(), std::vector<size_t>({0}));
}
for (size_t i = 0; i < lod->size(); ++i) {
@ -230,9 +233,10 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
}
void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
const platform::DeviceContext &dev_ctx) {
{
// the 1st field, unit32_t version for SelectedRows
// the 1st field, unit32_t version for LoDTensor
uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
@ -253,7 +257,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
}
}
// the 3st filed, Tensor
DeserializeFromStream(is, static_cast<Tensor *>(tensor));
DeserializeFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
}
std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
@ -266,10 +270,10 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
"Batch size should be divided by places size");
std::vector<LoDTensor> lods;
for (int place_idx = 0; place_idx < places.size(); ++place_idx) {
int begin = place_idx * dims()[0] / places.size();
int end = (place_idx + 1) * dims()[0] / places.size();
auto src = Slice(begin, end);
for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) {
size_t begin = place_idx * dims()[0] / places.size();
size_t end = (place_idx + 1) * dims()[0] / places.size();
auto src = Slice(static_cast<int>(begin), static_cast<int>(end));
LoDTensor dst;
dst.Resize(src.dims());

@ -215,7 +215,8 @@ void AppendLoD(LoD* lod, const LoD& lod_length);
*/
void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
const platform::DeviceContext& dev_ctx);
void DeserializeFromStream(std::istream& is, LoDTensor* tensor);
void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
const platform::DeviceContext& dev_ctx);
} // namespace framework
} // namespace paddle

@ -132,7 +132,7 @@ TEST_F(LoDTensorTester, SerializeAndDeserialize) {
std::ostringstream oss;
SerializeToStream(oss, lod_tensor_, cpu_ctx);
std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor);
DeserializeFromStream(iss, &dst_tensor, cpu_ctx);
float* dst_ptr = dst_tensor.mutable_data<float>(platform::CPUPlace());
for (int i = 0; i < kLodTensorSize; ++i) {
EXPECT_EQ(dst_ptr[i], i);

@ -64,7 +64,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarDesc::LOD_TENSOR,
"The %d-th output of Output(%s) must be LoDTensor.", j,
out);
out_var->SetLoDLevel(in_var->GetLodLevel());
out_var->SetLoDLevel(in_var->GetLoDLevel());
}
bool IsRuntime() const override;
@ -260,7 +260,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
void operator()(int v) const { attr_->set_i(v); }
void operator()(float v) const { attr_->set_f(v); }
void operator()(const std::string &v) const { attr_->set_s(v); }
void operator()(bool b) const { attr_->set_b(b); }
// Please refer to https://github.com/PaddlePaddle/Paddle/issues/7162
template <class T,
class = typename std::enable_if<std::is_same<bool, T>::value>::type>
void operator()(T b) const {
attr_->set_b(b);
}
void operator()(const std::vector<int> &v) const {
VectorToRepeated(v, attr_->mutable_ints());
@ -274,9 +280,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
void operator()(const std::vector<bool> &v) const {
VectorToRepeated(v, attr_->mutable_bools());
}
void operator()(proto::BlockDesc *desc) const {
attr_->set_block_idx(desc->idx());
}
void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
};

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save