@@ -0,0 +1,63 @@
# Prune

## Motivation

We want to support running inference, training, and checkpointing in one `ProgramDesc`. We implement a
`void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc` as
input and generates a pruned `ProgramDesc` as output.

## Challenge

Pruning needs to support both variables and operators as evaluation targets. Consider the following
situations:

```python
# Case 1: run the forward pass.
cost_np = session.run(target=cost)
# Case 2: run the backward pass.
opts_np, _ = session.run(target=[cost, opt])
# Case 3: run checkpointing.
_ = session.run(target=checkpoint)
```

## Solution

To support evaluation of operators, we add an `is_target` field to `OpDesc`:

```c++
message OpDesc {
  required string type = 3;
  repeated Var inputs = 1;
  repeated Var outputs = 2;
  repeated Attr attrs = 4;
  optional bool is_target = 5 [ default = false ];
};
```

To support evaluation of variables, we add a [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599).
For each variable in `target`, we insert a `fetch_op` into the `ProgramDesc` with that variable as the
`fetch_op`'s input. Then we also mark the `fetch_op` itself as a target, as sketched below.
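The following is a minimal sketch of that insertion using protobuf's generated C++ accessors. It assumes the ops live in a `repeated BlockDesc blocks` field (block 0) and that `Var` carries a `parameter` name plus an `arguments` list, as implied by `HasDependentVar` below; `AppendFetchOp`, the `"X"` slot name, and the `"fetch"` type string are illustrative, not the actual Paddle helpers.

```c++
// A minimal sketch, not the actual Paddle helper: append a fetch_op for
// `var_name` to the program and mark the new op as a target.
void AppendFetchOp(ProgramDesc* program, const std::string& var_name) {
  // Assumes a `repeated BlockDesc blocks` field with the ops in block 0.
  auto* fetch = program->mutable_blocks(0)->add_ops();
  fetch->set_type("fetch");
  auto* input = fetch->add_inputs();
  input->set_parameter("X");       // input slot name is an assumption
  input->add_arguments(var_name);  // the variable to fetch
  fetch->set_is_target(true);      // the fetch_op itself becomes a target
}
```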
### Algorithm

If an operator needs to be run, it must fall into one of the following cases:

1. It is a target.
2. Some other op depends on it, i.e., its output is another op's input.

The first case can be checked with `op_desc.is_target()`. The second case can be implemented as:

```c++
bool HasDependentVar(const OpDesc& op_desc, const std::set<std::string>& dependent_vars) {
  for (auto& var : op_desc.outputs()) {
    for (auto& argu : var.arguments()) {
      if (dependent_vars.count(argu) != 0) {
        return true;
      }
    }
  }
  return false;
}
```

Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc); a simplified sketch follows.
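A simplified sketch of that algorithm, under the same single-block assumption as above: scan the ops in reverse order, keep an op if it is a target or if a kept op consumes one of its outputs, and propagate dependencies through each kept op's inputs.

```c++
// Sketch only: assumes the protobuf-generated accessors and that all ops
// live in blocks(0), mirroring the linked prune.cc.
void Prune(const ProgramDesc* input, ProgramDesc* output) {
  const auto& ops = input->blocks(0).ops();
  std::set<std::string> dependent_vars;
  std::vector<bool> should_run(ops.size(), false);
  // Scan backwards so an op is visited after every op that consumes its
  // outputs.
  for (int i = ops.size() - 1; i >= 0; --i) {
    const auto& op_desc = ops.Get(i);
    if (op_desc.is_target() || HasDependentVar(op_desc, dependent_vars)) {
      should_run[i] = true;
      // Each input of a kept op becomes a dependency of earlier ops.
      for (const auto& var : op_desc.inputs()) {
        for (const auto& argu : var.arguments()) {
          dependent_vars.insert(argu);
        }
      }
    }
  }
  // Copy the program, keeping only the ops marked above.
  *output = *input;
  auto* out_ops = output->mutable_blocks(0)->mutable_ops();
  out_ops->Clear();
  for (int i = 0; i < ops.size(); ++i) {
    if (should_run[i]) {
      *out_ops->Add() = ops.Get(i);
    }
  }
}
```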
@@ -0,0 +1,100 @@
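# Word2vec-style 5-gram neural language model on the imikolov dataset,
# trained locally with PaddlePaddle v2 (set cluster_train = True for the
# two-trainer configuration below).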
import gzip
import math

import paddle.v2 as paddle

embsize = 32
hiddensize = 256
N = 5


def wordemb(inlayer):
    wordemb = paddle.layer.embedding(
        input=inlayer,
        size=embsize,
        param_attr=paddle.attr.Param(
            name="_proj",
            initial_std=0.001,
            learning_rate=1,
            l2_rate=0,
            sparse_update=True))
    return wordemb


def main():
    # for local training
    cluster_train = False

    if not cluster_train:
        paddle.init(use_gpu=False, trainer_count=1)
    else:
        paddle.init(
            use_gpu=False,
            trainer_count=2,
            port=7164,
            ports_num=1,
            ports_num_for_sparse=1,
            num_gradient_servers=1)
    word_dict = paddle.dataset.imikolov.build_dict()
    dict_size = len(word_dict)
    firstword = paddle.layer.data(
        name="firstw", type=paddle.data_type.integer_value(dict_size))
    secondword = paddle.layer.data(
        name="secondw", type=paddle.data_type.integer_value(dict_size))
    thirdword = paddle.layer.data(
        name="thirdw", type=paddle.data_type.integer_value(dict_size))
    fourthword = paddle.layer.data(
        name="fourthw", type=paddle.data_type.integer_value(dict_size))
    nextword = paddle.layer.data(
        name="fifthw", type=paddle.data_type.integer_value(dict_size))

    Efirst = wordemb(firstword)
    Esecond = wordemb(secondword)
    Ethird = wordemb(thirdword)
    Efourth = wordemb(fourthword)

    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
    hidden1 = paddle.layer.fc(input=contextemb,
                              size=hiddensize,
                              act=paddle.activation.Sigmoid(),
                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
                              bias_attr=paddle.attr.Param(learning_rate=2),
                              param_attr=paddle.attr.Param(
                                  initial_std=1. / math.sqrt(embsize * 8),
                                  learning_rate=1))
    predictword = paddle.layer.fc(input=hidden1,
                                  size=dict_size,
                                  bias_attr=paddle.attr.Param(learning_rate=2),
                                  act=paddle.activation.Softmax())

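    # Every 100 batches: snapshot parameters to a gzipped tar archive and
    # report test-set metrics.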
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
                               'w') as f:
                    trainer.save_parameter_to_tar(f)
                result = trainer.test(
                    paddle.batch(
                        paddle.dataset.imikolov.test(word_dict, N), 32))
                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    result.metrics)

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)

    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    trainer = paddle.trainer.SGD(cost,
                                 parameters,
                                 adagrad,
                                 is_local=not cluster_train)
    trainer.train(
        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
        num_passes=30,
        event_handler=event_handler)


if __name__ == '__main__':
    main()
@@ -0,0 +1,123 @@
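# Cluster-aware variant of the word2vec trainer: each MPI rank reads its own
# shard of the pre-split imikolov data, and run-time settings come from
# PADDLE_INIT_* environment variables.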
import math
import os
import paddle.v2 as paddle
import pickle

embsize = 32
hiddensize = 256
N = 5
cluster_train_file = "./train_data_dir/train/train.txt"
cluster_test_file = "./test_data_dir/test/test.txt"
node_id = os.getenv("OMPI_COMM_WORLD_RANK")
if not node_id:
    raise EnvironmentError("must provide OMPI_COMM_WORLD_RANK")


def wordemb(inlayer):
    wordemb = paddle.layer.embedding(
        input=inlayer,
        size=embsize,
        param_attr=paddle.attr.Param(
            name="_proj",
            initial_std=0.001,
            learning_rate=1,
            l2_rate=0,
            sparse_update=True))
    return wordemb


def cluster_reader_cluster(filename, node_id):
    def cluster_reader():
        # Each node reads only its own shard, e.g. train.txt-00000.
        with open("-".join([filename, "%05d" % int(node_id)]), "r") as f:
            for l in f:
                csv_data = [int(cell) for cell in l.split(",")]
                yield tuple(csv_data)

    return cluster_reader


def main():
    # get run-time arguments from the environment; for local training,
    # leave PADDLE_CLUSTER_TRAIN unset or false
    TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"]
    cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH
    # parse the flag: the raw environment string would always be truthy
    use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False") in TRUTH

    if not cluster_train:
        paddle.init(
            use_gpu=use_gpu,
            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")))
    else:
        paddle.init(
            use_gpu=use_gpu,
            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")),
            port=int(os.getenv("PADDLE_INIT_PORT", "7164")),
            ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")),
            ports_num_for_sparse=int(
                os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")),
            num_gradient_servers=int(
                os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")),
            trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")),
            pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1"))
    fn = open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r")
    word_dict = pickle.load(fn)
    fn.close()
    dict_size = len(word_dict)
    firstword = paddle.layer.data(
        name="firstw", type=paddle.data_type.integer_value(dict_size))
    secondword = paddle.layer.data(
        name="secondw", type=paddle.data_type.integer_value(dict_size))
    thirdword = paddle.layer.data(
        name="thirdw", type=paddle.data_type.integer_value(dict_size))
    fourthword = paddle.layer.data(
        name="fourthw", type=paddle.data_type.integer_value(dict_size))
    nextword = paddle.layer.data(
        name="fifthw", type=paddle.data_type.integer_value(dict_size))

    Efirst = wordemb(firstword)
    Esecond = wordemb(secondword)
    Ethird = wordemb(thirdword)
    Efourth = wordemb(fourthword)

    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
    hidden1 = paddle.layer.fc(input=contextemb,
                              size=hiddensize,
                              act=paddle.activation.Sigmoid(),
                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
                              bias_attr=paddle.attr.Param(learning_rate=2),
                              param_attr=paddle.attr.Param(
                                  initial_std=1. / math.sqrt(embsize * 8),
                                  learning_rate=1))
    predictword = paddle.layer.fc(input=hidden1,
                                  size=dict_size,
                                  bias_attr=paddle.attr.Param(learning_rate=2),
                                  act=paddle.activation.Softmax())

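    # Every 100 batches: report metrics on this node's test shard.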
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                result = trainer.test(
                    paddle.batch(
                        cluster_reader_cluster(cluster_test_file, node_id), 32))
                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    result.metrics)

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    trainer = paddle.trainer.SGD(cost,
                                 parameters,
                                 adagrad,
                                 is_local=not cluster_train)
    trainer.train(
        paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32),
        num_passes=30,
        event_handler=event_handler)


if __name__ == '__main__':
    main()
@@ -0,0 +1,41 @@
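# Data preparation: build the imikolov word dict, pickle it, and split the
# train/test sets into SPLIT_COUNT numbered shards for the cluster trainers.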
import paddle.v2 as paddle
import os
import pickle

SPLIT_COUNT = 3
N = 5


def file_len(fd):
    # Count lines; returns 0 for an empty file.
    i = -1
    for i, l in enumerate(fd):
        pass
    return i + 1

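# Dump every sample from `reader` as one CSV line, then shard the file into
# split_count pieces (filename-00000, filename-00001, ...) via POSIX `split`.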
def split_from_reader_by_line(filename, reader, split_count):
    fn = open(filename, "w")
    for batch_id, batch_data in enumerate(reader()):
        batch_data_str = [str(d) for d in batch_data]
        fn.write(",".join(batch_data_str))
        fn.write("\n")
    fn.close()

    fn = open(filename, "r")
    total_line_count = file_len(fn)
    fn.close()
    per_file_lines = total_line_count / split_count + 1
    cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename)
    os.system(cmd)


word_dict = paddle.dataset.imikolov.build_dict()
with open("word_dict.pickle", "w") as dict_f:
    pickle.dump(word_dict, dict_f)

split_from_reader_by_line("train.txt",
                          paddle.dataset.imikolov.train(word_dict, N),
                          SPLIT_COUNT)
split_from_reader_by_line("test.txt",
                          paddle.dataset.imikolov.test(word_dict, N),
                          SPLIT_COUNT)