Merge branch 'develop' of github.com:PaddlePaddle/Paddle into prefech_prog_on_server

8 years ago · abfd9fe798
parent 1842758117 d139f2ca4e
commit abfd9fe798
180 changed files with 4040 additions and 696 deletions
--- a/benchmark/cluster/README.md
+++ b/benchmark/cluster/README.md
@ -36,11 +36,41 @@
 - Trainer Count: 100
 - Metrics: mini-batch / sec

-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
+
+<table>
+<thead>
+<tr>
+<th>Batch Size </th>
+<th> 32</th>
+<th>64</th>
+<th>128 </th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td>-</td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+</tbody>
+</table>

 ### Measure the Performance for Different PServer Count

@ -48,11 +78,41 @@
 - Batch Size: 64
 - Metrics: mini-batch / sec

-| PServer Count | 10 | 20 | 40 | 60 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
+
+<table>
+<thead>
+<tr>
+<th>PServer Count  </th>
+<th>10</th>
+<th>20</th>
+<th>40 </th>
+<th>60</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td>-</td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+</tbody>
+</table>

 ### Measure Parallel Efficiency By Increasing Trainer Count

@ -67,11 +127,69 @@ The parallel efficiency is:

 $E = \frac{S}{N}$

-| Trainer Counter | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
-| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | - |
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>Trainer Count  </th>
+<th>1</th>
+<th>10</th>
+<th>20 </th>
+<th>30</th>
+<th>40</th>
+<th>50</th>
+<th>60 </th>
+<th>70</th>
+<th>80</th>
+<th>90</th>
+<th>100 </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+</tr>
+</tbody>
+</table>
+

 ## Reproduce the benchmark

--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@ -16,11 +16,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.

 - Metrics: samples / sec

-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
+<table>
+<thead>
+<tr>
+<th>Batch Size </th>
+<th> 32</th>
+<th>64</th>
+<th>128 </th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td> 15.44 </td>
+<td> 16.32 </td>
+<td> 16.74 </td>
+<td> 16.79 </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td> 15.97 </td>
+<td> 17.04 </td>
+<td> 17.60 </td>
+<td> 17.83 </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td> 9.09 </td>
+<td> 9.10 </td>
+<td> 9.24 </td>
+<td> 8.66 </td>
+</tr>
+</tbody>
+</table>
+

 ### Different Batch Size

@ -28,12 +58,40 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Trainer Count: 20
 - Metrics: samples / sec

-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
-| TensorFlow | - | - | - | - |
-
+<table>
+<thead>
+<tr>
+<th>Batch Size </th>
+<th> 32</th>
+<th>64</th>
+<th>128 </th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td> 190.20 </td>
+<td> 222.15 </td>
+<td> 247.40 </td>
+<td> 258.18 </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td> 170.96 </td>
+<td> 233.71 </td>
+<td> 256.14 </td>
+<td> 329.23 </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+</tr>
+</tbody>
+</table>

 ### Accelerate Rate

@ -41,11 +99,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Batch Size: 128
 - Metrics: samples / sec

-| Trainer Count | 20 | 40 | 80 | 100 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
-| TensorFlow | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>Trainer Count </th>
+<th>20</th>
+<th>40</th>
+<th>80</th>
+<th>100</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td> 263.29 (78.64%) </td>
+<td> 518.80 (77.47%) </td>
+<td> 836.26 (62.44%) </td>
+<td> 1019.29 (60.89%) </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2 (need more tests)   </td>
+<td> 326.85 (92.85%) </td>
+<td> 534.58 (75.93%) </td>
+<td> 853.30 (60.60%) </td>
+<td> 1041.99 (59.20%) </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+</tr>
+</tbody>
+</table>
+

 ### Different Pserver Count

@ -53,11 +141,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Batch Size: 128
 - Metrics: samples / sec

-| PServer Count | 3 | 6 |10 | 20 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
-| TensorFlow | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>PServer Count </th>
+<th>3</th>
+<th>6</th>
+<th>10</th>
+<th>20</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid(should fix in next PR) </td>
+<td> 589.1 </td>
+<td> 592.6 </td>
+<td> 656.4 </td>
+<td> 655.8 </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2 </td>
+<td> 593.4 </td>
+<td> 791.3 </td>
+<td> 729.7 </td>
+<td> 821.7 </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+</tr>
+</tbody>
+</table>
+

 *The performance gap between Fluid and v2 comes from the network interference.*

--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@ -494,6 +494,12 @@ reshape
 ..  autofunction:: paddle.fluid.layers.reshape
    :noindex:

+pad
+---
+
+..  autofunction:: paddle.fluid.layers.pad
+    :noindex:
+
 scale
 -----

--- a/doc/fluid/design/algorithm/parameter_average.md
+++ b/doc/fluid/design/algorithm/parameter_average.md
@ -5,9 +5,11 @@ In a large scale machine learning setup where the size of the training data is h

 Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.

-Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for <img src="./images/theta_star.gif"/><br/> . The averaging is done as follows:
+Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/theta_star.gif"/><br/> . The averaging is done as follows:

-<img src="./images/asgd.gif" align="center"/><br/>
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/asgd.gif"><br />
+</p>

 We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.

--- a/doc/fluid/design/concepts/README.md
+++ b/doc/fluid/design/concepts/README.md
@ -6,11 +6,33 @@ Here are some initial thoughts. Your comments are welcome!

 I think we need only the following few CMake functions to make a project description mean and clean:

-| C++ | CUDA C++ | Go |
-|---|---|---|
-| cc_library | nv_library | go_library |
-| cc_binary | nv_binary | go_binary |
-| cc_test | nv_test | go_test |
+<table>
+<thead>
+<tr>
+<th>C++</th>
+<th>CUDA C++</th>
+<th>Go</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>cc_library </td>
+<td>nv_library </td>
+<td>go_library </td>
+</tr>
+<tr>
+<td>cc_binary </td>
+<td>nv_binary </td>
+<td>go_binary </td>
+</tr>
+<tr>
+<td> cc_test </td>
+<td> nv_test </td>
+<td> go_test </td>
+</tr>
+</tbody>
+</table>
+

 - The `_library` functions generate  .a files from source code.
 - The `_binary` functions generate executable binary files.
--- a/doc/fluid/design/concepts/block.md
+++ b/doc/fluid/design/concepts/block.md
@ -14,11 +14,29 @@ In programming languages, a block is a pair of curly braces that includes local

 Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:

-| programming languages | PaddlePaddle          |
-|-----------------------|-----------------------|
-| for, while loop       | RNN, WhileOp          |
-| if, if-else, switch   | IfElseOp, SwitchOp    |
-| sequential execution  | a sequence of layers  |
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>for, while loop </td>
+<td>RNN, WhileOp </td>
+</tr>
+<tr>
+<td>if, if-else, switch </td>
+<td>IfElseOp, SwitchOp </td>
+</tr>
+<tr>
+<td>sequential execution </td>
+<td>a sequence of layers </td>
+</tr>
+</tbody>
+</table>
+

 A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.

@ -26,12 +44,33 @@ A key difference is that a C++ program describes a one pass computation, whereas

 The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:

-| programming languages | PaddlePaddle                    |
-|-----------------------|---------------------------------|
-| stack                 | scope hierarchy                 |
-| stack frame           | scope                           |
-| push at entering block| push at entering block          |
-| pop at leaving block  | destroy when minibatch completes|
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>stack </td>
+<td>scope hierarchy </td>
+</tr>
+<tr>
+<td>stack frame  </td>
+<td>scope </td>
+</tr>
+<tr>
+<td>push at entering block </td>
+<td>push at entering block </td>
+</tr>
+<tr>
+<td>pop at leaving block </td>
+<td>destroy when minibatch completes </td>
+</tr>
+</tbody>
+</table>
+

 1. In traditional programs:

--- a/doc/fluid/design/concepts/functions_operators_layers.md
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@ -86,12 +86,40 @@ def layer.fc(X):

 We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`.  So we have the following concepts in above illustrative example:

-
-| C++ functions/functors | mul          | add          |             |          |
-|------------------------|--------------|--------------|-------------|----------|
-| C++ operator class     | mulOp        | addOp        | FCOp        |          |
-| Python binding         | operator.mul | operator.add | operator.fc |          |
-| Python function        |              |              |             | layer.fc |
+<table>
+<thead>
+<tr>
+<th>C++ functions/functors</th>
+<th>mul</th>
+<th>add</th>
+<th></th>
+<th></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>C++ operator class </td>
+<td>mulOp</td>
+<td>addOp </td>
+<td>FCOp </td>
+<td></td>
+</tr>
+<tr>
+<td>Python binding  </td>
+<td>operator.mul</td>
+<td> operator.add </td>
+<td>operator.fc </td>
+<td></td>
+</tr>
+<tr>
+<td>Python function   </td>
+<td></td>
+<td></td>
+<td> </td>
+<td>layer.fc</td>
+</tr>
+</tbody>
+</table>


 This is how we differentiate layer and operators in PaddlePaddle:
--- a/doc/fluid/design/concepts/lod_tensor.md
+++ b/doc/fluid/design/concepts/lod_tensor.md
@ -2,12 +2,38 @@

 Like other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represents a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus no need for padding zeros.

-|                       | TensorFlow | PaddlePaddle |
-|-----------------------|------------|--------------|
-| RNN                   | Support    | Support      |
-| recursive RNN         | Support    | Support      |
-| padding zeros         | Must       | No need      |
-| blob data type        | Tensor     | LoDTensor    |
+<table>
+<thead>
+<tr>
+<th></th>
+<th>TensorFlow</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>RNN </td>
+<td>Support </td>
+<td>Support </td>
+</tr>
+<tr>
+<td>recursive RNN </td>
+<td>Support </td>
+<td>Support </td>
+</tr>
+<tr>
+<td>padding zeros </td>
+<td> Must </td>
+<td>No need </td>
+</tr>
+<tr>
+<td> blob data type </td>
+<td> Tensor</td>
+<td> LoDTensor </td>
+</tr>
+</tbody>
+</table>
+

 PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators.  The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences.  This document presents the design of LoD and LoDTensor.

--- a/doc/fluid/design/concepts/var_desc.md
+++ b/doc/fluid/design/concepts/var_desc.md
@ -10,10 +10,27 @@ PaddlePaddle uses proto message to describe compile time program because :

 The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`)  and  `Operations`. The concept to represent them is in the table below.

-| |compile time|runtime|
-|---|---|---|
-|Data|VarDesc(proto)|Variable(cpp)|
-|Operation|OpDesc(proto)|Operator(cpp)|
+<table>
+<thead>
+<tr>
+<th></th>
+<th>compile time</th>
+<th>runtime</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data </td>
+<td>VarDesc(proto) </td>
+<td>Variable(cpp) </td>
+</tr>
+<tr>
+<td>Operation </td>
+<td>OpDesc(proto) </td>
+<td>Operator(cpp) </td>
+</tr>
+</tbody>
+</table>


 ## Definition of VarType
--- a/doc/fluid/design/concurrent/channel.md
+++ b/doc/fluid/design/concurrent/channel.md
@ -2,7 +2,7 @@

 ## Introduction

-A Channel is a data structure that allows for synchronous interprocess 
+A Channel is a data structure that allows for synchronous interprocess
 communication via message passing.  It is a fundamental component of CSP
 (communicating sequential processes), and allows for users to pass data
 between threads without having to worry about synchronization.
@ -18,7 +18,7 @@ Creates a new channel that takes in variables of a specific dtype.

 - **fluid.make_channel(dtype, capacity=0)**
  - **dtype**: The data type of variables being sent/received through channel
-  - **capacity**: The capacity of the channel.  A capacity of 0 represents 
+  - **capacity**: The capacity of the channel.  A capacity of 0 represents
    an unbuffered channel.  Capacity > 0 represents a buffered channel

 ```
@ -40,8 +40,8 @@ fluid.channel_close(ch)

 ### Send data to a channel

-Sends a variable to a channel.  Currently, variables of dtype `LoDTensor`, 
-`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and 
+Sends a variable to a channel.  Currently, variables of dtype `LoDTensor`,
+`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
 `ChannelHolder` are supported.

 By default, the data of the Variable is moved from the sender to the receiver,
@ -52,7 +52,7 @@ however the user can optionally copy the data before performing the send.
  - **variable**: The variable to send to the channel
  - **is_copy**: If set to True, channel_send will perform a variable assign
  to copy the source variable to a new variable to be sent.
-  
+
 ```
 ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
 var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=100)
@ -68,7 +68,7 @@ receiving variable.
  - **channel**: The channel to receive the variable from
  - **return_variable**: The destination variable used to store the data of the
  variable received from the channel
-  
+
 ```
 ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
 var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=-1)
@ -84,9 +84,9 @@ internal queues, locks, and conditional variables.
 ### QueueMessage

 QueueMessage encapsulates the state of the channel send/receive operation to be
-put in the **sendq/recvq**.  It contains a condition variable used to lock the 
+put in the **sendq/recvq**.  It contains a condition variable used to lock the
 thread (when there are no available sends/receives).  In addition, it contains
-a callback function to notify a thread when the QueueMessage is being 
+a callback function to notify a thread when the QueueMessage is being
 processed by the channel.

 ### Queues
@ -108,21 +108,21 @@ channel_recv operation will put a new QueueMessage on the recvq and block the
 current thread under two conditions:
  1. The channel is buffered and there is no data on the buff_
  2. The channel is unbuffered and does not have a sender
-  
+
 ### State diagram

 #### Channel Send

 <p align="center">
-<img src="./images/channel_send.png"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/channel_send.png"/><br/>
 </p>
-  
+
 #### Channel Receive

 <p align="center">
-<img src="./images/channel_recv.png"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/channel_recv.png"/><br/>
 </p>
-  
+
 ## Limitations and Considerations

 ### Variable Copy
@ -135,5 +135,5 @@ be sent before it is sent.

 Please note that this is achieved by adding an **assign** operator and creating
 a temporary variable that is sent in place of the original variable.  Please
-note that **assign** operator has limited support for only certain variables 
+note that **assign** operator has limited support for only certain variables
 datatypes.
--- a/doc/fluid/design/concurrent/concurrent_programming.md
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@ -10,12 +10,42 @@ The answer relies on the fact that a `ProgramDesc` is similar to an abstract syn

 The following table compares concepts in Fluid and Go

-| Go | Fluid |
-|----|-------|
-|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) |
-| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
-| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
-| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
+<table>
+<thead>
+<tr>
+<th>Go</th>
+<th>
+Fluid</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>user-defined functions </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid">layers</a>
+</td>
+</tr>
+<tr>
+<td>control-flow and built-in functions </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators">intrinsics/operators</a>
+</td>
+</tr>
+<tr>
+<td>goroutines, channels </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h">class ThreadPool</a>
+</td>
+</tr>
+<tr>
+<td>runtime </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h">class Executor</a>
+</td>
+</tr>
+</tbody>
+</table>
+

 ## An Example Concurrent Program

@ -77,11 +107,11 @@ message ProgramDesc {
      read(output = X)
      kube_get_workers_addrs(output = L)
      Y = tensor_array(len(L))
-      parallel_for(input = X, output = Y, 
+      parallel_for(input = X, output = Y,
                   attrs = {L, block_id(1)}) # referring to block 1
    ]
  }
-  
+
  block[1] = Block {
    parent = 0,
    vars = [x, y, index],
@ -102,7 +132,7 @@ func main() {  //// block 0
  X = fluid.read(...)
  L = fluid.k8s.get_worker_addrs()
  Y = fluid.tensor_array(len(L))
-  fluid.parallel_for(X, L, 
+  fluid.parallel_for(X, L,
                     func(index int) {  //// block 1
                       x = X[index]
                       fluid.send(L[index], x)
@ -116,7 +146,7 @@ An explanation of the above program:

 - `fluid.k8s` is a package that provides access to Kubernetes API.  
 - `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).  
- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed, 
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,

  1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
  2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread  
--- a/doc/fluid/design/concurrent/csp.md
+++ b/doc/fluid/design/concurrent/csp.md
@ -13,14 +13,41 @@ Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously exe

 There were many concurrent programming models, implemented in various forms:

-| concurrent programming model | implementation |
-|-----|-----|
-| mutex | types and functions in standard libraries |
-| semaphore | types and functions in standard libraries |
-| communicating sequential processes (CSP) | Go programming language |
-| actor model | Erlang programming language |
-| message passing | MPI |
-| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
+<table>
+<thead>
+<tr>
+<th>concurrent programming model</th>
+<th>implementation</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>mutex </td>
+<td>types and functions in standard libraries </td>
+</tr>
+<tr>
+<td>semaphore </td>
+<td> types and functions in standard libraries </td>
+</tr>
+<tr>
+<td> communicating sequential processes (CSP)  </td>
+<td> Go programming language </td>
+</tr>
+<tr>
+<td> actor model  </td>
+<td> Erlang programming language </td>
+</tr>
+<tr>
+<td> message passing  </td>
+<td> MPI </td>
+</tr>
+<tr>
+<td> bulk synchronous parallel (BSP)   </td>
+<td> Pregel distributed programming framework </td>
+</tr>
+</tbody>
+</table>
+

 Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.

@ -118,9 +145,9 @@ There are four types of actions with a channel:
   ```go
   close(ch)
   ```
-   
+
   Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
-   
+
 There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):

 1. A send to a nil channel blocks forever
--- a/doc/fluid/design/concurrent/select_op.md
+++ b/doc/fluid/design/concurrent/select_op.md
@ -2,13 +2,13 @@

 ## Introduction

-In golang, the [**select**](https://golang.org/ref/spec#Select_statements) 
-statement lets a goroutine wait on multiple communication operations at the 
-same time. The **select** blocks until one of its cases can run, then 
-executes the case.  If multiple cases are ready to run, then one case is 
+In golang, the [**select**](https://golang.org/ref/spec#Select_statements)
+statement lets a goroutine wait on multiple communication operations at the
+same time. The **select** blocks until one of its cases can run, then
+executes the case.  If multiple cases are ready to run, then one case is
 chosen at random to be executed.

-With the introduction of CSP for Paddle, we mimic this behavior by 
+With the introduction of CSP for Paddle, we mimic this behavior by
 creating a ***select_op***.

 ## How to use it
@ -17,11 +17,11 @@ The **select_op** is available as a c++ operator.  However most users
 will prefer to use the much simpler Python API.

 - **fluid.Select()**: Creates a select operator and adds it to the current
-block within the main program.  Also creates a sub block and adds it to the 
-main program.  This sub block is used to hold all variables and operators 
+block within the main program.  Also creates a sub block and adds it to the
+main program.  This sub block is used to hold all variables and operators
 used by the case statements.
- 
-Within the select block, users can add cases by 
+
+Within the select block, users can add cases by
 calling **select.case** or **select.default** method.

 - **fluid.Select.case(channel_action, channel, result_variable)**: Represents
@ -37,13 +37,13 @@ execute.
 ```
 ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
 quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-            
+
 x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
 y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
- 
+
 while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
 while_op = While(cond=while_cond)    
- 
+
 with while_op.block():
    with fluid.Select() as select:
        with select.case(fluid.channel_send, channel, x):
@ -99,17 +99,17 @@ blocks {
    }
  }
  // Create "select" operator.
-  // inputs: 
+  // inputs:
  //   X: All input variables used by operators within the select block
  //   case_to_execute: Variable filled in by select_op when it determines
  //     which case to execute.
  //  
  // outputs:
-  //   Out: All output variables referenced by operators within select block. 
-  // 
+  //   Out: All output variables referenced by operators within select block.
+  //
  // attrs:
  //   sub_block: The block id containing the select "cases"
-  //   cases:  Serialized list of all cases in the select op. 
+  //   cases:  Serialized list of all cases in the select op.
  //     Each case is serialized as: '<index>,<type>,<channel>,<value>'
  //     where type is 0 for default, 1 for send, and 2 for receive.
  //     No channel and values are needed for default cases.
@ -150,7 +150,7 @@ into **X**.  It will also create a temp variable called **case_to_execute**.  Th
 filled in by the select_op after it has completed processing the case statements.

 If there are no available cases to execute (ie: all cases are blocked on channel operations, and
-there is no default statement), then the select_op will block the current thread.  The thread will 
+there is no default statement), then the select_op will block the current thread.  The thread will
 unblock once there is a channel operation affecting one of the case statements, at which point, the
 **select_op** will set the **case_to_execute** variable to the index of the case to execute.

@ -247,17 +247,17 @@ blocks {

 ```

-Cases are represented by a **conditional_block operator**, whose's condition is set as the output of 
-equal(**case_to_execute**, **case_index**).  Since each case index is unique in this sub-block, 
+Cases are represented by a **conditional_block operator**, whose condition is set as the output of
+equal(**case_to_execute**, **case_index**).  Since each case index is unique in this sub-block,
 only one case will be executed.

 ### select_op flow

 <p align="center">
-<img src="./images/select_op_workflow.png"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/select_op_workflow.png"/><br/>
 </p>

-The select algorithm is inspired by golang's select routine.  Please refer to 
+The select algorithm is inspired by golang's select routine.  Please refer to
 http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information.

 ## Backward Pass
--- a/doc/fluid/design/dist_train/distributed_architecture.md
+++ b/doc/fluid/design/dist_train/distributed_architecture.md
@ -40,11 +40,11 @@ computation is only specified in Python code which sits outside of PaddlePaddle,

 Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:

-<img src="src/compiler.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/compiler.png"/>

 PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:

-<img src="src/paddle-compile.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/paddle-compile.png"/>

 The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.

@ -60,7 +60,7 @@ For a detailed explanation, refer to this document -

 The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:

-<img src="src/distributed_architecture.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/distributed_architecture.png"/>

 The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.

@ -152,7 +152,7 @@ for data in train_reader():
 `JobDesc` object describe the distributed job resource specification to run on
 Cluster environment.

-<img src="src/remote_executor.png" width="500" align="center" />
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/remote_executor.png" width="500" align="center" />

 `RemoteExecutor.run` sends the `ProgramDesc` and
 [TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
@ -171,7 +171,7 @@ In the future, a more general placement algorithm should be implemented, which m

 The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:

-<img src="src/local_architecture.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/local_architecture.png"/>


 ### Training Data
--- a/doc/fluid/design/dist_train/multi_cpu.md
+++ b/doc/fluid/design/dist_train/multi_cpu.md
@ -8,11 +8,11 @@ Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph.

 ## Transpiler

-<img src="src/multi-threads/single-thread@3x.png" width="300">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/single-thread@3x.png" width="300">

 After converted:

-<img src="src/multi-threads/multi-threads@3x.png" width="1000">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/multi-threads@3x.png" width="1000">

 ## Implement

--- a/doc/fluid/design/dist_train/parameter_server.md
+++ b/doc/fluid/design/dist_train/parameter_server.md
@ -41,11 +41,11 @@ We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
 Below is an example of converting the user defined graph to the
 subgraphs for the trainer and the parameter server:

-<img src="src/local-graph.png" width="300"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/local-graph.png" width="300"/>

 After converting:

-<img src="src/dist-graph.png" width="700"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/dist-graph.png" width="700"/>

 1. The parameter variable W and its optimizer program are placed on the parameter server.
 1. Operators are added to the program.
@ -69,8 +69,7 @@ In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list o
 non-zero gradient data. So when we do parameter optimization both locally and remotely,
 we only need to send those non-zero rows to the optimizer operators:

-<img src="src/sparse_update.png" width="700" />
-
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/sparse_update.png" width="700" />
 ### Benefits

 - Model parallelism becomes easier to implement: it is an extension to
--- a/doc/fluid/design/dynamic_rnn/rnn.md
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@ -5,7 +5,7 @@ This document describes the RNN (Recurrent Neural Network) operator and how it i
 ## RNN Algorithm Implementation

 <p align="center">
-<img src="./rnn.jpg"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.jpg"/>
 </p>

 The above diagram shows an RNN unrolled into a full network.
@ -22,7 +22,7 @@ There are several important concepts here:
 There could be local variables defined in each step-net.  PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.

 <p align="center">
-<img src="./rnn.png"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.png"/><br/>
 Figure 2 illustrates the RNN's data flow
 </p>

@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to par
 The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.

 <p align="center">
-<img src="./2_level_rnn.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/2_level_rnn.png"/>
 </p>

 ```python
@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time st


 <p align="center">
-<img src="./rnn_2level_data.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn_2level_data.png"/>
 </p>
--- a/doc/fluid/design/modules/batch_norm_op.md
+++ b/doc/fluid/design/modules/batch_norm_op.md
@ -2,7 +2,7 @@

 ## What is batch normalization

-Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and make the data distribution easier for next layer's training. 
+Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and make the data distribution easier for next layer's training.

 The principle of batch normalization can be summarized into a simple function:

@ -66,7 +66,7 @@ As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attribu

 The following graph showes the training computational process of `batch_norm_op`:

-<img src="../images/batch_norm_op_kernel.png" width="800"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/batch_norm_op_kernel.png" width="800"/>

 cudnn provides APIs to finish the whole series of computation, we can use them in our GPU kernel.

@ -74,13 +74,13 @@ cudnn provides APIs to finish the whole series of computation, we can use them i

 `batch_norm_op` is warpped as a layer in Python:

-```python 
-def batch_norm_layer(net, 
+```python
+def batch_norm_layer(net,
                     input,
-                     output, 
-                     scale, 
-                     bias, 
-                     use_global_est = False, 
+                     output,
+                     scale,
+                     bias,
+                     use_global_est = False,
                     epsilon = 1e-6,
                     momentum = 0.99):
 	mean_cache = scope.new_var(name = 'estimated_mean', trainable = False)
@ -119,15 +119,15 @@ for pass_id in range(PASS_NUM):
    if pass_id % 100 == 0:
        net.infer(test_image)    # run inferencing model
    # ...
-``` 
+```

 `is_infer` is an attribute. Once an operator is created, its attributes can not be changed. It suggests us that we shall maintain two `batch_norm_op` in the model, one's `is_infer` is `True`(we call it `infer_batch_norm_op`) and the other one's is `False`(we call it `train_batch_norm_op`). They share all parameters and variables, but be placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one go through `train_batch_norm_op` and the other one go through `infer_batch_norm_op`:

 <div align=center>
-<img src="../images/batch_norm_fork.png" width="500"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/batch_norm_fork.png" width="500"/>
 </div>

-Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate. 
+Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate.

 When the net runs in training mode, the end of the left branch will be set as the running target, so the dependency tracking process will ignore right branch automatically. When the net runs in inferencing mode, the process is reversed.

--- a/doc/fluid/design/modules/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
@ -2,12 +2,33 @@

 Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.

-| Python classes | Protobuf messages |
-| --- | --- |
-| Program | ProgramDesc |
-| Block | BlockDesc |
-| Operator | OpDesc |
-| Variable | VarDesc |
+<table>
+<thead>
+<tr>
+<th>Python classes</th>
+<th>Protobuf messages</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Program </td>
+<td>ProgramDesc </td>
+</tr>
+<tr>
+<td>Block  </td>
+<td>BlockDesc </td>
+</tr>
+<tr>
+<td>Operator </td>
+<td>OpDesc </td>
+</tr>
+<tr>
+<td>Variable </td>
+<td>VarDesc </td>
+</tr>
+</tbody>
+</table>
+

 Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.

--- a/doc/fluid/design/modules/regularization.md
+++ b/doc/fluid/design/modules/regularization.md
@ -6,23 +6,23 @@ A central problem in machine learning is how to design an algorithm that will pe
 ### Parameter Norm Penalties
 Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:

-<img src="./images/loss_equation.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/loss_equation.png" align="center"/><br/>

 The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.

 The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:

 ##### L2 Regularization:
-<img src="./images/l2_regularization.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/l2_regularization.png" align="center"/><br/>

 ##### L1 Regularization
-<img src="./images/l1_regularization.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/l1_regularization.png" align="center"/><br/>

 A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).

 ## Regularization Survey

-A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey). 
+A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey).

 ## Proposal for Regularization in PaddlePaddle

@ -32,41 +32,35 @@ In the new design, we propose to create new operations for regularization. For n
 - L2_regularization_op
 - L1_regularization_op

-These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties. 
+These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.

-The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. 
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.

 ### Computation Graph

 Below is an example of a really simple feed forward neural network.

-<img src="./images/feed_forward.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/feed_forward.png" align="center"/><br/>

 The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:

-<img src="./images/feed_forward_regularized.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/feed_forward_regularized.png" align="center"/><br/>
    
 ### Python API implementation for Regularization

-Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. 
+Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.

 #### Creation of Regularization ops
 There are two possibilities for creating the regularization ops:
-1. We create these ops immediately while building the computation graph. 
-2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added. 
+1. We create these ops immediately while building the computation graph.
+2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added.

-The proposal is to add these ops in a lazy manner just before the backward pass. 
+The proposal is to add these ops in a lazy manner just before the backward pass.

 #### Storage of Regularization attributes

-Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters. 
+Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters.

 #### High-level API

 In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
-
-
-
-
-
-    
--- a/doc/fluid/design/motivation/fluid.md
+++ b/doc/fluid/design/motivation/fluid.md
@ -10,11 +10,37 @@ Fluid is the answer.  Fluid is similar to PyTorch and TensorFlow Eager Execution

 Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.

-| Existed since | model as sequence of layers | model as graph of operators | No model |
-|--|--|--|--|
-| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
-| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
-| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+<table>
+<thead>
+<tr>
+<th>Existed since</th>
+<th>model as sequence of layers</th>
+<th>model as graph of operators</th>
+<th>No model</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>2013 </td>
+<td>Caffe, Theano, Torch, PaddlePaddle </td>
+<td> </td>
+<td> </td>
+</tr>
+<tr>
+<td>2015 </td>
+<td> </td>
+<td>TensorFlow, MxNet, Caffe2, ONNX, n-graph </td>
+<td> </td>
+</tr>
+<tr>
+<td>2016 </td>
+<td> </td>
+<td>   </td>
+<td> PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid</td>
+</tr>
+</tbody>
+</table>
+

 From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model.  To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.

--- a/doc/fluid/design/motivation/refactorization.md
+++ b/doc/fluid/design/motivation/refactorization.md
@ -36,11 +36,37 @@ At compile time, the Python program generates a protobuf message representation

 At runtime, the C++ program realizes the graph and runs it.

-| | Representation (protobuf messages) | Realization (C++ class objects) |
-|---|---|---|
-|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
-|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
-|Block|BlockDesc|Block|
+<table>
+<thead>
+<tr>
+<th></th>
+<th>Representation (protobuf messages)</th>
+<th>Realization (C++ class objects) </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data</td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107">VarDesc</a></td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24">Variable</a></td>
+</tr>
+<tr>
+<td>Operation </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35">OpDesc</a></td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64">Operator</a></td>
+</tr>
+<tr>
+<td>Block </td>
+<td>BlockDesc </td>
+<td>Block </td>
+</tr>
+</tbody>
+</table>
+

 The word *graph* is interchangeable with *block* in this document.  A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`).

--- a/doc/fluid/design/network/deep_speech_2.md
+++ b/doc/fluid/design/network/deep_speech_2.md
@ -1,4 +1,4 @@
-# DeepSpeech2 on PaddlePaddle: Design Doc 
+# DeepSpeech2 on PaddlePaddle: Design Doc

 We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine,  on PaddlePaddle. For the first-stage plan, we have the following short-term goals:

@ -68,11 +68,33 @@ We roughly break down the project into 14 tasks:

 Tasks parallelizable within phases:

-Roadmap     | Description                               | Parallelizable Tasks 
----------- | :------------------------------------     | :--------------------
-Phase I	    | Simplified model & components             | *Task 1* ~ *Task 8*
-Phase II    | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
-Phase III   | Documentations                            | *Task13* ~ *Task14*
+<table>
+<thead>
+<tr>
+<th>Roadmap</th>
+<th>Description</th>
+<th> Parallelizable Tasks</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Phase I </td>
+<td>Simplified model & components </td>
+<td>Task 1 ~ Task 8</td>
+</tr>
+<tr>
+<td>Phase II </td>
+<td> Standard model & benchmarking & profiling</td>
+<td>Task 9 ~ Task 12 </td>
+</tr>
+<tr>
+<td>Phase III </td>
+<td> Documentations</td>
+<td> Task13 ~ Task14 </td>
+</tr>
+</tbody>
+</table>
+

 Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!

@ -94,7 +116,7 @@ The classical DS2 network contains 15 layers (from bottom to top):
 - **One** CTC-loss layer

 <div align="center">
-<img src="images/ds2_network.png" width=350><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ds2_network.png" width=350><br/>
 Figure 1. Archetecture of Deep Speech 2 Network.
 </div>

@ -102,37 +124,82 @@ We don't have to persist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar

 Key ingredients about the layers:

- **Data Layers**: 
+- **Data Layers**:
   - Frame sequences data of audio **spectrogram** (with FFT).
-   - Token sequences data of **transcription** text (labels). 
+   - Token sequences data of **transcription** text (labels).
   - These two type of sequences do not have the same lengthes, thus a CTC-loss layer is required.
- **2D Convolution Layers**: 
+- **2D Convolution Layers**:
   - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
   - With striding for only the first convlution layer.
   - No pooling for all convolution layers.
- **Uni-directional RNNs** 
+- **Uni-directional RNNs**
 	- Uni-directional + row convolution: for low-latency inference.
 	- Bi-direcitional + without row convolution: if we don't care about the inference latency.
 - **Row convolution**:
 	- For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs.
-	- Not nessesary if with bi-direcitional RNNs. 
+	- Not necessary if with bi-directional RNNs.
 	- "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across.
 - **Batch Normalization Layers**:
   - Added to all above layers (except for data and loss layer).
   - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
- 
-
-Required Components                     | PaddlePaddle Support                      | Need to Develop
-:-------------------------------------  | :--------------------------------------   | :-----------------------
-Data Layer I (Spectrogram)	            | Not supported yet.                        |  TBD (Task 3)
-Data Layer II (Transcription)           | `paddle.data_type.integer_value_sequence` | -
-2D Convolution Layer                    | `paddle.layer.image_conv_layer`           | -
-DataType Converter (vec2seq)            | `paddle.layer.block_expand`               | -
-Bi-/Uni-directional RNNs                | `paddle.layer.recurrent_group`            | -
-Row Convolution Layer                   | Not supported yet.                        | TBD (Task 4)
-CTC-loss Layer                          | `paddle.layer.warp_ctc`                   | -
-Batch Normalization Layer               | `paddle.layer.batch_norm`                 | -
-CTC-Beam search                         | Not supported yet.                        | TBD (Task 6)
+
+<table>
+<thead>
+<tr>
+<th>Required Components</th>
+<th> PaddlePaddle Support</th>
+<th> Need to Develop</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data Layer I (Spectrogram) </td>
+<td>Not supported yet.</td>
+<td>TBD (Task 3)</td>
+</tr>
+<tr>
+<td>Data Layer II (Transcription)  </td>
+<td> paddle.data_type.integer_value_sequence</td>
+<td> - </td>
+</tr>
+<tr>
+<td>2D Convolution Layer </td>
+<td> paddle.layer.image_conv_layer</td>
+<td> - </td>
+</tr>
+<tr>
+<td>DataType Converter (vec2seq)</td>
+<td> paddle.layer.block_expand</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Bi-/Uni-directional RNNs </td>
+<td>paddle.layer.recurrent_group</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Row Convolution Layer </td>
+<td>Not supported yet.</td>
+<td>TBD (Task 4)</td>
+</tr>
+<tr>
+<td>CTC-loss Layer </td>
+<td>paddle.layer.warp_ctc</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Batch Normalization Layer </td>
+<td>paddle.layer.batch_norm</td>
+<td> - </td>
+</tr>
+<tr>
+<td>CTC-Beam search </td>
+<td>Not supported yet.</td>
+<td> TBD (Task 6) </td>
+</tr>
+</tbody>
+</table>
+

 ### Row Convolution

@ -141,18 +208,18 @@ TODO by Assignees
 ### Beam Search with CTC and LM

 <div align="center">
-<img src="images/beam_search.png" width=600><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/beam_search.png" width=600><br/>
 Figure 2. Algorithm for CTC Beam Search Decoder.
 </div>

- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts: 
-   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths; 
+- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
+   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, because one prefix may come from different paths;
   - 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
 - An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding.
 - Such external scorer consists of language model, word count or any other custom scorers.
 - The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7)
- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality. 
- 
+- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
+

 ## Future Work

--- a/doc/fluid/design/network/sequence_decoder.md
+++ b/doc/fluid/design/network/sequence_decoder.md
@ -199,7 +199,7 @@ Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail i

 ## LoD and shape changes during decoding
 <p align="center">
-  <img src="./images/LOD-and-shape-changes-during-decoding.jpg"/>
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg"/>
 </p>

 According to the image above, the only phase that changes the LoD is beam search.
--- a/doc/fluid/design/others/gan_api.md
+++ b/doc/fluid/design/others/gan_api.md
@ -1,24 +1,24 @@
 # Design for GAN

-GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas. 
+GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas.

 It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth.

 In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.

 <p align="center">
-<img src="./test.dot.png" width = "35%" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/test.dot.png" width = "35%" align="center"/><br/>
 Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
 </p>

 The operators, layers and functions required/optional to build a GAN demo is summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.

 <p align="center">
-<img src="./dcgan.png" width = "90%" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/dcgan.png" width = "90%" align="center"/><br/>
 Figure 2. Photo borrowed from the original DC-GAN paper.
 </p>

-## The Conditional-GAN might be a class. 
+## The Conditional-GAN might be a class.
 This design we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains following data structure:

 - DCGAN(object): which contains everything required to build a GAN model. It provides following member functions methods as API:
@ -29,7 +29,7 @@ This design we adopt the popular open source design in https://github.com/carped
 Returns a generated image.

 - discriminator(image):
-Given an image, decide if it is from a real source or a fake one. 
+Given an image, decide if it is from a real source or a fake one.
 Returns a 0/1 binary label.

 - build_model(self):
@ -47,7 +47,7 @@ To be more detailed, we introduce our design of DCGAN as following:
 ```python
 class DCGAN(object):
  def __init__(self, y_dim=None):
-  
+
    # hyper parameters  
    self.y_dim = y_dim # conditional gan or not
    self.batch_size = 100
@ -82,18 +82,18 @@ class DCGAN(object):
    # input z: the random noise
    # input y: input data label (optional)
    # output G_im: generated fake images
-    
+
    if not self.y_dim:
      z = pd.layer.concat(1, [z, y])
-      
+
    G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0)
    G_h0_bn = pd.layer.batch_norm(G_h0)
    G_h0_relu = pd.layer.relu(G_h0_bn)
-    
+
    G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1)
    G_h1_bn = pd.layer.batch_norm(G_h1)
    G_h1_relu = pd.layer.relu(G_h1_bn)
-    
+
    G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2))
    G_im = pd.layer.tanh(G_im)
    return G_im
@ -111,11 +111,11 @@ class DCGAN(object):
    D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0)
    D_h0_bn = pd.layer.batchnorm(h0)
    D_h0_relu = pd.layer.lrelu(h0_bn)
-    
+
    D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1)
    D_h1_bn = pd.layer.batchnorm(D_h1)
    D_h1_relu = pd.layer.lrelu(D_h1_bn)
-    
+
    D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2)
    return D_h2
 ```
@ -123,7 +123,7 @@ class DCGAN(object):
 ### Class member function: Build the model
 - Define data readers as placeholders to hold the data;
 - Build generator and discriminators;
- Define two training losses for discriminator and generator, respectively. 
+- Define two training losses for discriminator and generator, respectively.
 If we have execution dependency engine to back-trace all tensors, the module building our GAN model will be like this:
 ```python
 class DCGAN(object):
@ -133,7 +133,7 @@ class DCGAN(object):
    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
    self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
    self.z = pd.data(tf.float32, [None, self.z_size])
-    
+
    # step 1: generate images by generator, classify real/fake images with discriminator
    if self.y_dim: # if conditional GAN, includes label
        self.G = self.generator(self.z, self.y)
@ -147,12 +147,12 @@ class DCGAN(object):
        # generate fake images
        self.sampled = self.sampler(self.z)
        self.D_f = self.discriminator(self.images)
-    
+
    # step 2: define the two losses
    self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
    self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
    self.d_loss = self.d_loss_real + self.d_loss_fake
-    
+
    self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie))
 ```

@ -176,7 +176,7 @@ class DCGAN(object):
        self.G = self.generator(self.z)
        self.D_g = self.discriminator(self.G, self.y)
      self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie))
-    
+
    with pd.default_block().d_block():
      if self.y_dim: # if conditional GAN, includes label
        self.D_t = self.discriminator(self.images, self.y)
@ -217,7 +217,7 @@ if __name__ == "__main__":

    # load mnist data
    data_X, data_y = self.load_mnist()
-    
+
    # Two subgraphs required!!!
    with pd.block().d_block():
      d_optim = pd.train.Adam(lr = .001, beta= .1)
@ -228,7 +228,7 @@ if __name__ == "__main__":

    # executor
    sess = pd.executor()
-    
+
    # training
    for epoch in xrange(10000):
      for batch_id in range(N / batch_size):
@ -239,7 +239,7 @@ if __name__ == "__main__":
        batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])

        if batch_id % 2 == 0:
-          sess.run(d_step, 
+          sess.run(d_step,
                   feed_dict = {dcgan.images: batch_im,
                                dcgan.y: batch_label,
                                dcgan.z: batch_z})
--- a/Show More
+++ b/Show More