Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_avg
commit
d1a7b47e04
@ -0,0 +1,30 @@
INCLUDE(ExternalProject)

SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)
SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)
INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})

# ThreadPool is a header-only library, so no configure/build/install steps
# are needed; the ExternalProject only clones the sources.
ExternalProject_Add(
    extern_threadpool
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY    "https://github.com/progschj/ThreadPool.git"
    GIT_TAG           9a42ec1329f259a5f4881a291db1dcb8f2ad9040
    PREFIX            ${THREADPOOL_SOURCE_DIR}
    UPDATE_COMMAND    ""
    CONFIGURE_COMMAND ""
    BUILD_COMMAND     ""
    INSTALL_COMMAND   ""
    TEST_COMMAND      ""
)

# add_dependencies() on INTERFACE libraries requires CMake >= 3.3, so older
# CMake versions get a dummy STATIC library instead.
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/threadpool_dummy.c)
    file(WRITE ${dummyfile} "const char *dummy_threadpool = \"${dummyfile}\";")
    add_library(simple_threadpool STATIC ${dummyfile})
else()
    add_library(simple_threadpool INTERFACE)
endif()

add_dependencies(simple_threadpool extern_threadpool)

LIST(APPEND external_project_dependencies simple_threadpool)

@ -0,0 +1,83 @@
digraph G {
    subgraph cluster_init {
        label="Initialization"
        startup_program [label="startup", shape=box]
        node_w_g0 [label="W\nGPU0"]
        startup_program -> node_w_g0 [label="Initialize"]
        node_w_g1 [label="W\nGPU1"]
        node_w_g0 -> node_w_g1 [label="broadcast"]
    }

    subgraph cluster_train {
        label="forward_backward"

        subgraph cluster_gpu0 {
            label="GPU0"
            fc_0 [label="fc\nGPU0", shape=box]
            hidden_0 [label="hidden\nGPU0"]
            node_w_g0 -> fc_0
            fc_0 -> hidden_0
            loss0 [label="loss\nGPU0"]
            hidden_0 -> loss0 [label="many ops omitted"]
            scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box]
            loss_g0 [label="loss_grad\nGPU0"]
            scale_loss_0 -> loss_g0

            fc_g_0 [label="w_grad\nGPU0", shape=box]
            loss0 -> fc_g_0
            loss_g0 -> fc_g_0
            hidden_0 -> fc_g_0
        }

        subgraph cluster_gpu1 {
            label="GPU1"
            fc_1 [label="fc\nGPU1", shape=box]
            hidden_1 [label="hidden\nGPU1"]
            node_w_g1 -> fc_1
            fc_1 -> hidden_1
            loss1 [label="loss\nGPU1"]
            hidden_1 -> loss1 [label="many ops omitted"]
            scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box]
            loss_g1 [label="loss_grad\nGPU1"]
            scale_loss_1 -> loss_g1

            fc_g_1 [label="w_grad\nGPU1", shape=box]
            loss1 -> fc_g_1
            loss_g1 -> fc_g_1
            hidden_1 -> fc_g_1
        }
    }

    all_reduce_w [label="Merge Gradients(AllReduce)", shape=box]
    fc_g_0 -> all_reduce_w
    fc_g_1 -> all_reduce_w

    fc_g_0_merged [label="w_grad\nMerged\nGPU0"]
    fc_g_1_merged [label="w_grad\nMerged\nGPU1"]
    all_reduce_w -> fc_g_0_merged
    all_reduce_w -> fc_g_1_merged

    subgraph cluster_optimization {
        label="Optimization"
        subgraph cluster_opt_gpu0 {
            label="GPU0"
            sgd_0 [label="SGD Op\nGPU0", shape=box]

            fc_g_0_merged -> sgd_0
            node_w_g0 -> sgd_0
            optimized_w_0 [label="Optimized W\nGPU0"]
            sgd_0 -> optimized_w_0
        }
        subgraph cluster_opt_gpu1 {
            label="GPU1"
            sgd_1 [label="SGD Op\nGPU1", shape=box]

            fc_g_1_merged -> sgd_1
            node_w_g1 -> sgd_1
            optimized_w_1 [label="Optimized W\nGPU1"]
            sgd_1 -> optimized_w_1
        }
    }
}

(binary image added: 175 KiB)
@ -0,0 +1,104 @@
# ParallelExecutor

## Background

Neural network models are defined as a `ProgramDesc` in Fluid. The `ProgramDesc` can be executed by an interpreter (i.e., the `executor` concept in Fluid). The instructions or operators in a `Program` will be executed, and the results will be fetched on the Python side.

The executor is a very naive interpreter: it runs operators one by one. We can use `Parallel.Do` to support data parallelism; however, since `ProgramDesc` lacks device information, it is not possible to optimize the performance of `Parallel.Do`.

We want a `ProgramDesc` that can run on different nodes, so it is better not to embed device information in `ProgramDesc`. However, we can write a high-performance interpreter that holds an alternative intermediate representation of `ProgramDesc` and takes full advantage of multiple GPUs.

ParallelExecutor is an interpreter of `ProgramDesc` which will [execute the `Program` out of order](https://en.wikipedia.org/wiki/Out-of-order_execution) in data-parallel mode and maximize the utilization of multiple GPUs.

## Overview of Multi-GPU Logic

ParallelExecutor takes the startup program and the main program as inputs. The parameters are initialized on `GPU0` by the startup program and then broadcast to the other GPUs. The main program is duplicated onto each GPU. The gradients are merged during each iteration, and each device then optimizes its parameters independently. Since the gradients on each device are merged before parameter optimization, the parameters stay identical on every device, and there is no need to broadcast them again.

![Parallel executor overview](images/parallel_executor_overview.png)

There are several optimizations for this logic.

1. We use an alternative representation in ParallelExecutor because device information is critical for performance optimization.
2. The execution is out of order, i.e., an operator is executed whenever its inputs are ready.
    * A GPU is a high-performance device; a single CPU thread cannot keep one GPU busy, so a thread pool is used to execute operators (see the sketch after this list).
    * Out-of-order execution also helps transpilers generate `ProgramDesc`: a transpiler does not need to worry about the best execution order for performance.
3. Computation, gradient merging, and data fetching use different streams.
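
As a reference for the thread-pool bullet above: the `extern_threadpool` rule earlier in this diff fetches the progschj `ThreadPool` header, and a minimal usage sketch (an illustration, not Paddle code) looks like this:

```c++
#include <future>
#include <iostream>
#include <vector>

#include "ThreadPool.h"  // header-only library fetched by extern_threadpool

int main() {
  ThreadPool pool(4);  // four worker threads sharing one task queue

  // enqueue() returns a std::future; an executor could submit each
  // operator this way as soon as the operator's inputs are ready.
  std::vector<std::future<int>> results;
  for (int i = 0; i < 8; ++i) {
    results.emplace_back(pool.enqueue([i] { return i * i; }));
  }
  for (auto &r : results) std::cout << r.get() << " ";
  std::cout << "\n";
}
```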

The performance of `ResNeXt152` on `TitanX` with `batch_size=12` is shown below.

| Number of GPUs | 1 | 2 | 3 | 4 |
| --- | --- | --- | --- | --- |
| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 |
| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 |

## Static Single Assignment Graph

[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form) (`SSA` for short) is a common form for compiler optimization. To implement concurrent execution, we use an `SSA` graph as an intermediate representation of `ProgramDesc`.

In a `Program`, a variable can be assigned multiple times. We enforce that each variable is assigned only once by adding a version number to variables, and we parse the `Program` into an `SSA` graph, which is a directed acyclic graph. Also, ParallelExecutor duplicates the `Program` onto multiple devices, so we add a device number to variables and insert `NCCLAllReduce` operators into the graph.

The data structure of the `SSA` graph is:

```c++
struct VarHandleBase {
  // The operator that generates this variable; nullptr if the variable
  // is not generated by any operator.
  OpHandleBase* generated_op_;
  // The operators that consume this variable as an input.
  vector<OpHandleBase*> pending_ops_;

  string name;
  Place place;
  size_t version;
};

struct OpHandleBase {
  // The variable handles this operator reads and writes.
  vector<VarHandleBase*> inputs_;
  vector<VarHandleBase*> outputs_;
};

struct SSAGraph {
  // Variables on each device:
  // * each map in the vector holds the variables of one device;
  // * each map maps a variable name to variable handles with
  //   different versions.
  vector<std::unordered_map<string, vector<VarHandleBase>>> vars_;

  // All operators.
  vector<OpHandleBase> ops_;
};
```

Variable handles are wrappers of `Variable`s. Operator handles are wrappers of `OperatorBase`. Some `OpHandle`s do not wrap an `OperatorBase`; for example, `NCCLAllReduceOpHandle` is not one, because it uses its own device contexts.

When the `ProgramDesc` is converted into an `SSA` graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem also needs to be taken care of. Dummy variables, which represent dependencies between operators, are manually inserted into the SSA graph to resolve the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem.

## Execute SSA Graph

The SSA graph can be executed out of order by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is:

1. Maintain a map from each operator to the number of its inputs that are not yet ready.
2. If a variable is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the pending-input count of each of its pending operators.
3. Whenever an operator's pending-input count drops to zero, run that operator.
4. After an operator runs, mark its output variables as generated and repeat step 2 until all variables are generated.
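
Below is a minimal, single-threaded C++ sketch of this counting scheme. The `VarHandle`/`OpHandle` shapes are simplified from the data structures in the previous section, and `Run()` stands in for the real kernel launch; in the actual executor, each ready operator would be submitted to the thread pool instead of run inline.

```c++
#include <cstddef>
#include <queue>
#include <unordered_map>
#include <vector>

struct OpHandle;

struct VarHandle {
  OpHandle* generated_op = nullptr;    // nullptr: ready from the start
  std::vector<OpHandle*> pending_ops;  // operators consuming this variable
};

struct OpHandle {
  std::vector<VarHandle*> inputs;
  std::vector<VarHandle*> outputs;
  void Run() { /* stand-in for the real kernel launch */ }
};

// Approximate topological execution: run each operator as soon as all
// of its inputs have been generated.
void RunSSAGraph(const std::vector<OpHandle*>& ops) {
  std::unordered_map<OpHandle*, size_t> pending_inputs;
  std::queue<OpHandle*> ready;

  // Step 1: count the not-yet-generated inputs of every operator.
  // Inputs with generated_op == nullptr are ready from the start (step 2).
  for (OpHandle* op : ops) {
    size_t n = 0;
    for (VarHandle* in : op->inputs) {
      if (in->generated_op != nullptr) ++n;
    }
    pending_inputs[op] = n;
    if (n == 0) ready.push(op);  // all inputs are already available
  }

  // Steps 3-4: run ready operators; each finished operator marks its
  // outputs generated, possibly unblocking its consumers.
  while (!ready.empty()) {
    OpHandle* op = ready.front();
    ready.pop();
    op->Run();
    for (VarHandle* out : op->outputs) {
      for (OpHandle* consumer : out->pending_ops) {
        if (--pending_inputs[consumer] == 0) ready.push(consumer);
      }
    }
  }
}
```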

In the real implementation, running an operator is asynchronous: there is a thread pool to execute the `SSA` graph.

## Synchronize GPU Kernels

The GPU is a non-blocking device, so different streams need to be synchronized when execution switches between streams. In the current implementation, the synchronization is based on the following algorithm:

1. Each `OpHandle` records the `DeviceContext` that it uses.
2. In `OpHandle::Run`, if the `DeviceContext` of the current operator is different from the `DeviceContext` of any input variable, wait for the operator that generated that input variable to complete.

The wait is implemented by one of two strategies:

1. Invoke `DeviceContext->Wait()`, which blocks until all operators on that device context have completed.
2. Use `cudaStreamWaitEvent` to enqueue an event wait into the stream. This is a non-blocking host call; the waiting is performed on the GPU (see the sketch after this list).
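
For the second strategy, the host-side pattern looks roughly like this (a generic CUDA-runtime illustration, not Paddle's implementation; `StreamWait` is a hypothetical helper name):

```c++
#include <cuda_runtime.h>

// Make work later enqueued on `consumer` wait for everything already
// enqueued on `producer`, without blocking the calling CPU thread.
void StreamWait(cudaStream_t producer, cudaStream_t consumer) {
  cudaEvent_t event;
  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
  cudaEventRecord(event, producer);         // marks the producer's current tail
  cudaStreamWaitEvent(consumer, event, 0);  // GPU-side wait; host returns now
  cudaEventDestroy(event);  // safe: destruction is deferred until completion
}
```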

Generally, `cudaStreamWaitEvent` has better performance; however, the `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed at runtime.

## What's next?

* Merging gradients of dense parameters has been done. However, the merging of sparse parameters has not.
* The CPU version of ParallelExecutor has not been implemented. The out-of-order logic will make CPU computation faster, too.
* A better strategy to merge gradients can be introduced, e.g., shrinking the gradients from `float32` to `int8` or `int4` while merging. This would significantly speed up multi-GPU training without much loss of precision.
* Combine with the multi-node implementation. Thanks to out-of-order execution, the send and receive operators can be blocking operators, and the transpiler does not need to worry about their best position.

@ -0,0 +1 @@
../../v2/build_and_install/build_from_source_cn.rst
@ -0,0 +1 @@
../../v2/build_and_install/build_from_source_en.rst
@ -0,0 +1 @@
../../v2/build_and_install/docker_install_cn.rst
@ -0,0 +1 @@
../../v2/build_and_install/docker_install_en.rst
@ -1,2 +0,0 @@
安装与使用
------------
@ -0,0 +1 @@
../../v2/build_and_install/index_cn.rst
@ -1,2 +0,0 @@
Build and Install
------------
@ -0,0 +1 @@
../../v2/build_and_install/index_en.rst
@ -0,0 +1 @@
../../v2/build_and_install/pip_install_cn.rst
@ -0,0 +1 @@
../../v2/build_and_install/pip_install_en.rst
@ -0,0 +1,7 @@
梯度更新算法
------------

.. toctree::
    :maxdepth: 1

    parameter_average.md
@ -0,0 +1,7 @@
Gradient Update Algorithm
--------------------------------------

.. toctree::
    :maxdepth: 1

    parameter_average.md
@ -0,0 +1,18 @@
核心概念
-------------

.. toctree::
    :maxdepth: 1

    README.md
    cpp_data_feeding.md
    functions_operators_layers.md
    program.md
    variable.md
    var_desc.md
    tensor.md
    tensor_array.md
    lod_tensor.md
    block.md
    scope.md
    executor.md
@ -0,0 +1,18 @@
Core Concepts
--------------------------------------

.. toctree::
    :maxdepth: 1

    README.md
    cpp_data_feeding.md
    functions_operators_layers.md
    program.md
    variable.md
    var_desc.md
    tensor.md
    tensor_array.md
    lod_tensor.md
    block.md
    scope.md
    executor.md
@ -0,0 +1,139 @@
# Channel Design

## Introduction

A Channel is a data structure that allows for synchronous interprocess
communication via message passing. It is a fundamental component of CSP
(communicating sequential processes), and it allows users to pass data
between threads without having to worry about synchronization.

## How to use it

Paddle offers Python APIs to open and close channels, along with sending
and receiving data to/from a channel.

### Create a channel

Creates a new channel that takes in variables of a specific dtype.

- **fluid.make_channel(dtype, capacity=0)**
  - **dtype**: The data type of variables being sent/received through the channel
  - **capacity**: The capacity of the channel. A capacity of 0 represents
    an unbuffered channel; a capacity > 0 represents a buffered channel.

```
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=10)
```

### Close a channel

Closes a channel. Any pending senders and receivers will be awoken during
this time. Receivers can still receive from a closed channel, but senders
are not allowed to send any additional data to the channel (Paddle will
raise an exception if users try to send to a closed channel).

- **fluid.channel_close(channel)**

```
fluid.channel_close(ch)
```

### Send data to a channel

Sends a variable to a channel. Currently, variables of dtype `LoDTensor`,
`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
`ChannelHolder` are supported.

By default, the data of the Variable is moved from the sender to the receiver;
however, the user can optionally copy the data before performing the send.

- **channel_send(channel, variable, is_copy=False)**
  - **channel**: The channel to send the variable to
  - **variable**: The variable to send to the channel
  - **is_copy**: If set to True, channel_send will perform a variable assign
    to copy the source variable to a new variable to be sent.

```
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
var = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=100)
fluid.channel_send(ch, var, True)
```

### Receive data from a channel

Receives a variable from a channel. The data of the variable is moved to the
receiving variable.

- **channel_recv(channel, return_variable)**
  - **channel**: The channel to receive the variable from
  - **return_variable**: The destination variable used to store the data of the
    variable received from the channel

```
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
var = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
fluid.channel_recv(ch, var)
```

## How it Works

Channels provide a simple interface for different threads to share data.
To support the synchronization requirements, channels utilize a series of
internal queues, locks, and condition variables.

### QueueMessage

QueueMessage encapsulates the state of a channel send/receive operation to be
put in the **sendq/recvq**. It contains a condition variable used to block the
thread (when there are no available sends/receives). In addition, it contains
a callback function to notify a thread when the QueueMessage is being
processed by the channel.

### Queues

- **buff_**: This queue holds the data buffer in a buffered channel. Its
  capacity is set to the capacity of the channel. This data buffer is not
  used in an unbuffered channel.

- **sendq**: This queue holds the QueueMessage of any pending senders of a
  channel. When a thread performs a channel_send operation on the channel, the
  channel_send operation will put a new QueueMessage on the sendq and block the
  current thread under two conditions:
  1. The channel is buffered and is full
  2. The channel is unbuffered and does not have a receiver

- **recvq**: This queue holds the QueueMessage of any pending receivers of a
  channel. When a thread performs a channel_recv operation on the channel, the
  channel_recv operation will put a new QueueMessage on the recvq and block the
  current thread under two conditions:
  1. The channel is buffered and there is no data in the buff_
  2. The channel is unbuffered and does not have a sender

A simplified sketch of this queue-and-condition-variable structure is shown
below.
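
The C++ sketch below illustrates the buffered case with one mutex, a data
buffer (`buff_`), and two condition variables standing in for the blocked
senders and receivers of the sendq/recvq. It is an illustration of the
mechanism only, not Paddle's actual Channel implementation; the unbuffered
rendezvous and the QueueMessage callbacks are omitted.

```
#include <condition_variable>
#include <cstddef>
#include <deque>
#include <mutex>
#include <stdexcept>

template <typename T>
class SimpleChannel {
 public:
  explicit SimpleChannel(size_t capacity) : capacity_(capacity) {}

  // Blocks while the buffer is full (the "pending sender" case).
  void Send(T value) {
    std::unique_lock<std::mutex> lock(mu_);
    send_cv_.wait(lock, [&] { return closed_ || buff_.size() < capacity_; });
    if (closed_) throw std::runtime_error("send on closed channel");
    buff_.push_back(std::move(value));
    recv_cv_.notify_one();  // wake one pending receiver
  }

  // Blocks while the buffer is empty (the "pending receiver" case).
  // Returns false once the channel is closed and drained.
  bool Recv(T* out) {
    std::unique_lock<std::mutex> lock(mu_);
    recv_cv_.wait(lock, [&] { return closed_ || !buff_.empty(); });
    if (buff_.empty()) return false;
    *out = std::move(buff_.front());
    buff_.pop_front();
    send_cv_.notify_one();  // wake one pending sender
    return true;
  }

  // Wakes every blocked sender and receiver, as described above.
  void Close() {
    std::lock_guard<std::mutex> lock(mu_);
    closed_ = true;
    send_cv_.notify_all();
    recv_cv_.notify_all();
  }

 private:
  const size_t capacity_;
  std::mutex mu_;
  std::condition_variable send_cv_, recv_cv_;
  std::deque<T> buff_;  // the buffered data, as in buff_ above
  bool closed_ = false;
};
```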

### State diagram

#### Channel Send

<p align="center">
<img src="./images/channel_send.png"/><br/>
</p>

#### Channel Receive

<p align="center">
<img src="./images/channel_recv.png"/><br/>
</p>

## Limitations and Considerations

### Variable Copy

In golang, variables in channels are copied from the sender to the receiver.
In Paddle, the data from our variables are **moved** from sender to receiver.
As a result, these variables should not be used after they are sent. We
provide a flag in the channel_send method to allow users to copy the variable
to be sent before it is sent.

Please note that this is achieved by adding an **assign** operator and creating
a temporary variable that is sent in place of the original variable. Also note
that the **assign** operator has limited support for only certain variable
datatypes.