Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into Add_pool_op
@@ -0,0 +1,48 @@
set -e

unset OMP_NUM_THREADS MKL_NUM_THREADS
export OMP_DYNAMIC="FALSE"
export KMP_AFFINITY="granularity=fine,compact,0,0"

function train() {
  topology=$1
  bs=$2
  use_mkldnn=$3
  if [ "$use_mkldnn" == "True" ]; then
    thread=1
    log="logs/${topology}-mkldnn-${bs}.log"
  elif [ "$use_mkldnn" == "False" ]; then
    thread=`nproc`
    log="logs/${topology}-${thread}mklml-${bs}.log"
  else
    echo "Wrong input $3, use True or False."
    exit 1
  fi
  args="batch_size=${bs}"
  config="${topology}.py"
  paddle train --job=time \
    --config=$config \
    --use_mkldnn=$use_mkldnn \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=10 \
    --test_period=100 \
    --config_args=$args \
    2>&1 | tee ${log}
}

# train.list is a dummy file list required by the data provider.
if [ ! -f "train.list" ]; then
  echo " " > train.list
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi

#========== mkldnn ==========#
train vgg 64 True
train vgg 128 True
train vgg 256 True

#========== mklml ===========#
train vgg 64 False
train vgg 128 False
train vgg 256 False

@@ -0,0 +1,103 @@
#!/usr/bin/env python
from paddle.trainer_config_helpers import *

height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg('layer_num', int, 19)

args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
    "train.list", None, module="provider", obj="process", args=args)

settings(
    batch_size=batch_size,
    learning_rate=0.01 / batch_size,
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * batch_size))

img = data_layer(name='image', size=height * width * 3)


def vgg_network(vgg_num=3):
    # Conv blocks 1 and 2: two 3x3 convolutions each, followed by 2x2 max pooling.
    tmp = img_conv_group(
        input=img,
        num_channels=3,
        conv_padding=1,
        conv_num_filter=[64, 64],
        conv_filter_size=3,
        conv_act=ReluActivation(),
        pool_size=2,
        pool_stride=2,
        pool_type=MaxPooling())

    tmp = img_conv_group(
        input=tmp,
        conv_num_filter=[128, 128],
        conv_padding=1,
        conv_filter_size=3,
        conv_act=ReluActivation(),
        pool_stride=2,
        pool_type=MaxPooling(),
        pool_size=2)

    # Conv block 3: vgg_num 3x3 convolutions with 256 filters.
    channels = [256] * vgg_num
    tmp = img_conv_group(
        input=tmp,
        conv_num_filter=channels,
        conv_padding=1,
        conv_filter_size=3,
        conv_act=ReluActivation(),
        pool_stride=2,
        pool_type=MaxPooling(),
        pool_size=2)

    # Conv blocks 4 and 5: vgg_num 3x3 convolutions with 512 filters each.
    channels = [512] * vgg_num
    tmp = img_conv_group(
        input=tmp,
        conv_num_filter=channels,
        conv_padding=1,
        conv_filter_size=3,
        conv_act=ReluActivation(),
        pool_stride=2,
        pool_type=MaxPooling(),
        pool_size=2)
    tmp = img_conv_group(
        input=tmp,
        conv_num_filter=channels,
        conv_padding=1,
        conv_filter_size=3,
        conv_act=ReluActivation(),
        pool_stride=2,
        pool_type=MaxPooling(),
        pool_size=2)

    tmp = fc_layer(
        input=tmp,
        size=4096,
        act=ReluActivation(),
        layer_attr=ExtraAttr(drop_rate=0.5))

    tmp = fc_layer(
        input=tmp,
        size=4096,
        act=ReluActivation(),
        layer_attr=ExtraAttr(drop_rate=0.5))

    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())


if layer_num == 16:
    vgg = vgg_network(3)
elif layer_num == 19:
    vgg = vgg_network(4)
else:
    raise ValueError("Wrong layer number, use 16 or 19.")

lab = data_layer(name='label', size=num_class)
loss = cross_entropy(input=vgg, label=lab)
outputs(loss)

@@ -0,0 +1,222 @@
# Design Doc: Distributed Training Architecture

## Abstract

PaddlePaddle v0.10.0 uses the "trainer-parameter server"
architecture. We run multiple replicated instances of trainers (each
runs the same code written by the user) and parameter servers for
distributed training. This architecture served us well, but has some
limitations:

1. We need to write special code to handle tasks which should only be
   run by a single trainer, e.g., initializing the model and saving
   the model.

2. Model parallelism is hard: we need to write if-else branches
   conditioned on the trainer ID to partition the model onto each
   trainer, and manually write the inter-model-shard communication
   code.

3. The user cannot directly specify the parameter update rule: one
   needs to modify the parameter server C++ code and compile a new
   binary. This complicates things for researchers, since a lot of
   extra effort is required. Besides, the training job submission
   program may not allow running arbitrary binaries.

This design doc discusses PaddlePaddle's new distributed training
architecture that addresses the above limitations.

## Analysis

We assume the user writes the trainer program in Python; the same
analysis holds if the trainer program is written in C++.

### Limitation 1

If we look at the Python code that the user writes, there are two
kinds of functionalities:

- The training logic, such as loading / saving the model and printing
  logs.
- The neural network definition, such as the definition of the data
  layer, the fully connected layer, the cost function and the
  optimizer.

When we train with PaddlePaddle v0.10.0 distributedly, multiple
replicated Python instances run on different nodes: both the
training logic and the neural network computation are replicated.

The tasks that should only run once all belong to the training logic.
If we replicate only the neural network computation, but do **not**
replicate the training logic, this limitation can be solved.

### Limitation 2

Model parallelism means running a single model on multiple nodes by
partitioning the model onto different nodes and managing the
inter-model-shard communication.

PaddlePaddle should be able to modify the neural network computation
definition to support model parallelism automatically. However, the
computation is only specified in Python code, and PaddlePaddle cannot
modify Python code.

Just like a compiler uses an intermediate representation (IR) so that
the programmer does not need to manually optimize their code in most
cases - the compiler optimizes the IR instead:

<img src="src/compiler.png"/>

We can have our own IR too: PaddlePaddle can support model parallelism
by converting the IR so that the user no longer needs to do it
manually in Python:

<img src="src/paddle-compile.png"/>

The IR for PaddlePaddle after the refactoring is called a `Block`; it
specifies the computation dependency graph and the variables used in
the computation.
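
To make the `Block` idea concrete, below is a minimal sketch of what
such an IR container might look like. The class and field names
(`Var`, `Op`, `Block`, `vars`, `ops`) are illustrative assumptions,
not the actual refactored data structures.

```Python
# A minimal, illustrative sketch of an IR container; all names here are
# assumptions, not the actual PaddlePaddle data structures.
class Var(object):
    def __init__(self, name, shape):
        self.name = name      # unique variable name, e.g. "fc_0.w"
        self.shape = shape    # tensor shape, e.g. [784, 200]


class Op(object):
    def __init__(self, op_type, inputs, outputs, attrs=None):
        self.op_type = op_type    # e.g. "mul", "add", "send"
        self.inputs = inputs      # list of input variable names
        self.outputs = outputs    # list of output variable names
        self.attrs = attrs or {}  # e.g. {"device": "/trainer/0/gpu/0"}


class Block(object):
    """Holds the variables and the dependency-ordered list of operators."""
    def __init__(self):
        self.vars = {}   # variable name -> Var
        self.ops = []    # topologically ordered Ops

    def var(self, name, shape):
        self.vars[name] = Var(name, shape)
        return self.vars[name]

    def append_op(self, op_type, inputs, outputs, attrs=None):
        self.ops.append(Op(op_type, inputs, outputs, attrs))
```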

### Limitation 3

The user cannot directly specify the parameter update rule for the
parameter server because the parameter server does not use the same
computation definition as the trainer. Instead, the update rule is
baked into the parameter server. The user cannot specify the update
rule in the same way as the trainer computation.

This could be fixed by making the parameter server run the same
computation definition as the trainer. For a detailed explanation,
please see
[Design Doc: Operation Graph Based Parameter Server](./dist_train.md).
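
As an illustration of what it means for the update rule to be part of
the computation definition, the hedged sketch below expresses plain
SGD as ordinary operators appended to the `Block` sketched earlier, so
the same definition could in principle run on a parameter server. The
operator names (`scale`, `sub`) and variable naming are assumptions.

```Python
# Illustrative only: express the SGD update "w = w - lr * grad" as graph ops
# appended to the Block sketched above, instead of an update rule baked into
# the parameter server binary. Operator names are assumptions.
def append_sgd_update(block, param, grad, learning_rate=0.01):
    scaled = param + "@scaled_grad"
    block.var(scaled, block.vars[grad].shape)
    # scaled = learning_rate * grad
    block.append_op("scale", inputs=[grad], outputs=[scaled],
                    attrs={"scale": learning_rate})
    # param = param - scaled  (in-place update of the parameter variable)
    block.append_op("sub", inputs=[param, scaled], outputs=[param])


block = Block()
block.var("fc_0.w", [784, 200])
block.var("fc_0.w@grad", [784, 200])
append_sgd_update(block, "fc_0.w", "fc_0.w@grad", learning_rate=0.01)
```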

## Distributed Training Architecture

The new distributed training architecture can address the above
limitations. Below is the illustration:

<img src="src/distributed_architecture.png"/>

The architecture includes three major components: *PaddlePaddle
Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.

### PaddlePaddle Python

PaddlePaddle Python is the Python library that the user's Python
trainer invokes to build the neural network topology, start training,
etc.

```Python
paddle.init()
input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster
img, label = input[0], input[1]
hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh())
prediction = paddle.layer.fc(input=hidden, size=10, act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(input=prediction, label=label)
optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01)
session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1)
for i in range(1000):
    _, cost_val = session.eval(targets=[cost, optimizer])
    print cost_val
```

The code above is typical Python trainer code: the neural network
topology is built using helper functions such as `paddle.layer.fc`,
and the training is done by calling `session.eval` iteratively.

#### session.eval

As shown in the graph, `session.eval` sends the IR and the evaluation
inputs/targets to the PaddlePaddle cluster for evaluation. The
targets can be any variable in the computation graph. When the target
is the `optimizer` variable, the neural network will be optimized
once. When the target is the `cost` variable, `session.eval` returns
the cost value.
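
For instance, evaluating only the `cost` target computes a forward
pass without updating the parameters, which is how one might run
validation. This usage sketch reuses the (assumed) `session`, `cost`
and `optimizer` objects from the example above; the return shape for a
single target is likewise an assumption.

```Python
# Usage sketch, reusing the assumed `session`, `cost` and `optimizer` objects
# from the example above.

# Train: including `optimizer` in the targets triggers one optimization step.
_, train_cost = session.eval(targets=[cost, optimizer])

# Evaluate: fetching only `cost` runs the forward pass without updating any
# parameters, e.g. on a validation batch. (Single-target return shape assumed.)
val_cost = session.eval(targets=[cost])
```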

The Python `session` is a wrapper of the C++ `Session` class. For more
information about `Session`, please
see [Design Doc: Session](./session.md).

### PaddlePaddle Converter

The PaddlePaddle converter automatically converts the IR in the
request (IR and evaluation inputs/targets) from PaddlePaddle Python
into partitioned IRs and dispatches the new IRs and evaluation
inputs/targets to different PaddlePaddle runtimes. Below are the steps
(a code sketch of the pipeline follows the list):

1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that
   fetches the eval targets, to the IR.

1. Extract a new computation (sub)graph with the `feed` and `fetch`
   OPs as the boundary. The runtime does not need to run any OP that
   the `fetch` OP does not depend on.

1. Optimize the computation graph.

1. Place the OPs in the graph onto different devices on different
   PaddlePaddle runtimes according to a placement algorithm and the
   device constraints specified by the user.

1. Partition the graph according to runtime boundaries and add
   `send` / `recv` OP pairs on the runtime boundaries.

1. Dispatch the partitioned graphs to different PaddlePaddle runtimes.

1. The PaddlePaddle runtimes with the `fetch` OP report the evaluation
   results back to the converter, and the converter reports the
   evaluation results back to PaddlePaddle Python.

The output IRs will be cached to reduce the conversion latency.
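
The following is a hedged sketch of this pipeline, reusing the
illustrative `Block` / `Op` classes from the earlier sketch. The
function name `convert`, the `placement_fn` callback, and the
partitioning details are assumptions about structure, not actual
converter APIs; op ordering within each partition and de-duplication
of transfers are glossed over.

```Python
# Illustrative sketch of the converter pipeline; all names are assumptions.
from collections import defaultdict


def convert(block, feeds, fetches, placement_fn):
    # 1. Add feed/fetch OPs at the graph boundary.
    block.append_op("feed", inputs=[], outputs=list(feeds))
    block.append_op("fetch", inputs=list(fetches), outputs=[])

    # 2. Prune: keep only OPs that the fetch OP transitively depends on.
    needed, kept = set(fetches), []
    for op in reversed(block.ops):
        if op.op_type in ("feed", "fetch") or needed & set(op.outputs):
            kept.append(op)
            needed |= set(op.inputs)
    kept.reverse()

    # 3. Graph-level optimizations (e.g. operator fusion) would go here.

    # 4. Place each OP on a runtime, e.g. "/ps/0" or "/trainer/1".
    producer = {}
    partitions = defaultdict(list)
    for op in kept:
        op.attrs["device"] = placement_fn(op)
        partitions[op.attrs["device"]].append(op)
        for out in op.outputs:
            producer[out] = op.attrs["device"]

    # 5. For every variable crossing a runtime boundary, add a send/recv pair.
    for op in kept:
        for var in op.inputs:
            src, dst = producer.get(var), op.attrs["device"]
            if src is not None and src != dst:
                partitions[src].append(Op("send", [var], [], {"to": dst}))
                partitions[dst].append(Op("recv", [], [var], {"from": src}))

    # 6. The caller dispatches each partition to its runtime; the runtime that
    #    owns the fetch OP reports results back to the converter.
    return partitions
```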

#### Placement Algorithm

Our first implementation will only support the "trainer-parameter
server" placement: the parameters, initializers, and optimizers are
placed on the PaddlePaddle runtimes with the parameter server role,
and everything else is placed on the PaddlePaddle runtimes with the
trainer role. This has the same functionality as the
"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
is more general and flexible.
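
A minimal sketch of this first placement rule, written as the
`placement_fn` assumed by the converter sketch above; the op-type
list and the device-name strings are assumptions made purely for
illustration.

```Python
# A minimal sketch of the "trainer-parameter server" placement rule, usable as
# the placement_fn in the converter sketch above. The op-type list and
# device-name strings are assumptions.
PS_OP_TYPES = ("uniform_random_init", "sgd", "momentum")  # initializers, optimizers


def trainer_ps_placement(op, num_ps=2, trainer_id=0):
    if op.op_type in PS_OP_TYPES:
        # Shard parameter-related OPs across parameter servers, e.g. by hashing
        # the name of the first output variable (the parameter).
        shard = hash(op.outputs[0]) % num_ps if op.outputs else 0
        return "/ps/%d" % shard
    # Everything else runs on the trainer that issued the request.
    return "/trainer/%d" % trainer_id
```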

In the future, we will implement a general placement algorithm, which
makes placements according to the input IR and a model of device
computation time and device communication time. Model parallelism
requires the general placement algorithm.

### PaddlePaddle Runtime

The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
runs the IR. The runtime does not need to do OP placement, since that
is already done by the converter.
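
To connect this to the sketches above, a runtime can be pictured as an
executor that walks its partitioned `Block` and runs each OP on the
device recorded by the converter. The kernel registry and scope
dictionary below are assumptions, not the real runtime implementation.

```Python
# Illustrative executor for one runtime; the kernel registry and scope dict
# are assumptions, not the real runtime implementation.
class Runtime(object):
    def __init__(self, kernels):
        self.kernels = kernels  # op_type -> callable standing in for a device kernel
        self.scope = {}         # variable name -> tensor value

    def run(self, ops):
        # Placement was already decided by the converter, so the runtime just
        # executes its partition in dependency order.
        for op in ops:
            kernel = self.kernels[op.op_type]
            inputs = [self.scope[name] for name in op.inputs]
            outputs = kernel(inputs, op.attrs)
            for name, value in zip(op.outputs, outputs):
                self.scope[name] = value
```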

### Local Training Architecture

The local training architecture will be the same as the distributed
training architecture; the difference is that everything runs locally,
and there is just one PaddlePaddle runtime:

<img src="src/local_architecture.png"/>

### Training Data

In PaddlePaddle v0.10.0, training data is typically read with a
[data reader](../reader/README.md) from Python. This approach is no
longer efficient when training distributedly, since the Python process
no longer runs on the same node as the trainer processes: the Python
reader would need to read from the distributed filesystem (assuming it
has access) and send the data to the trainers, doubling the network
traffic.

When doing distributed training, the user can still use the Python
data reader: the training data are sent with `session.eval`. However,
this should be used for debugging purposes only. Users are encouraged
to use the data-reading OPs.
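
The contrast between the two approaches might look like the following
sketch, reusing the (assumed) `paddle.op.recordIO` reader OP from the
earlier example; the `feeds` argument and the `my_python_reader`
helper are hypothetical, shown only to illustrate the data path.

```Python
# Debugging only: a Python-side reader sends every batch over the network
# through session.eval (the `feeds` argument shape is an assumption).
for batch in my_python_reader():            # runs in the local Python process
    session.eval(targets=[cost, optimizer],
                 feeds={"image": batch[0], "label": batch[1]})

# Preferred: a data-reading OP (as in the earlier example) is part of the IR,
# so each trainer reads its shard directly from the distributed filesystem.
input = paddle.op.recordIO("/home/data/mnist.recordio")
img, label = input[0], input[1]
```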

## References

[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)

[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf)

@@ -0,0 +1,71 @@
# Cluster bootstrapping tool survey

## Abstract

To bring up a cluster from bare metal machines to a fully functional Kubernetes cluster for PaddlePaddle to run on, we need to utilize some tools. Here we compare [Sextant](https://github.com/k8sp/sextant) and [Tectonic installer](https://github.com/coreos/tectonic-installer).

## Basic assumptions

Here are some basic assumptions before we move on to details:

1. You are an administrator of a bare metal machine cluster, which means:
   * you have full control over each of the machines.
   * you have full control over the network the machines are connected to.
2. Machines can be booted from the network with PXE or iPXE.
3. You understand the [general procedure to bring up a cluster](#appendix-general-procedure-to-bring-up-a-cluster).

If your cluster satisfies all of the above, keep reading.

## Comparing Sextant and Tectonic installer

### Sextant

Sextant is an end-to-end solution for bringing up a bare metal cluster as a fully functional k8s cluster; it integrates DHCP, a name service, PXE, a cloud-config service, and a Docker registry.

#### Pros

1. End-to-end: basically all the admin needs to do is configure cluster.yaml and power on the cluster.
2. Offline cluster configuration: Sextant has two phases, config time and deploy time. At config time, the admin's machine needs internet connectivity to download images, etc. At deploy time, it is completely fine to be offline, since all dependencies were fetched at config time.
3. Integrated Docker registry.
4. GPU machines are taken care of.

#### Cons

1. The k8s API server is not deployed with high availability by default.
2. No grouping support.
3. No API interface; it is a one-off service.

### Tectonic installer

First of all, Tectonic is not free: it requires a coreos.com account as a step of the installation, and free users can only create fewer than 10 nodes.

Tectonic is a suite of software that wraps around k8s and provides additional DevOps utilities. Tectonic installer, as its name suggests, installs Tectonic onto a bare metal cluster, which means it is not exactly an equivalent of Sextant. For the "booting a cluster" part, it mostly utilizes [Matchbox](https://github.com/coreos/matchbox), which is a general cluster bootstrapper.

Matchbox's approach is similar to Sextant's.

#### Pros

1. Supports grouping machines.
2. Supports running the provisioning service in rkt (not a big deal though).
3. Supports an HTTP/gRPC API interface.
4. Supports multiple templates.

#### Cons

1. Not an end-to-end solution for bringing up a cluster; it needs a lot of extra work and other software.
2. [Not fully supporting](https://github.com/coreos/matchbox/issues/550) CentOS deployment yet.

## Conclusion

Sextant is the better solution overall for deploying Paddle Cloud to a bare metal cluster. It would be even better if Sextant could also 1) deploy the k8s API server with high availability by default, and 2) not be designed as a one-off service.

## Appendix: General procedure to bring up a cluster

It is physically impossible for a cluster admin to manually install the OS and applications onto cluster nodes one by one. Here is what an admin would do in the cloud industry:

1. Set up a bootstrap machine with a static IP in the cluster, which runs the following services:
   * DHCP: assigns IP addresses to the rest of the nodes.
   * Name service: maps node names to IPs.
   * PXE-related services: boot-related info is delivered to newly booted machines as their IPs are assigned via the DHCP service; the PXE service then provides further booting and installation info and images over TFTP and HTTP.
   * Cluster config service: provides cluster nodes with OS configuration via HTTP.
   * Optional Docker registry: a built-in Docker registry makes the whole cluster independent of an internet connection and speeds up software distribution.
2. A new node powers on and will:
   * broadcast a request for an IP address.
   * The DHCP server assigns an IP address and delivers the PXE boot info to the node.
   * The node requests config files, using the boot info delivered via DHCP, from the TFTP service; in most cases the config file points to an HTTP service for the boot image.
   * Since PXE is configured with an initrd, the node uses the cloud-config service for further installation, such as CoreOS or k8s installation.
   * Then the node restarts.

For further understanding, the following two links from Matchbox are good reading:
* [Machine lifecycle](https://github.com/coreos/matchbox/blob/master/Documentation/machine-lifecycle.md)
* [PXE booting](https://github.com/coreos/matchbox/blob/master/Documentation/network-booting.md)