Merge branch 'develop' into release/0.10.0

feature/design_of_v2_layer_converter
Luo Tao 8 years ago
commit c810e6a7c0

@ -0,0 +1,28 @@
| Github account | name |
|---|---|
| reyoung | Yang Yu |
| gangliao | Gang Liao |
| luotao01 | Tao Luo |
| jacquesqiao | Long-Fei Qiao |
| qingqing01 | Qing-Qing Dang |
| hedaoyuan | Dao-Yuan He |
| wangyang59 | Yang Wang |
| QiJune | Jun Qi |
| tianbingsz | Tian-Bing Xu |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| typhoonzero | Yi Wu |
| backyes | Yan-Fei Wang |
| pengli09 | Peng Li |
| livc | Zhao Li |
| Xreki | Yi-Qun Liu |
| Yancey1989 | Xu Yan |
| emailweixu | Wei Xu |
| wen-bo-yang | Wen-Bo Yang |
| helinwang | He-Lin Wang |
| lcy-seso | Ying Cao |
| Zrachel | Rui-Qing Zhang |
| Haichao-Zhang | Hai-Chao Zhang |
| gongweibao | Wei-Bao Gong |
| lzhao4ever | Liang Zhao |
| zhouxiao-coder | Xiao Zhou |
| lipeng-unisound | Peng Li |

@ -1,19 +1,19 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
include(system)
@ -50,6 +50,7 @@ option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
@ -75,6 +76,13 @@ endif(ANDROID)
set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
if (WITH_C_API AND WITH_PYTHON)
message(WARNING "It is suggested not to embed a Python interpreter in Paddle "
"when using the C-API. Using a Python interpreter different from the one "
"used at compile time can lead to unpredictable behavior.")
endif()
########################################################################################
include(external/zlib) # download, build, install zlib

@ -46,6 +46,11 @@ RUN pip install --upgrade pip && \
pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
RUN apt-get install -y libssl-dev libffi-dev
RUN pip install certifi urllib3[secure]
RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
cd .. && rm -rf cmake-3.4.1

@ -1,8 +1,6 @@
# Release v0.10.0
We are glad to release version 0.10.0. In this version, we are happy to
release the
new
We are glad to release version 0.10.0. In this version, we are happy to release the new
[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/).
- Our old Python API is kind of out of date. It's hard to learn and hard to

@ -1,56 +0,0 @@
Cao, Ying
Cheng, Yujuan
Dang, Qingqing
Dong, Tengfei
Du, Dalong
Feng, Shouqiang
Gao, Haoyuan
Han, Baochang
Han, Jinchen
Hao, Nanyu
He, Daoyuan
He, Zhengyan
Hou, Jue
Huang, Chang
Huang, Zhiheng
Hu, Na
Kong, Qi
Liao, Gang
Li, Bo
Li, Jiajie
Li, Jing
Li, Lei
Li, Peng
Liu, Sheng
Liu, Yuan
Li, Yuze
Luo, Heng
Luo, Tao
Lyu, Qin
Mao, Hongyue
Qian, Xiaojun
Qiao, Longfei
Qi, Jun
Qin, Duohao
Shen, Guolong
Shi, Guangchuan
Song, Xiang
Wang, Helin
Wang, Jiang
Wang, Yanfei
Wang, Yi
Wang, Yong
Weng, Renliang
Xu, Tianbing
Xu, Wei
Xu, Xingyu
Yan, Chong
Yan, Chunwei
Yang, Yi
Yu, Yang
Yu, Yinan
Zhang, Jian
Zhang, Ruiqing
Zhang, Weide
Zhao, Liang
Zhou, Jie

@ -5,7 +5,7 @@
# If any cblas implementation is found, the following variables will be set.
# CBLAS_PROVIDER # one of MKL, ATLAS, OPENBLAS, REFERENCE
# CBLAS_INC_DIR # the include directory for cblas.
# CBLAS_LIBS # a list of libraries that should be linked by paddle.
# # Each library should be the full path to an object file.
#
# User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT
@ -63,11 +63,11 @@ set(ATLAS_LIB_SEARCH_PATHS
/usr/lib/atlas
/usr/lib/atlas-base # special for ubuntu 14.04.
)
find_path(ATLAS_INC_DIR NAMES cblas.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
@ -76,11 +76,12 @@ if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND)
set(CBLAS_PROVIDER ATLAS)
set(CBLAS_INC_DIR ${ATLAS_INC_DIR})
set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
add_definitions(-DPADDLE_USE_ATLAS)
message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
set(CBLAS_FOUND ON)
if(ATLAS_CLAPACK_INC_DIR)
add_definitions(-DPADDLE_USE_LAPACK)
set(CBLAS_INC_DIR ${CBLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
endif()
return()
@ -124,7 +125,7 @@ endif()
## Then find the reference-cblas. www.netlib.org/blas/
set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
"Folder contains reference-cblas")
set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
${REFERENCE_CBLAS_ROOT}/include

@ -34,7 +34,7 @@ set(IGNORE_PATTERN
#
# first argument: target name to attach
# rest arguments: source list to check code style.
#
# NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing.
macro(add_style_check_target TARGET_NAME)
if(WITH_STYLE_CHECK)
@ -48,13 +48,17 @@ macro(add_style_check_target TARGET_NAME)
if(filename MATCHES ${pattern})
message(STATUS "DROP LINT ${filename}")
set(LINT OFF)
endif()
endforeach()
if(LINT MATCHES ON)
add_custom_command(TARGET ${TARGET_NAME}
get_filename_component(base_filename ${filename} NAME)
set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
add_custom_command(OUTPUT ${CUR_GEN}
PRE_BUILD
COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
"--filter=${STYLE_FILTER}" ${filename}
"--filter=${STYLE_FILTER}"
"--write-success=${CUR_GEN}" ${filename}
DEPENDS ${filename}
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
endif()
endforeach()

@ -1,11 +1,11 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -20,7 +20,7 @@ FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
IF(PROTOBUF_FOUND)
EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
IF (${PROTOBUF_VERSION} VERSION_LESS "3.1.0")
IF ("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0")
SET(PROTOBUF_FOUND OFF)
ENDIF()
ENDIF(PROTOBUF_FOUND)

@ -197,3 +197,4 @@ if(CUDA_ARCH)
endif()
set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})

@ -28,6 +28,11 @@ ELSE(WIN32)
STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}")
SET(MACOS_VERSION ${VERSION})
SET(HOST_SYSTEM "macosx")
IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
# Set cache variable - end user may change this during ccmake or cmake-gui configure.
SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
"Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
ENDIF()
ELSE(APPLE)
IF(EXISTS "/etc/issue")

@ -1,27 +1,40 @@
import gzip
import math
import paddle.v2 as paddle
dictsize = 1953
embsize = 32
hiddensize = 256
N = 5
def wordemb(inlayer):
wordemb = paddle.layer.table_projection(
wordemb = paddle.layer.embedding(
input=inlayer,
size=embsize,
param_attr=paddle.attr.Param(
name="_proj",
initial_std=0.001,
learning_rate=1,
l2_rate=0, ))
l2_rate=0,
sparse_update=True))
return wordemb
def main():
paddle.init(use_gpu=False, trainer_count=1)
# for local training
cluster_train = False
if not cluster_train:
paddle.init(use_gpu=False, trainer_count=1)
else:
paddle.init(
use_gpu=False,
trainer_count=2,
port=7164,
ports_num=1,
ports_num_for_sparse=1,
num_gradient_servers=1)
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
firstword = paddle.layer.data(
@ -57,6 +70,9 @@ def main():
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
'w') as f:
trainer.save_parameter_to_tar(f)
result = trainer.test(
paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), 32))
@ -65,11 +81,15 @@ def main():
result.metrics)
cost = paddle.layer.classification_cost(input=predictword, label=nextword)
parameters = paddle.parameters.create(cost)
adam_optimizer = paddle.optimizer.Adam(
adagrad = paddle.optimizer.AdaGrad(
learning_rate=3e-3,
regularization=paddle.optimizer.L2Regularization(8e-4))
trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
trainer = paddle.trainer.SGD(cost,
parameters,
adagrad,
is_local=not cluster_train)
trainer.train(
paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
num_passes=30,
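The event handler above now saves a parameter snapshot as `batch-<id>.tar.gz` every 100 batches. If the v2 API's `paddle.parameters.Parameters.from_tar` is available (as in other PaddlePaddle v2 examples), a later run could reload such a snapshot roughly like this hedged sketch (the file name is illustrative):
```python
import gzip
import paddle.v2 as paddle

# Reload a snapshot written by trainer.save_parameter_to_tar() above.
with gzip.open("batch-900.tar.gz", "r") as f:
    parameters = paddle.parameters.Parameters.from_tar(f)
```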

@ -498,6 +498,12 @@ hsigmoid
:members: hsigmoid
:noindex:
smooth_l1_cost
--------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: smooth_l1_cost
:noindex:
Check Layer
============

@ -419,6 +419,11 @@ hsigmoid
.. autoclass:: paddle.v2.layer.hsigmoid
:noindex:
smooth_l1_cost
--------------
.. autoclass:: paddle.v2.layer.smooth_l1_cost
:noindex:
Check Layer
============

@ -17,12 +17,16 @@ A training job will be created once user asks Paddle cloud to train a model. The
1. the *master process*, which dispatches tasks to
1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via
1. one or more *parameter server processes*, where each holds a shard of the global model.
1. one or more *parameter server processes*, where each holds a shard of the global model, receives the gradients uploaded by the *trainer processes*, and runs the optimization functions to update its shard of the parameters.
Their relation is illustrated in the following graph:
<img src="src/paddle-model-sharding.png"/>
By coordinating these processes, PaddlePaddle supports both Synchronous Stochastic Gradient Descent (sync SGD) and Asynchronous Stochastic Gradient Descent (async SGD) for training user-defined neural network topologies.
When training with sync SGD, the parameter servers wait for all trainers to finish sending their gradients and then send the updated parameters back; training cannot proceed until every trainer has received the updated parameters, which creates a synchronization point between trainers. When training with async SGD, each trainer uploads its gradients and downloads new parameters individually, without synchronizing with the other trainers. Async SGD is faster in terms of time per pass, but the gradients are noisier because trainers are likely to be working with a stale model.
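The contrast can be made concrete with a small, hypothetical sketch (plain NumPy, not the PaddlePaddle API): sync SGD applies one aggregated update per mini-batch after every trainer has reported, while async SGD applies each trainer's gradient the moment it arrives.
```python
import numpy as np

def sync_sgd_step(params, trainer_gradients, learning_rate=0.01):
    # Wait until every trainer has sent a gradient, then apply one aggregated update.
    aggregated = np.mean(trainer_gradients, axis=0)
    return params - learning_rate * aggregated

def async_sgd_step(params, gradient, learning_rate=0.01):
    # Apply an update as soon as a single trainer's gradient arrives.
    return params - learning_rate * gradient
```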
### Master Process
The master process will:
@ -31,7 +35,7 @@ The master process will:
- Keep track of training progress on the dataset with [task queue](#task-queue). A training job iterates over the dataset for a full pass before going into the next pass.
#### Task
A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size.
@ -78,7 +82,7 @@ The communication pattern between the trainers and the parameter servers depends
- Synchronous Stochastic Gradient Descent (sync-SGD)
The parameter server waits for all trainers to finish the n-th mini-batch calculation and send their gradients before broadcasting new parameters to every trainer. Every trainer waits for the new parameters before starting the (n+1)-th mini-batch.
- Asynchronous Stochastic Gradient Descent (async-SGD)
There is no synchronization between different trainers, and the parameter server updates its parameters as soon as it receives a new gradient:
@ -118,8 +122,6 @@ When the master is started by the Kubernetes, it executes the following steps at
1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update.
The master process will kill itself if its etcd lease expires.
When the master process is dead for any reason, Kubernetes will restart it. It will be online again with all its state recovered from etcd in a few minutes.
### Trainer Process
@ -132,6 +134,8 @@ When the trainer is started by the Kubernetes, it executes the following steps a
If the trainer's etcd lease expires, it will try to set the key `/trainer/<unique ID>` again so that the master process can discover the trainer again.
When a trainer fails, Kubernetes will try to restart it. The recovered trainer will fetch tasks from the TODO queue and continue training.
### Parameter Server Process
When the parameter server is started by Kubernetes, it executes the following steps at startup:
@ -140,11 +144,11 @@ When the parameter server is started by Kubernetes, it executes the following st
1. Search through etcd keys `/ps/<index>` (`/ps/0`, `/ps/1`, ...) to find the first non-existent key whose index is smaller than the total number of parameter servers. Set the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name (a sketch of this registration step appears after this list).
The desired number of parameter servers is 3:
<img src="src/paddle-ps-0.png"/>
The third parameter server joined:
<img src="src/paddle-ps-1.png"/>
1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index).
@ -153,6 +157,13 @@ When the parameter server is started by Kubernetes, it executes the following st
If the parameter server's etcd lease expires, the parameter server will kill itself.
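The registration step above (claiming the first free `/ps/<index>` key) can be sketched with a hypothetical helper built on the python-etcd3 client; the design does not prescribe a client library, and lease handling is omitted here.
```python
import etcd3

def claim_pserver_index(desired_pserver_count, pserver_addr):
    client = etcd3.client()
    for index in range(desired_pserver_count):
        key = "/ps/%d" % index
        # Transaction: write the key only if it does not exist yet, so two
        # parameter servers cannot claim the same index concurrently.
        claimed, _ = client.transaction(
            compare=[client.transactions.version(key) == 0],
            success=[client.transactions.put(key, pserver_addr)],
            failure=[])
        if claimed:
            return index  # the parameter server's index is inferred from the key name
    raise RuntimeError("all parameter server slots are taken")
```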
## Parameter Server Checkpointing
See [here](./checkpointing.md)
## Storing and dispatching training data
See [here](./data_dispatch.md)
## Dynamic Scaling
### Trainer Scaling

@ -0,0 +1,44 @@
## Model parameter checkpointing
Checkpointing the model parameters protects a training job against the failure of one or more parameter servers. A checkpoint periodically writes a complete image of the model data held in each parameter server's memory to disk, so that training can be restarted from an intermediate state. For a training job that cannot be interrupted and has no backup, fault tolerance is achieved by periodically saving a snapshot of each parameter server's data to a ***distributed storage service***, for example keeping the latest snapshot every 10 minutes and deleting earlier ones. When a single node fails, it is enough to restore that node, or migrate it to another node and restart it, to resume the training job.
<img src="src/checkpointing.png" width="500"/>
### Design of snapshot saving
Notes:
* After a parameter server starts in the cluster, it automatically mounts the distributed storage directory and saves its snapshots under that directory.
* ***Note: each parameter server saves its checkpoint independently. For now we do not attempt to have multiple parameter servers cooperatively save a global checkpoint at one specific point in time, because doing so still cannot eliminate randomness.***
The checkpoint-saving procedure is as follows (a sketch of this flow appears after the list):
1. When the "every 10 minutes" condition is met, the parameter server acquires the `read_lock` on the in-memory parameters and starts a new thread to save the checkpoint. If a checkpoint-saving thread is already running, the request is ignored. Because updating the parameters requires the `write_lock` on the parameter memory, the parameter server pauses parameter updates and waits while the snapshot is being written.
2. The parameter server generates a UUID and writes the snapshot data into a new file, named after this UUID, in the designated directory. After the snapshot is written, it computes the file's MD5 sum and then writes the JSON content `{"uuid": [UUID], "md5": "MD5 sum", "timestamp": xxxx}` to `/checkpoints/[pserver_id]` in etcd.
3. Delete the snapshot files in the on-disk directory whose names do not match the current UUID.
4. Release the lock on the parameter memory and stop the checkpoint-saving thread.
Note that in a real environment the training job may saturate the network bandwidth between trainers and parameter servers; if a parameter server also has to reach the distributed storage over the network to save a snapshot, the resulting congestion can make the job stall periodically.
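A minimal sketch of the snapshot-saving flow, in Python. The names `params_lock`, `serialize_params`, and the etcd client wrapper are hypothetical placeholders; the design only fixes the on-disk layout and the etcd key format.
```python
import hashlib
import json
import os
import time
import uuid

def save_checkpoint(pserver_id, params, params_lock, snapshot_dir, etcd_client):
    with params_lock.read_lock():             # parameter updates pause while this is held
        snapshot_uuid = str(uuid.uuid4())
        path = os.path.join(snapshot_dir, snapshot_uuid)
        blob = serialize_params(params)       # hypothetical serialization helper
        with open(path, "wb") as f:
            f.write(blob)
        md5 = hashlib.md5(blob).hexdigest()
        # Record the newest snapshot in etcd so that recovery knows which file to load.
        etcd_client.put("/checkpoints/%s" % pserver_id,
                        json.dumps({"uuid": snapshot_uuid,
                                    "md5": md5,
                                    "timestamp": int(time.time())}))
        # Keep only the snapshot that was just written.
        for name in os.listdir(snapshot_dir):
            if name != snapshot_uuid:
                os.remove(os.path.join(snapshot_dir, name))
```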
### Recovering from a snapshot
When a parameter server starts for the first time, or is restarted by Kubernetes after a failure, it needs to roll back to the last checkpoint (a sketch follows this list):
1. Read the node `/checkpoints/[pserver_id]` from etcd to obtain the UUID of the latest checkpoint file.
1. Load the checkpoint snapshot file named after that UUID from disk and load the parameters from it.
1. If either of the two steps above fails, initialize the parameters with the initialization method defined by the start-up arguments.
1. Start serving.
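A matching sketch of the recovery path; `deserialize_params` and `init_params_from_config` are hypothetical placeholders for the inverse of the serialization helper and for the initialization method given in the start-up arguments.
```python
import hashlib
import json
import os

def restore_checkpoint(pserver_id, snapshot_dir, etcd_client, init_params_from_config):
    try:
        value, _ = etcd_client.get("/checkpoints/%s" % pserver_id)
        meta = json.loads(value)
        path = os.path.join(snapshot_dir, meta["uuid"])
        with open(path, "rb") as f:
            blob = f.read()
        if hashlib.md5(blob).hexdigest() != meta["md5"]:
            raise IOError("checkpoint %s is corrupted" % path)
        return deserialize_params(blob)       # hypothetical, inverse of serialize_params
    except Exception:
        # Fall back to the initialization method defined by the start-up arguments.
        return init_params_from_config()
```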
## TODO List
### Speculative / accelerated execution (TODO)
In a heterogeneous cluster, some trainers may run too slowly and drag down the whole cluster (Trainer 1 in the figure). In that case the master is responsible for starting a new trainer (Accelerate Trainer 2) that trains on the same data block. Whichever trainer finishes the block first wins, and the slower one is killed.
### Dynamic scale-out / scale-in
For now we only consider dynamically scaling out the number of trainers, which keeps the system simpler.
## Glossary
* model: all the parameters obtained from deep learning training; with this neural network one can make predictions on new data.
* parameters: the parameters of a neural network, including the weights w and the biases b. A neural network model consists of a large number of parameters.
* shard: one of the pieces that a whole is split into.
* model shard: the parameters of a neural network split into multiple pieces, with each shard stored on one parameter server.
* parameter block: multiple parameter blocks make up one model shard.
* single point of failure: at any moment at most one server fails. Since the probability of two machines in the cluster failing at the same time is extremely low ((mean failure rate * mean time to repair)^2), tolerating two or more simultaneous failures is only considered for special online systems.

@ -0,0 +1,120 @@
## Storage and dispatching of training data
### Workflow overview
Training datasets in production environments are usually large and stored on distributed storage such as Hadoop HDFS, Ceph, or AWS S3. These storage services typically split the data into multiple shards and distribute them across many nodes, which makes it possible to run several kinds of data-oriented jobs in the cloud, including:
* data preprocessing jobs
* Paddle training jobs
* online model inference services
<img src="src/paddle-cloud-in-data-center.png" width="500"/>
The figure above shows the data flow of a real production application (face recognition). Production log data is stored either as a real-time stream (Kafka) or as offline data (HDFS). Multiple distributed data-processing jobs run in the cluster, such as streaming processing (online data process) and offline batch processing (offline data process), to preprocess the data and feed it to Paddle as training data. Users can also upload labeled data to the distributed storage to supplement the training data. The model produced by the deep learning training running on Paddle is then served to the online face recognition application.
### Storage of training data
We choose CephFS as the storage service for the training data.
Different computing frameworks running on Kubernetes can mount storage into each container through a Volume or PersistentVolume.
A public directory in the CephFS storage system holds some preloaded public datasets (e.g. MNIST, BOW, ImageNet) that submitted jobs can use directly.
### File preprocessing
Before a dataset can be used for training, its files must be converted into the internal storage format of the PaddlePaddle cluster (SSTable). We provide two ways to convert:
- a library for local conversion, so users can write a program to do the conversion themselves;
- users can upload their own dataset and run a MapReduce job on the cluster to do the conversion.
The generated file names have the following format:
```text
name_prefix-aaaaa-of-bbbbb
```
"aaaaa" and "bbbbb" are both five-digit numbers. Each file is one shard of the dataset; "aaaaa" is the index of the shard and "bbbbb" is the maximum shard index.
For example, the ImageNet dataset might be split into 1000 shards, with file names:
```text
imagenet-00000-of-00999
imagenet-00001-of-00999
...
imagenet-00999-of-00999
```
#### Conversion library
For both local and cloud conversion we provide a Python conversion library, whose interface is:
```python
def convert(output_path, reader, num_shards, name_prefix)
```
- `output_path`: directory in which output files will be saved.
- `reader`: a [data reader](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#data-reader-interface), from which the convert program will read data instances.
- `num_shards`: the number of shards that the dataset will be partitioned into.
- `name_prefix`: the name prefix of generated files.
`reader` yields one data instance at a time; an instance can be a single value, or multiple values represented as a tuple:
```python
yield 1 # a single value
yield numpy.random.uniform(-1, 1, size=28*28) # a single value
yield numpy.random.uniform(-1, 1, size=28*28), 0 # multiple values
```
Each value can be an integer, a float, a string, a list of these types, or a numpy.ndarray. Values of any other type are serialized into a string with Pickle.
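A minimal sketch of what such a converter could look like, assuming each shard is written as a plain file of pickled instances; the real converter writes the cluster's internal SSTable format instead.
```python
import pickle

def convert(output_path, reader, num_shards, name_prefix):
    max_index = num_shards - 1                   # the "bbbbb" part of the file name
    shards = [[] for _ in range(num_shards)]
    for i, instance in enumerate(reader()):
        shards[i % num_shards].append(instance)  # round-robin partitioning
    for index, instances in enumerate(shards):
        name = "%s/%s-%05d-of-%05d" % (output_path, name_prefix, index, max_index)
        with open(name, "wb") as f:
            pickle.dump(instances, f)
```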
### Example programs
#### Using the conversion library
The `reader` produced by the following `reader_creator` yields one data instance at a time; each instance contains two values, a numpy.ndarray and an integer:
```python
import numpy

def reader_creator():
    def reader():
        for i in range(1000):
            yield numpy.random.uniform(-1, 1, size=28*28), 0  # multiple values
    return reader
```
Passing the `reader` produced by `reader_creator` to the `convert` function completes the conversion:
```python
convert("./", reader_creator(), 100, "random_images")
```
The call above generates 100 files in the current directory:
```text
random_images-00000-of-00099
random_images-00001-of-00099
...
random_images-00099-of-00099
```
#### Training
PaddlePaddle provides a dedicated [data reader creator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#python-data-reader-design-doc) that produces the data reader for a given set of SSTable files. **The reader is used in exactly the same way locally and in the cloud:**
```python
# ...
reader = paddle.reader.creator.SSTable("/home/random_images-*-of-*")
batch_reader = paddle.batch(reader, 128)
trainer.train(batch_reader, ...)
```
The data instances produced by this reader are exactly the same as those produced by the reader used when the dataset was generated.
### Uploading training files
The following command uploads local data to the storage cluster:
```bash
paddle cp filenames pfs://home/folder/
```
For example, to upload the random_images dataset converted in the earlier example to `/home/` in the cloud:
```bash
paddle cp random_images-*-of-* pfs://home/
```
## TODO
### Support user-defined data preprocessing jobs
