fix conflict

enforce_failed
chengduoZH 8 years ago
commit 0c273ff41e

@@ -0,0 +1,15 @@
#!/bin/bash
set -e
readonly VERSION="3.8"
version=$(clang-format -version)
if ! [[ $version == *"$VERSION"* ]]; then
echo "clang-format version check failed."
echo "a version containing '$VERSION' is needed, but '$version' was found"
echo "please install the right version and add a soft link to it somewhere in your '\$PATH'"
exit -1
fi
clang-format "$@"

.gitignore

@@ -24,4 +24,5 @@ cmake-build-*
 python/paddle/v2/framework/core.so
 CMakeFiles
 cmake_install.cmake
 paddle/.timestamp
+python/paddlepaddle.egg-info/

@@ -19,10 +19,10 @@
 - id: end-of-file-fixer
 - repo: local
 hooks:
-- id: clang-format
+- id: clang-format-with-version-check
 name: clang-format
 description: Format files with ClangFormat.
-entry: clang-format -i
+entry: bash ./.clang_format.hook -i
 language: system
 files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 - repo: https://github.com/PaddlePaddle/pre-commit-golang

@@ -37,8 +37,8 @@ before_install:
 - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
 # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
 # protobuf version.
-- pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
+- pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
-- pip install rarfile nltk==3.2.2 scipy==0.19.0 recordio matplotlib Pillow
+- pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
 - curl https://glide.sh/get | bash
 - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
 - go get -u github.com/alecthomas/gometalinter

@@ -14,8 +14,8 @@
 cmake_minimum_required(VERSION 3.0)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
-set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
+set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-set(PROJ_BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR})
+set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 include(system)
@@ -55,6 +55,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
 option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
 option(GLIDE_INSTALL "Download and install go dependencies " ON)
 option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
+option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -121,8 +122,8 @@ include(version) # set PADDLE_VERSION
 include(coveralls) # set code coverage
-include_directories("${PROJ_ROOT}")
+include_directories("${PADDLE_SOURCE_DIR}")
-include_directories("${PROJ_ROOT}/paddle/cuda/include")
+include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
 include_directories(${Boost_INCLUDE_DIRS})
@@ -137,14 +138,14 @@ set(EXTERNAL_LIBS
 )
 if(WITH_GPU)
-list(APPEND EXTERNAL_LIB ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
+list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
 if(NOT WITH_DSO)
-list(APPEND EXTERNAL_LIB ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
+list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
 endif(NOT WITH_DSO)
 endif(WITH_GPU)
 if(WITH_MKLDNN)
-list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKLDNN_IOMP_LIB})
+list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB})
 endif()
 if(USE_NNPACK)
@@ -164,10 +165,12 @@ if(WITH_GOLANG)
 add_subdirectory(go)
 endif(WITH_GOLANG)
+set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 add_subdirectory(paddle)
 if(WITH_PYTHON)
 add_subdirectory(python)
 endif()
 if(WITH_DOC)
 add_subdirectory(doc)
 endif()

@@ -10,13 +10,11 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 ARG WITH_GPU
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_STYLE_CHECK
 ENV WOBOQ OFF
-ENV WITH_GPU=${WITH_GPU:-OFF}
+ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
-ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 ENV HOME /root
 # Add bash enhancements
@@ -34,9 +32,6 @@ RUN apt-get update && \
 net-tools && \
 apt-get clean -y
-# paddle is using numpy.flip, which is introduced since 1.12.0
-RUN pip --no-cache-dir install 'numpy>=1.12.0'
 # Install Go and glide
 RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
 tar -xz -C /usr/local && \
@@ -58,19 +53,23 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
 # version util jupyter fixes this issue.
 RUN pip install --upgrade pip && \
-pip install -U 'protobuf==3.1.0' && \
+pip install -U wheel && \
-pip install -U wheel pillow BeautifulSoup && \
 pip install -U docopt PyYAML sphinx && \
-pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
+pip install -U sphinx-rtd-theme==0.1.9 recommonmark
-pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
+RUN pip install pre-commit 'ipython==5.3.0' && \
 pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-pip install rarfile
+pip install opencv-python
+COPY ./python/requirements.txt /root/
+RUN pip install -r /root/requirements.txt
 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
 RUN apt-get install -y libssl-dev libffi-dev
 RUN pip install certifi urllib3[secure]
 # Install woboq_codebrowser to /woboq
 RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
 (cd /woboq \

@@ -28,6 +28,10 @@ if(NOT WITH_TIMER)
 add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
+if(USE_EIGEN_FOR_BLAS)
+add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
+endif(USE_EIGEN_FOR_BLAS)
 if(NOT WITH_PROFILER)
 add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
@@ -129,7 +133,7 @@ if(WITH_GOLANG)
 add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
 COMMAND env GOPATH=${GOPATH} ${GLIDE} install
 COMMAND touch ${CMAKE_BINARY_DIR}/glide
-DEPENDS ${PROJ_ROOT}/go/glide.lock
+DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock
 WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
 )

@@ -52,7 +52,7 @@ macro(add_style_check_target TARGET_NAME)
 if(SOURCES_LIST)
 add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py"
 "--filter=${STYLE_FILTER}"
 ${SOURCES_LIST}
 COMMENT "cpplint: Checking source code style"

@@ -2,7 +2,7 @@ if(NOT WITH_GPU)
 return()
 endif()
-set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT")
+set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT")
 find_path(CUDNN_INCLUDE_DIR cudnn.h
 PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
 $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}

@@ -51,7 +51,7 @@ ExternalProject_Add(
 ${EXTERNAL_PROJECT_LOG_ARGS}
 DEPENDS ${MKLDNN_DEPENDS}
 GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
-GIT_TAG "v0.9"
+GIT_TAG "v0.10"
 PREFIX ${MKLDNN_SOURCES_DIR}
 UPDATE_COMMAND ""
 CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}

@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
 SET(MKLML_PROJECT "extern_mklml")
 SET(MKLML_VER "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
+SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR "mklml")

@@ -73,10 +73,18 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
 SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
 FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+IF(${CBLAS_PROVIDER} MATCHES MKL)
+ADD_LIBRARY(cblas SHARED ${dummyfile})
+ELSE()
 ADD_LIBRARY(cblas STATIC ${dummyfile})
+ENDIF()
 TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
 IF(NOT ${CBLAS_FOUND})
 ADD_DEPENDENCIES(cblas extern_openblas)
 LIST(APPEND external_project_dependencies cblas)
+ELSE()
+IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
+ADD_DEPENDENCIES(cblas mklml)
+ENDIF()
 ENDIF(NOT ${CBLAS_FOUND})

@@ -9,11 +9,6 @@ function(CheckCompilerCXX11Flag)
 if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
 message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
 endif()
-# TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
-# Use Debug mode instead for now.
-if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9)
-set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
-endif()
 elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
 # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
 # Apple Clang is a different compiler than upstream Clang which has different version numbers.
@@ -158,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread)
+LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
 LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")

@@ -411,7 +411,7 @@ function(py_test TARGET_NAME)
 set(multiValueArgs SRCS DEPS)
 cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 add_test(NAME ${TARGET_NAME}
-COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
+COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
 python2 ${py_test_SRCS}
 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 endif()

@@ -12,7 +12,7 @@ set(CPACK_PACKAGE_DESCRIPTION "")
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl")
 set(CPACK_DEBIAN_PACKAGE_SECTION Devel)
 set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION})
-set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJ_ROOT}/paddle/scripts/deb/postinst")
+set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst")
 #set(CPACK_GENERATOR "DEB")
 # Start cpack
 include (CMakePackageConfigHelpers)

@@ -141,8 +141,8 @@ endmacro()
 function(create_resources res_file output_file)
 add_custom_command(
 OUTPUT ${output_file}
-COMMAND python ARGS ${PROJ_ROOT}/cmake/make_resource.py ${res_file} ${output_file}
+COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file}
-DEPENDS ${res_file} ${PROJ_ROOT}/cmake/make_resource.py)
+DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py)
 endfunction()

@@ -4,7 +4,7 @@ set(tmp_version "HEAD")
 while ("${PADDLE_VERSION}" STREQUAL "")
 execute_process(
 COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version}
-WORKING_DIRECTORY ${PROJ_ROOT}
+WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
 OUTPUT_VARIABLE GIT_TAG_NAME
 RESULT_VARIABLE GIT_RESULT
 ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

@@ -1,11 +0,0 @@
About PaddlePaddle
================
PaddlePaddle is a parallel, distributed deep learning platform originally developed by Baidu scientists and engineers. It combines ease of use, efficiency, flexibility, and scalability, and is already widely used by many product lines inside Baidu.
PaddlePaddle is now open source, but it is still far from complete; we hope to keep improving, extending, and building on this foundation.
We also hope that the developer community will actively provide feedback and contribute source code, so that an active open source community can grow around the project.
Acknowledgements
--------
We give special thanks to [all contributors](https://github.com/PaddlePaddle/Paddle/graphs/contributors) of PaddlePaddle.

@@ -1,14 +0,0 @@
ABOUT
=======
PaddlePaddle is an easy-to-use, efficient, flexible and scalable deep learning platform,
which is originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu.
PaddlePaddle is now open source but far from complete, which is intended to be built upon, improved, scaled, and extended.
We hope to build an active open source community both by providing feedback and by actively contributing to the source code.
Credits
--------
We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/graphs/contributors>`_ of PaddlePaddle!

@@ -257,6 +257,11 @@ seq_concat
 .. autoclass:: paddle.v2.layer.seq_concat
 :noindex:
+seq_slice
+---------
+.. autoclass:: paddle.v2.layer.seq_slice
+:noindex:
 kmax_sequence_score
 -------------------
 .. autoclass:: paddle.v2.layer.kmax_sequence_score
@@ -362,6 +367,11 @@ trans
 .. autoclass:: paddle.v2.layer.trans
 :noindex:
+scale_shift
+-----------
+.. autoclass:: paddle.v2.layer.scale_shift
+:noindex:
 Sampling Layers
 ===============
@@ -409,9 +419,14 @@ multi_binary_label_cross_entropy_cost
 .. autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
 :noindex:
-huber_cost
-----------
-.. autoclass:: paddle.v2.layer.huber_cost
+huber_regression_cost
+-------------------------
+.. autoclass:: paddle.v2.layer.huber_regression_cost
+:noindex:
+huber_classification_cost
+-------------------------
+.. autoclass:: paddle.v2.layer.huber_classification_cost
 :noindex:
 lambda_cost

@@ -0,0 +1,146 @@
## Auto Gradient Checker Design
## Background
- The forward computation of an operator is easy to check because it has a clear definition. **But** backpropagation is notoriously difficult to debug and get right:
- 1. You need to derive the correct backpropagation formula from the forward computation.
- 2. You need to implement it correctly in C++.
- 3. It is difficult to prepare test data.
- The auto gradient checker computes a numeric gradient using only the forward operator and uses it as the reference for the backward operator's result. This has several advantages:
- 1. The numeric gradient checker only needs the forward operator.
- 2. The user only needs to prepare the input data for the forward operator.
## Mathematical Theory
The following two documents from Stanford give a detailed explanation of how to compute the numeric gradient and why it is useful.
- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
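Both references build on the central difference approximation: for a scalar output $y = f(x)$ and a small perturbation $\delta$ applied to a single input element $x_i$,

$$\frac{\partial y}{\partial x_i} \approx \frac{f(x_1, \ldots, x_i + \delta, \ldots) - f(x_1, \ldots, x_i - \delta, \ldots)}{2\delta}$$

which is exactly what the implementation below evaluates element by element.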
## Numeric Gradient Implementation
### Python Interface
```python
def get_numeric_gradient(op,
input_values,
output_name,
input_to_check,
delta=0.005,
local_scope=None):
"""
Get Numeric Gradient for an operator's input.
:param op: C++ operator instance; it could be a network.
:param input_values: The input variables. Should be a dictionary whose keys are
variable names and whose values are numpy arrays.
:param output_name: The final output variable name.
:param input_to_check: The input variable to compute the gradient for.
:param delta: The perturbation value for the numeric gradient method. The
smaller delta is, the more accurate the result will be, but if delta is
too small, numerical stability problems can occur.
:param local_scope: The local scope used for get_numeric_gradient.
:return: The gradient array in numpy format.
"""
```
### Explanation:
- Why is `output_name` needed?
- An operator may have multiple outputs, and an independent gradient can be computed from each output, so the user must specify which output to use.
- Why is `input_to_check` needed?
- An operator may have multiple inputs. The gradient operator can compute the gradients of all these inputs at the same time, but the numeric gradient has to compute them one by one. `get_numeric_gradient` is therefore designed to compute the gradient for a single input; if you need to check several inputs, call `get_numeric_gradient` multiple times.
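As a usage sketch of the interface above (the `make_add_op` factory below is a hypothetical stand-in for however the framework constructs a C++ operator instance; it is not a real API):
```python
import numpy

# Hypothetical: build an elementwise-add operator with inputs "X", "Y" and output "Out".
add_op = make_add_op(X="X", Y="Y", Out="Out")  # stand-in, not a real helper
inputs = {
    "X": numpy.random.random((4, 4)).astype("float32"),
    "Y": numpy.random.random((4, 4)).astype("float32"),
}
# Numeric gradient of the output "Out" with respect to the input "X" only.
dX = get_numeric_gradient(add_op, inputs, output_name="Out", input_to_check="X")
```
Since the routine sums the output tensor before differencing, `dX` for elementwise addition should come out close to an all-ones array, which makes this a convenient smoke test.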
### Core Algorithm Implementation
```python
# we only compute gradient of one element each time.
# we use a for loop to compute the gradient of every element.
for i in xrange(tensor_size):
# get one input element through its index i.
origin = tensor_to_check.get_float_element(i)
# add delta to it, run op and then get the sum of the result tensor.
x_pos = origin + delta
tensor_to_check.set_float_element(i, x_pos)
y_pos = get_output()
# subtract delta from this element, run op and get the sum of the result tensor.
x_neg = origin - delta
tensor_to_check.set_float_element(i, x_neg)
y_neg = get_output()
# restore old value
tensor_to_check.set_float_element(i, origin)
# compute the gradient of this element and store it into a numpy array.
gradient_flat[i] = (y_pos - y_neg) / delta / 2
# reshape the gradient result to the shape of the source tensor.
return gradient_flat.reshape(tensor_to_check.get_dims())
```
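The same recipe can be exercised outside the framework. Below is a self-contained NumPy illustration (not Paddle code) that applies the identical central-difference loop to a plain Python function; it is handy for convincing yourself that the formula and the `delta` trade-off behave as described:
```python
import numpy as np

def numeric_gradient(f, x, delta=0.005):
    """Central-difference gradient of a scalar function f at point x."""
    x = x.astype("float64")            # work on a copy; the caller's array is untouched
    grad = np.zeros_like(x).ravel()
    flat = x.ravel()
    for i in range(flat.size):
        origin = flat[i]
        flat[i] = origin + delta       # perturb one element upward
        y_pos = f(x)
        flat[i] = origin - delta       # perturb it downward
        y_neg = f(x)
        flat[i] = origin               # restore the original value
        grad[i] = (y_pos - y_neg) / delta / 2
    return grad.reshape(x.shape)

# f sums the squares of its input, so the analytic gradient is 2*x.
x = np.random.random((2, 3))
g = numeric_gradient(lambda t: (t ** 2).sum(), x)
assert np.allclose(g, 2 * x, atol=1e-4)
```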
## Auto Gradient Checker Framework
Each operator kernel has three kinds of gradients:
- 1. the numeric gradient
- 2. the CPU operator gradient
- 3. the GPU operator gradient (if supported)
The numeric gradient only relies on the forward operator, so we use it as the reference value. The check then proceeds as follows:
- 1. Calculate the numeric gradient.
- 2. Calculate the CPU kernel gradient with the backward operator and compare it with the numeric gradient.
- 3. Calculate the GPU kernel gradient with the backward operator and compare it with the numeric gradient (if GPU is supported).
#### Python Interface
```python
def check_grad(self,
forward_op,
input_vars,
inputs_to_check,
output_name,
no_grad_set=None,
only_cpu=False,
max_relative_error=0.005):
"""
:param forward_op: used to create backward_op
:param input_vars: numpy values of the input variables. The following
computation will use these variables.
:param inputs_to_check: the names of the input variables whose gradients should be checked.
:param output_name: the name of the output variable used to compute the gradient.
:param max_relative_error: The relative tolerance parameter.
:param no_grad_set: used when creating the backward ops.
:param only_cpu: only compute and check the gradient on the CPU kernel.
:return:
"""
```
### How to check if two numpy arrays are close enough?
If `abs_numeric_grad` is nearly zero, we use the absolute error for `numeric_grad` rather than the relative error:
```python
numeric_grad = ...
operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
abs_numeric_grad = numpy.abs(numeric_grad)
# if abs_numeric_grad is nearly zero, then use abs error for numeric_grad, not relative
# error.
abs_numeric_grad[abs_numeric_grad < 1e-3] = 1
diff_mat = numpy.abs(numeric_grad - operator_grad) / abs_numeric_grad
max_diff = numpy.max(diff_mat)
```
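A natural final step, sketched here to mirror the `max_relative_error` parameter of `check_grad` above, is to compare `max_diff` against the tolerance and report where the check failed:
```python
if max_diff > max_relative_error:
    # locate the element with the largest relative difference for the error message
    offending = numpy.unravel_index(numpy.argmax(diff_mat), diff_mat.shape)
    raise AssertionError(
        "gradient check failed: relative diff %f exceeds %f at index %s"
        % (max_diff, max_relative_error, offending))
```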
#### Notes
1. The input data for the auto gradient checker should be reasonable, to avoid numerical problems.
#### Refs:
- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)

@@ -54,17 +54,18 @@ The life cycle of a single task is illustrated below:
 <img src="src/paddle-task-states.png"/>
 1. When a new pass of training starts, all tasks will be placed in the todo queue.
-1. The master server will dispatch few tasks to each trainer at a time, puts them in the pending queue and waits for completion.
+1. When a trainer requests a new task, the master server dispatches a task from the todo queue to it, puts the task in the pending queue, and waits for completion.
-1. The trainer will work on its tasks and tell the master server once a task is completed. The master server will dispatch a new task to that trainer.
+1. The trainer works on its task, tells the master server once the task is completed, and asks for a new task. The master server dispatches a new task to that trainer.
-1. If a task timeout. the master server will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded.
+1. If a task fails for any reason in the trainer, or takes longer than a specific period of time, the master server moves the task back to the todo queue. The timeout count for that task increases by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it is discarded.
 1. The master server will move completed tasks to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to the todo queue and resetting the timeout counter of all tasks to zero.
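A toy sketch of the dispatch/timeout bookkeeping described in the list above (illustrative Python only; the real master server is written in Go, and names such as `TaskQueues` and `MAX_TIMEOUTS` are made up here):
```python
import time

MAX_TIMEOUTS = 3  # assumed threshold; the real limit is a master-server option

class TaskQueues:
    def __init__(self, tasks):
        self.todo, self.pending, self.done = list(tasks), {}, []
        self.timeouts = {t: 0 for t in tasks}

    def dispatch(self, trainer_id, timeout_sec=600):
        """Hand one todo task to a requesting trainer and start its timer."""
        task = self.todo.pop(0)
        self.pending[task] = (trainer_id, time.time() + timeout_sec)
        return task

    def finish(self, task):
        """Mark a task done; start a new pass once everything is finished."""
        self.pending.pop(task)
        self.done.append(task)
        if not self.todo and not self.pending:
            self.todo, self.done = self.done, []
            self.timeouts = {t: 0 for t in self.todo}

    def requeue_expired(self):
        """Move timed-out tasks back to todo, or discard them after too many retries."""
        now = time.time()
        for task, (_, deadline) in list(self.pending.items()):
            if now > deadline:
                self.pending.pop(task)
                self.timeouts[task] += 1
                if self.timeouts[task] <= MAX_TIMEOUTS:
                    self.todo.append(task)
                # otherwise the task is discarded as likely to crash trainers
```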
 ### Trainer Process
 The trainer process will:
-- Receive tasks from the master.
+- Request tasks from the master.
-- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
+- Work on the tasks.
+- Upload gradients to the parameter servers, and update the local model by downloading new parameters from the parameter servers.
 ### Parameter Server Process
@@ -119,8 +120,8 @@ When the master is started by the Kubernetes, it executes the following steps at
 1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations.
 1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them.
-1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
+1. Writes its IP address to */master/addr* so that trainers can discover it.
-1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update.
+1. Listens for trainers' task requests, dispatches a task upon each request, and updates the task queue using an etcd transaction to ensure the lock is held during the update.
 When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in a few minutes.
@@ -128,13 +129,11 @@ When the master server process is dead for any reason, Kubernetes will restart i
 When the trainer is started by the Kubernetes, it executes the following steps at startup:
-1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count.
+1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count */ps_desired*.
-1. Generates a unique ID, and sets key `/trainer/<unique ID>` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline.
+1. Finds and watches */master/addr* to get the master's address.
-1. Waits for tasks from the master to start training.
+1. Requests tasks from the master to start training.
-If trainer's etcd lease expires, it will try set key `/trainer/<unique ID>` again so that the master server can discover the trainer again.
+When a trainer fails, Kubernetes will try to restart it. The recovered trainer will fetch tasks from the master and continue training.
-When a trainer fails, Kuberentes would try to restart it. The recovered trainer would fetch tasks from the TODO queue and go on training.
 ### Parameter Server Process

@@ -0,0 +1,101 @@
# Analysis of large model distributed training in Paddle
***NOTE: These are only notes on how we implemented this scheme in V1, not a new design.***
## What is it
We often encounter cases where the embedding layer parameters (which are sparse) are so large that we cannot store them in the trainer's memory during training. So we need to distribute them across several servers and fetch them row by row instead of fetching all of the parameters.
## How to use
Specify command-line arguments like `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1 --use_old_updater=1` when starting the paddle trainer, and also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting the pserver processes.
Accordingly, configure your embedding layers like:
```python
SPARSE_REMOTE=True
w1 = data_layer(name="w1", size=dict_size)
emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
w2 = data_layer(name="w2", size=dict_size)
emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
...
```
## Implementation details
```c++
enum MatType {
MAT_NORMAL,
MAT_NORMAL_SHARED,
MAT_VALUE_SHARED,
MAT_SPARSE_ROW_IDS,
MAT_SPARSE_ROW_AUTO_GROW,
MAT_CACHE_ROW,
MAT_SPARSE_ROW,
MAT_SPARSE_ROW_PREFETCH,
MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
};
```
`MAT_SPARSE_ROW_PREFETCH` is the type we use when configured to fetch only the required rows of the matrix during training.
In `trainer_internal.cpp:L93 trainOneBatch`:
```c++
if (config_->getOptConfig().use_sparse_remote_updater()) {
REGISTER_TIMER("prefetch");
gradientMachine_->prefetch(inArgs);
parameterUpdater_->getParametersRemote();
}
```
When doing the actual network forward and backward passes, at the beginning of each batch the trainer will try to download one row of data from the pserver.
In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
```c++
if (fullSize) {
...
} else {
getParams = [&] {
parameterClient_->getParameterSparse(
/* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
};
applyL1 = [](Parameter& para, real decayRate) {
para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
};
}
```
Calling `parameterClient_->getParameterSparse` makes a remote call to the pserver's `getParameterSparse`:
```c++
void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
std::vector<Buffer>& inputBuffers,
SendParameterResponse* response,
std::vector<Buffer>* outputBuffers) {
(void)inputBuffers;
auto& buffer = *readWriteBuffer_;
size_t numReals = 0;
for (const auto& block : request.blocks()) {
numReals += getParameterConfig(block).dims(1);
}
buffer.resize(numReals);
VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
ReadLockGuard guard(parameterMutex_);
size_t offset = 0;
for (const auto& block : request.blocks()) {
size_t width = getParameterConfig(block).dims(1);
Buffer buf = {buffer.data() + offset, width};
int type = request.send_back_parameter_type();
sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
offset += width;
}
}
```
`getParameterConfig(block).dims(1)` returns the width of the current "parameter block" (a shard of the parameter object),
and the `getParameterSparse` remote call then returns only one row of data to the client.
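To make the row-by-row scheme concrete, here is a toy sketch in Python (purely illustrative; none of these classes or methods exist in Paddle) of what fetching an embedding table by rows from a parameter server looks like:
```python
import numpy as np

EMB_DIM = 32

class ToyPserver:
    """Holds a shard of the embedding table, keyed by row id."""
    def __init__(self, num_rows):
        self.rows = {i: np.random.randn(EMB_DIM).astype("float32")
                     for i in range(num_rows)}

    def get_rows(self, row_ids):
        # Only the requested rows cross the network, not the whole table.
        return {i: self.rows[i] for i in row_ids}

class ToyTrainer:
    def __init__(self, pserver):
        self.pserver = pserver
        self.local_cache = {}

    def prefetch(self, batch_word_ids):
        # Analogous to prefetch + getParametersRemote above: before the
        # forward/backward pass, pull just the embedding rows this batch uses.
        needed = sorted(set(batch_word_ids))
        self.local_cache = self.pserver.get_rows(needed)

pserver = ToyPserver(num_rows=10000)
trainer = ToyTrainer(pserver)
trainer.prefetch([3, 17, 3, 9023])   # a batch touching 3 distinct rows
assert len(trainer.local_cache) == 3
```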

Binary image file changed (55 KiB before, 49 KiB after); contents not shown.

Some files were not shown because too many files have changed in this diff.
