Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into nce_op

release/0.11.0
wanghaoshuang 8 years ago
commit d0246e24e0

@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...

@ -42,7 +42,7 @@ before_install:
script: script:
- | - |
timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
- | - |
if [[ "$JOB" != "build_doc" ]]; then exit 0; fi; if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;

@ -36,8 +36,7 @@ include(simd)
################################ Configurations ####################################### ################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
@ -82,10 +81,8 @@ if(ANDROID OR IOS)
"Disable PYTHON when cross-compiling for Android and iOS" FORCE) "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
set(WITH_RDMA OFF CACHE STRING set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android and iOS" FORCE) "Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKLDNN OFF CACHE STRING set(WITH_MKL OFF CACHE STRING
"Disable MKLDNN when cross-compiling for Android and iOS" FORCE) "Disable MKL when cross-compiling for Android and iOS" FORCE)
set(WITH_MKLML OFF CACHE STRING
"Disable MKLML package when cross-compiling for Android and iOS" FORCE)
# Compile PaddlePaddle mobile inference library # Compile PaddlePaddle mobile inference library
if (NOT WITH_C_API) if (NOT WITH_C_API)
@ -111,6 +108,14 @@ else()
set(THIRD_PARTY_BUILD_TYPE Release) set(THIRD_PARTY_BUILD_TYPE Release)
endif() endif()
set(WITH_MKLML ${WITH_MKL})
if (WITH_MKL AND AVX2_FOUND)
set(WITH_MKLDNN ON)
else()
message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
set(WITH_MKLDNN OFF)
endif()
######################################################################################## ########################################################################################
include(external/mklml) # download mklml package include(external/mklml) # download mklml package
@ -128,6 +133,8 @@ include(external/any) # download libn::any
include(external/eigen) # download eigen3 include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11 include(external/pybind11) # download pybind11
include(external/nccl) include(external/nccl)
include(external/cares)
include(external/grpc)
include(cudnn) # set cudnn libraries, must before configure include(cudnn) # set cudnn libraries, must before configure
include(configure) # add paddle env configuration include(configure) # add paddle env configuration
@ -158,14 +165,15 @@ set(EXTERNAL_LIBS
) )
if(WITH_GPU) if(WITH_GPU)
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) include(cuda)
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
endif(WITH_GPU) endif(WITH_GPU)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
if(WITH_MKLDNN) if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB}) list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif() endif()
if(USE_NNPACK) if(USE_NNPACK)

@ -29,7 +29,7 @@ RUN apt-get update && \
automake locales clang-format swig doxygen cmake \ automake locales clang-format swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \ liblapack-dev liblapacke-dev libboost-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools && \ net-tools libtool && \
apt-get clean -y apt-get clean -y
# Install Go and glide # Install Go and glide

@ -12,11 +12,11 @@ Machine:
System: CentOS release 6.3 (Final), Docker 1.12.1.

PaddlePaddle: paddlepaddle/paddle:latest (for MKLML and MKL-DNN), paddlepaddle/paddle:latest-openblas (for OpenBLAS)
- MKL-DNN tag v0.11
- MKLML 2018.0.1.20171007
- OpenBLAS v0.2.20

(TODO: will rerun after 0.11.0)

On each machine, we will test and compare the performance of training on a single node using MKL-DNN / MKLML / OpenBLAS respectively.
@ -31,17 +31,37 @@ Input image size - 3 * 224 * 224, Time: images/second
| BatchSize | 64     | 128    | 256    |
|-----------|--------|--------|--------|
| OpenBLAS  | 7.80   | 9.00   | 10.80  |
| MKLML     | 12.12  | 13.70  | 16.18  |
| MKL-DNN   | 28.46  | 29.83  | 30.44  |

chart on batch size 128
TBD

- ResNet-50

| BatchSize | 64     | 128    | 256    |
|-----------|--------|--------|--------|
| OpenBLAS  | 25.22  | 25.68  | 27.12  |
| MKLML     | 32.52  | 31.89  | 33.12  |
| MKL-DNN   | 81.69  | 82.35  | 84.08  |

chart on batch size 128
TBD

- GoogLeNet

| BatchSize | 64     | 128    | 256    |
|-----------|--------|--------|--------|
| OpenBLAS  | 89.52  | 96.97  | 108.25 |
| MKLML     | 128.46 | 137.89 | 158.63 |
| MKL-DNN   | 250.46 | 264.83 | 269.50 |

chart on batch size 128
TBD

### Laptop
TBD

### Desktop

@ -5,6 +5,7 @@ height = 224
width = 224 width = 224
num_class = 1000 num_class = 1000
batch_size = get_config_arg('batch_size', int, 128) batch_size = get_config_arg('batch_size', int, 128)
use_gpu = get_config_arg('use_gpu', bool, True)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2( define_py_data_sources2(
@ -16,6 +17,8 @@ settings(
learning_method=MomentumOptimizer(0.9), learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size)) regularization=L2Regularization(0.0005 * batch_size))
conv_projection = conv_projection if use_gpu else img_conv_layer
def inception2(name, input, channels, \ def inception2(name, input, channels, \
filter1, filter1,
filter3R, filter3, filter3R, filter3,
@ -138,7 +141,7 @@ def inception(name, input, channels, \
cat = concat_layer( cat = concat_layer(
name=name, name=name,
input=[cov1, cov3, cov5, covprj], input=[cov1, cov3, cov5, covprj],
bias_attr=True, bias_attr=True if use_gpu else False,
act=ReluActivation()) act=ReluActivation())
return cat return cat

@ -1,9 +1,7 @@
set -e set -e
function train() { function train() {
unset OMP_NUM_THREADS MKL_NUM_THREADS unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
export OMP_DYNAMIC="FALSE"
export KMP_AFFINITY="granularity=fine,compact,0,0"
topology=$1 topology=$1
layer_num=$2 layer_num=$2
bs=$3 bs=$3
@ -14,8 +12,6 @@ function train() {
elif [ $4 == "False" ]; then elif [ $4 == "False" ]; then
thread=`nproc` thread=`nproc`
# each trainer_count use only 1 core to avoid conflict # each trainer_count use only 1 core to avoid conflict
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log" log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
else else
echo "Wrong input $3, use True or False." echo "Wrong input $3, use True or False."
@ -44,6 +40,7 @@ fi
for use_mkldnn in True False; do for use_mkldnn in True False; do
for batchsize in 64 128 256; do for batchsize in 64 128 256; do
train vgg 19 $batchsize $use_mkldnn train vgg 19 $batchsize $use_mkldnn
train resnet 50 $batchsize $use_mkldnn train resnet 50 $batchsize $use_mkldnn
train googlenet v1 $batchsize $use_mkldnn
done done
done done

@ -76,27 +76,14 @@ else()
include_directories(${CUDA_TOOLKIT_INCLUDE}) include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU) endif(NOT WITH_GPU)
if(WITH_MKLDNN) if (WITH_MKLML AND MKLML_IOMP_LIB)
add_definitions(-DPADDLE_USE_MKLDNN) message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
if (WITH_MKLML AND MKLDNN_IOMP_DIR) set(OPENMP_FLAGS "-fopenmp")
message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}") set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(OPENMP_FLAGS "-fopenmp") set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
else()
find_package(OpenMP)
if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
else()
message(WARNING "Can not find OpenMP."
"Some performance features in MKLDNN may not be available")
endif()
endif()
endif(WITH_MKLDNN)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")

@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
# Set the architecture for iOS # Set the architecture for iOS
if(NOT DEFINED IOS_ARCH) if(NOT DEFINED IOS_ARCH)
if(IOS_PLATFORM STREQUAL "OS") if(IOS_PLATFORM STREQUAL "OS")
# FIXME(liuyiqun): support "armv7;armv7s;arm64" future set(IOS_ARCH "armv7;armv7s;arm64")
set(IOS_ARCH "arm64")
elseif(IOS_PLATFORM STREQUAL "SIMULATOR") elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
# FIXME(liuyiqun): support "i386;x86_64" future set(IOS_ARCH "i386;x86_64")
set(IOS_ARCH "x86_64")
endif() endif()
endif() endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_
# Hidden visibilty is required for cxx on iOS # Hidden visibilty is required for cxx on iOS
set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first") set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")

@ -0,0 +1,188 @@
if(NOT WITH_GPU)
return()
endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
######################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
# detect_installed_gpus(out_variable)
function(detect_installed_gpus out_variable)
if(NOT CUDA_gpu_detect_output)
set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
file(WRITE ${cufile} ""
"#include <cstdio>\n"
"int main() {\n"
" int count = 0;\n"
" if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
" if (count == 0) return -1;\n"
" for (int device = 0; device < count; ++device) {\n"
" cudaDeviceProp prop;\n"
" if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
" std::printf(\"%d.%d \", prop.major, prop.minor);\n"
" }\n"
" return 0;\n"
"}\n")
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
"--run" "${cufile}"
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(nvcc_res EQUAL 0)
# only keep the last line of nvcc_out
STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
list(GET nvcc_out -1 nvcc_out)
string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architectures from detect_installed_gpus tool" FORCE)
endif()
endif()
if(NOT CUDA_gpu_detect_output)
message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
else()
set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
endif()
endfunction()
########################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
# Usage:
# select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable)
# List of arch names
set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual")
set(archs_name_default "All")
if(NOT CMAKE_CROSSCOMPILING)
list(APPEND archs_names "Auto")
endif()
# set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
mark_as_advanced(CUDA_ARCH_NAME)
# verify CUDA_ARCH_NAME value
if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
string(REPLACE ";" ", " archs_names "${archs_names}")
message(FATAL_ERROR "Only ${archs_names} architecture names are supported.")
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Manual")
set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
else()
unset(CUDA_ARCH_BIN CACHE)
unset(CUDA_ARCH_PTX CACHE)
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
set(cuda_arch_bin "30 35")
elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
set(cuda_arch_bin "50")
elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
set(cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
detect_installed_gpus(cuda_arch_bin)
else() # (${CUDA_ARCH_NAME} STREQUAL "Manual")
set(cuda_arch_bin ${CUDA_ARCH_BIN})
endif()
# remove dots and convert to lists
string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}")
list(REMOVE_DUPLICATES cuda_arch_bin)
list(REMOVE_DUPLICATES cuda_arch_ptx)
set(nvcc_flags "")
set(nvcc_archs_readable "")
# Tell NVCC to add binaries for the specified GPUs
foreach(arch ${cuda_arch_bin})
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified PTX for the concrete BIN
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
else()
# User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
list(APPEND nvcc_archs_readable sm_${arch})
endif()
endforeach()
# Tell NVCC to add PTX intermediate code for the specified architectures
foreach(arch ${cuda_arch_ptx})
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
list(APPEND nvcc_archs_readable compute_${arch})
endforeach()
string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
set(${out_variable} ${nvcc_flags} PARENT_SCOPE)
set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
endfunction()
message(STATUS "CUDA detected: " ${CUDA_VERSION})
if (${CUDA_VERSION} LESS 7.0)
set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
endif()
include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
# setting nvcc arch flags
select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
# Set C++11 support
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)

@ -0,0 +1,45 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
# NOTE: c-ares is needed when linking with grpc.
SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares)
SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares)
SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE)
ExternalProject_Add(
extern_cares
GIT_REPOSITORY "https://github.com/c-ares/c-ares.git"
GIT_TAG "cares-1_13_0"
PREFIX ${CARES_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
BUILD_IN_SOURCE 1
BUILD_COMMAND make
INSTALL_COMMAND make install
)
ADD_LIBRARY(cares STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION
"${CARES_INSTALL_DIR}/lib/libcares.a")
include_directories(${CARES_INCLUDE_DIR})
ADD_DEPENDENCIES(cares extern_cares)

@ -28,15 +28,8 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
ExternalProject_Add( ExternalProject_Add(
extern_gflags extern_gflags
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
# TODO(yiwang): The annoying warnings mentioned in GIT_REPOSITORY "https://github.com/gflags/gflags.git"
# https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a
# gflags. I fired a PR https://github.com/gflags/gflags/pull/230
# to fix it. Before it gets accepted by the gflags team, we use
# my personal fork, which contains above fix, temporarily. Let's
# change this back to the official Github repo once my PR is
# merged.
GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git"
GIT_TAG 986964c07427ecb9cdb5bd73f73ebbd40e54dadb
PREFIX ${GFLAGS_SOURCES_DIR} PREFIX ${GFLAGS_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

@ -0,0 +1,58 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
GIT_REPOSITORY "https://github.com/grpc/grpc.git"
GIT_TAG "v1.7.x"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
BUILD_COMMAND make
INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
)
# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc++.a")
ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgpr.a")
ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a")
include_directories(${GRPC_INCLUDE_DIR})
ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)

@ -40,10 +40,9 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
IF(${CBLAS_PROVIDER} STREQUAL "MKLML") IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
SET(MKLDNN_MKLROOT ${MKLML_ROOT}) MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB}) ELSE()
SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR}) MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
ENDIF() ENDIF()
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
@ -57,15 +56,16 @@ ExternalProject_Add(
PREFIX ${MKLDNN_SOURCES_DIR} PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT} CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-DMKLROOT:PATH=${MKLDNN_MKLROOT} -DMKLROOT:PATH=${MKLML_ROOT}
) )
ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}") MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
add_definitions(-DPADDLE_USE_MKLDNN)
LIST(APPEND external_project_dependencies mkldnn) LIST(APPEND external_project_dependencies mkldnn)

@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND})
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE) CACHE FILEPATH "openblas library." FORCE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER}") SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
IF(CMAKE_CROSSCOMPILING) IF(CMAKE_CROSSCOMPILING)
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND})
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
ENDIF() ENDIF()
ELSEIF(IOS) ELSEIF(IOS)
# FIXME(liuyiqun): support multiple architectures IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
ELSE()
MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
"You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
ENDIF() ENDIF()
ELSEIF(RPI) ELSEIF(RPI)
# use hardfp # use hardfp

@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
IF(MOBILE_INFERENCE)
return()
ENDIF()
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)

@ -50,6 +50,8 @@ ExternalProject_Add(
) )
LIST(APPEND external_project_dependencies zlib) LIST(APPEND external_project_dependencies zlib)
ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
IF(WITH_C_API) IF(WITH_C_API)
INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib) INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)

@ -149,58 +149,3 @@ endforeach()
foreach(flag ${GPU_COMMON_FLAGS}) foreach(flag ${GPU_COMMON_FLAGS})
safe_set_nvflag(${flag}) safe_set_nvflag(${flag})
endforeach() endforeach()
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
function(specify_cuda_arch cuda_version cuda_arch)
if(${cuda_version} VERSION_GREATER "8.0")
foreach(capability 61 62)
if(${cuda_arch} STREQUAL ${capability})
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endforeach()
elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53")
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endfunction()
# Common gpu architectures: Kepler, Maxwell
foreach(capability 30 35 50)
list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
endforeach()
if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
endif()
# Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
endif()
# Custom gpu architecture
set(CUDA_ARCH)
if(CUDA_ARCH)
specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
endif()
set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})

@ -459,11 +459,58 @@ function(py_test TARGET_NAME)
if(WITH_TESTING) if(WITH_TESTING)
set(options STATIC static SHARED shared) set(options STATIC static SHARED shared)
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
python2 ${py_test_SRCS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif() endif()
endfunction() endfunction()
# grpc_library generate grpc code using grpc_cpp_plugin and protoc
# then build the generated protobuf code and grpc code with your
# implementation source codes together. Use SRCS argument for your
# implementation source files and PROTO argument for your .proto
# files.
#
# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep)
function(grpc_library TARGET_NAME)
set(oneValueArgs PROTO)
set(multiValueArgs SRCS DEPS)
set(options "")
cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
message(STATUS "generating grpc ${grpc_library_PROTO}")
get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE)
get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
add_custom_command(
OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
--plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
# FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
# as compiler warnings instead of error. Should try remove the warnings also.
set_source_files_properties(
${grpc_grpc_srcs}
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
set_source_files_properties(
${grpc_library_SRCS}
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
endfunction()

@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME)
target_link_libraries(${TARGET_NAME} log) target_link_libraries(${TARGET_NAME} log)
endif(ANDROID) endif(ANDROID)
if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR) if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed") target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif() endif()
add_dependencies(${TARGET_NAME} ${external_project_dependencies}) add_dependencies(${TARGET_NAME} ${external_project_dependencies})
@ -168,17 +168,3 @@ function(create_resources res_file output_file)
COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file} COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file}
DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py) DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py)
endfunction() endfunction()
# Create a python unittest using run_python_tests.sh,
# which takes care of making correct running environment
function(add_python_test TEST_NAME)
foreach(arg ${ARGN})
get_filename_component(py_fn ${arg} NAME_WE)
set(TRG_NAME ${TEST_NAME}_${py_fn})
add_test(NAME ${TRG_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
python2 ${arg}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endforeach()
endfunction()

@ -54,7 +54,7 @@ img_conv
.. _api_v2.layer_context_projection:

context_projection
------------------
.. autoclass:: paddle.v2.layer.context_projection
   :noindex:

@ -70,7 +70,7 @@ Image Pooling Layer

img_pool
--------
.. autoclass:: paddle.v2.layer.img_pool
   :noindex:

spp
---

@ -104,7 +104,7 @@ sum_to_one_norm
---------------
.. autoclass:: paddle.v2.layer.sum_to_one_norm
   :noindex:

cross_channel_norm
------------------
.. autoclass:: paddle.v2.layer.cross_channel_norm

@ -114,7 +114,7 @@ row_l2_norm
-----------
.. autoclass:: paddle.v2.layer.row_l2_norm
   :noindex:

Recurrent Layers
================

@ -335,6 +335,16 @@ bilinear_interp
.. autoclass:: paddle.v2.layer.bilinear_interp
   :noindex:

dot_prod
---------
.. autoclass:: paddle.v2.layer.dot_prod
   :noindex:

out_prod
--------
.. autoclass:: paddle.v2.layer.out_prod
   :noindex:

power
-----
.. autoclass:: paddle.v2.layer.power

@ -372,6 +382,11 @@ cos_sim
.. autoclass:: paddle.v2.layer.cos_sim
   :noindex:

l2_distance
-----------
.. autoclass:: paddle.v2.layer.l2_distance
   :noindex:

trans
-----
.. autoclass:: paddle.v2.layer.trans

@ -400,6 +415,13 @@ multiplex
.. autoclass:: paddle.v2.layer.multiplex
   :noindex:

Factorization Machine Layer
============================

factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
   :noindex:

Slicing and Joining Layers
==========================

@ -36,13 +36,13 @@ Figure 1. PaddlePaddle on IA.
We roughly divide the integration plan into the following aspects.

### CMake
In `CMakeLists.txt` we add a `WITH_MKL` switch for users; it is the master switch that controls both `WITH_MKLML` and `WITH_MKLDNN`.

When `WITH_MKL` is turned on, MKLML is enabled and used as PaddlePaddle's CBLAS and LAPACK library, and Intel OpenMP is enabled to improve the performance of MKLML. If the system supports the AVX2 instruction set or above, MKL-DNN is enabled as well.

When `WITH_MKL` is turned off, both MKLML and MKL-DNN are disabled.

Therefore, we add `mkldnn.cmake` and `mklml.cmake` files under the `cmake/external` directory; they download the corresponding packages while PaddlePaddle is being compiled and place them in PaddlePaddle's third party directory.

### Layers
All MKL-DNN related C++ layers will be placed, following PaddlePaddle's directory structure, in

@ -1,6 +1,6 @@
digraph G {
rnn [label="1st level RNN" shape=box]

subgraph cluster0 {
label = "time step 0"

@ -8,7 +8,7 @@ digraph G {
sent0 [label="sentence"]
sent1 [label="sentence"]

rnn1 [label="2nd level RNN" shape=box]

sent0 -> rnn1
sent1 -> rnn1

@ -20,7 +20,7 @@ digraph G {
sent2 [label="sentence"]
sent3 [label="sentence"]

rnn2 [label="2nd level RNN" shape=box]

sent2 -> rnn2
sent3 -> rnn2

@ -32,7 +32,7 @@ digraph G {
sent4 [label="sentence"]
sent5 [label="sentence"]

rnn3 [label="2nd level RNN" shape=box]

sent4 -> rnn3
sent5 -> rnn3

@ -1,62 +1,62 @@
# RNNOp design

This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future.

## RNN Algorithm Implementation

<p align="center">
<img src="./images/rnn.jpg"/>
</p>

The above diagram shows an RNN unrolled into a full network.

There are several important concepts here:

- *step-net*: the sub-graph that runs at each step.
- *memory*, $h_t$, the state of the current step.
- *ex-memory*, $h_{t-1}$, the state of the previous step.
- *initial memory value*, the memory of the first (initial) step.

### Step-scope

There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.

<p align="center">
<img src="./images/rnn.png"/><br/>
Figure 2 illustrates the RNN's data flow
</p>

Please be aware that every step runs the same step-net. Each step does the following:

1. Creates the step-scope.
2. Initializes the local variables including step-outputs, in the step-scope.
3. Runs the step-net, which uses the above mentioned variables.

The RNN operator will compose its output from step outputs in each of the step scopes.
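
To make these steps concrete, here is a minimal Python sketch of the loop described above. It is illustrative only and not the actual PaddlePaddle runtime; names such as `StepScope`, `run_rnn` and `step_net` are hypothetical.

```python
# Hypothetical sketch of the per-step execution described above.
class StepScope:
    """A plain dict-like container for the local variables of one step."""
    def __init__(self):
        self.vars = {}

def run_rnn(inputs, step_net, init_memory):
    """inputs: list of per-step inputs; step_net: fn(scope, x, ex_memory) -> (output, memory)."""
    scopes, outputs = [], []
    memory = init_memory                          # the "initial memory value"
    for x in inputs:
        scope = StepScope()                       # 1. create the step-scope
        scope.vars["x"] = x                       # 2. realize local variables in the scope
        out, memory = step_net(scope, x, memory)  # 3. run the step-net
        scope.vars["out"] = out
        scopes.append(scope)
        outputs.append(out)
    return outputs                                # composed from the step outputs in the step-scopes
```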
### Memory and Ex-memory

Let's give more details about memory and ex-memory using a simple example:

$$
h_t = U h_{t-1} + W x_t
$$,

where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively.

In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step,
or copy the memory value of the previous step to the current ex-memory variable.
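
A tiny numpy sketch of this recurrence (illustrative only; the sizes and variable names are made up) shows that the ex-memory of step $t$ is simply the memory produced at step $t-1$:

```python
import numpy as np

hidden, feat, steps = 4, 3, 5          # assumed toy dimensions
U = np.random.randn(hidden, hidden)
W = np.random.randn(hidden, feat)
xs = np.random.randn(steps, feat)

h = np.zeros(hidden)                   # initial memory value
for t in range(steps):
    ex_memory = h                      # ex-memory h_{t-1}: refers to the previous memory
    h = U @ ex_memory + W @ xs[t]      # memory h_t of the current step
```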
### Usage in Python

For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).

We can define an RNN's step-net using a Block:

```python
import paddle as pd

X = some_op() # x is some operator's output and is a LoDTensor
a = some_op()

# declare parameters
@ -68,7 +68,7 @@ with rnn.stepnet():
x = rnn.add_input(X)
# declare a memory (rnn's step)
h = rnn.add_memory(init=a)
# h.pre_state(), the previous memory of rnn
new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state()))
# update current memory
h.update(new_state)
@ -80,19 +80,19 @@ out = rnn()

Python API functions in the above example:

- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs.
- `rnn.add_memory`: creates a variable used as the memory.
- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output.

### Nested RNN and LoDTensor

An RNN whose step-net includes other RNN operators is known as a *nested RNN*.

For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level.

The following figure illustrates feeding text into the lower level, one sentence at a step, and feeding the step outputs to the top level. The final top level output is about the whole text.

<p align="center">
<img src="./images/2_level_rnn.png"/>
</p>

@ -110,7 +110,7 @@ a = some_op()
# chapter_data is a set of 128-dim word vectors
# the first level of LoD is sentence
# the second level of LoD is a chapter
chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)

def lower_level_rnn(paragraph):
@ -138,14 +138,14 @@ with top_level_rnn.stepnet():
pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state()))
top_level_rnn.add_outputs(h)

# output the last step
chapter_out = top_level_rnn(output_all_steps=False)
```

In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.

By default, the `RNNOp` will concatenate the outputs from all the time steps.
If `output_all_steps` is set to False, it will only output the final time step.

<p align="center">

@ -1,35 +1,28 @@
# Design: Sequence Decoder Generating LoDTensors

In tasks such as machine translation and visual captioning,
a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time.

This documentation describes how to implement the sequence decoder as an operator.

## Beam Search based Decoder

The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.

In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search. Due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard for users to customize.

There are a lot of heuristic tricks in sequence generation tasks, so the flexibility of the sequence decoder is very important to users.

During the refactoring of PaddlePaddle, some new concepts were proposed, such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md), that can better support sequence usage and also help make the implementation of the beam search based sequence decoder **more transparent and modular**.

For example, the RNN states, candidate IDs and probabilities of beam search can all be represented as `LoDTensors`;
the selected candidates' IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.

## Changing LoD's absolute offset to relative offsets

The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level.

The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**;
let's call this format the **absolute-offset LoD** for clarity.

The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences; for example, a two-level LoD is as follows

```python
[[0, 3, 9]
 [0, 2, 3, 3, 3, 9]]
@ -41,10 +34,9 @@ The first level tells that there are two sequences:
while on the second level, there are several empty sequences that both begin and end at `3`.
It is impossible to tell how many empty second-level sequences exist in the first-level sequences.

There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix.

So let's introduce another format of LoD,
it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD.

For example, to represent the same sequences of the above data
@ -54,19 +46,18 @@ For example, to represent the same sequences of the above data
 [0, 2, 3, 3, 3, 9]]
```

the first level represents that there are two sequences,
their offsets in the second-level LoD are `[0, 3)` and `[3, 5)`.

The second level is the same as in the absolute-offset example because the lower level is a tensor.
It is easy to see that the second sequence in the first-level LoD has two empty sequences.

The following examples are based on relative-offset LoD.
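
As a small illustration (standalone Python, not the PaddlePaddle API), the relative-offset format can be read by resolving a first-level sequence through the second level down to rows of the underlying tensor. The numbers below follow the example above, with an assumed first level of `[0, 3, 5]`:

```python
# Two-level relative-offset LoD: the first level indexes entries of the second level,
# and only the second (lowest) level indexes rows of the underlying tensor.
relative_lod = [[0, 3, 5],            # first level: offsets into the second level
                [0, 2, 3, 3, 3, 9]]   # second level: offsets into the underlying tensor

def top_level_rows(lod, i):
    """Return the (begin, end) tensor rows covered by the i-th first-level sequence."""
    level0, level1 = lod
    begin, end = level0[i], level0[i + 1]
    return level1[begin], level1[end]

print(top_level_rows(relative_lod, 0))  # (0, 3): the first sequence covers rows 0..2
print(top_level_rows(relative_lod, 1))  # (3, 9): the second sequence covers rows 3..8
```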
## Usage in a simple machine translation model

Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it.

The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences.

**Encoder**

```python
@ -117,7 +108,7 @@ def generate():
# which means there are 2 sentences to translate
# - the first sentence has 1 translation prefix, the offsets are [0, 1)
# - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
# the target_word.lod is
# [[0, 1, 6]
# [0, 2, 4, 7, 9 12]]
# which means 2 sentences to translate, each has 1 and 5 prefixes
@ -154,37 +145,36 @@ translation_ids, translation_scores = decoder()
translation_ids, translation_scores = decoder()
```

The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates,
returns the result of the beam search algorithm.

In this way, users can customize anything on the input or output of beam search, for example:

1. Make the corresponding elements in `topk_generated_scores` zero or some small values; beam_search will then discard this candidate.
2. Remove some specific candidates in `selected_ids`.
3. Get the final `translation_ids`, and remove the translation sequence in it.

The implementation of the sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
so the python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).

Both of them are two-level `LoDTensors`:

- The first level represents `batch_size` of (source) sentences.
- The second level represents the candidate ID sets for translation prefix.

For example, if there are 3 source sentences to translate, they might have 2, 3, and 1 candidates respectively.

Unlike an RNN, in the sequence decoder the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state (a sketch of this expansion follows the example below).

For example, the previous state:

* LoD is `[0, 1, 3][0, 2, 5, 6]`
* content of tensor is `a1 a2 b1 b2 b3 c1`

the current state is stored in `encoder_ctx_expanded`:

* LoD is `[0, 2, 7][0 3 5 8 9 11 11]`
* the content is
  - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidate)
  - a2 a2
  - b1 b1 b1
@ -192,54 +182,48 @@ the current state stored in `encoder_ctx_expanded`
  - b3 b3
  - None (c1 has 0 candidates, so c1 is dropped)
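
The following standalone sketch (plain numpy, not the real `lod_expand` operator) mimics the expansion above: each row of the previous state is repeated once per candidate of the corresponding prefix, and a prefix with zero candidates is dropped. The per-prefix counts are read off the second-level LoD `[0 3 5 8 9 11 11]`.

```python
import numpy as np

prev_state = np.arange(6.0).reshape(6, 1)   # one row each for a1 a2 b1 b2 b3 c1
candidates_per_prefix = [3, 2, 3, 1, 2, 0]  # from the offsets [0 3 5 8 9 11 11]

def lod_expand(state, repeats):
    """Repeat each state row repeats[i] times; rows with 0 repeats disappear."""
    return np.repeat(state, repeats, axis=0)

expanded = lod_expand(prev_state, candidates_per_prefix)
# expanded has 3+2+3+1+2+0 = 11 rows, matching the second-level LoD above
print(expanded.shape)  # (11, 1)
```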
The benefit from the relative offset LoD is that the empty candidate set can be represented naturally.

The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is:

```python
decoder.output(selected_ids)
decoder.output(selected_generation_scores)
```

The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences.

Packing the `selected_scores` will get a `LoDTensor` that stores the scores of each translation candidate.

Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.

## LoD and shape changes during decoding

<p align="center">
<img src="./images/LOD-and-shape-changes-during-decoding.jpg"/>
</p>

According to the image above, the only phase that changes the LoD is beam search.

## Beam search design

The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs:

1. `topk_ids`, the top K candidate ids for each prefix.
2. `topk_scores`, the corresponding scores for `topk_ids`.
3. `generated_scores`, the scores of the prefixes.

All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.

It will return three variables:

1. `selected_ids`, the final candidates that the beam search function selected for the next step.
2. `selected_scores`, the scores for the candidates.
3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).

A simplified sketch of one such step is shown below.
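
As a rough illustration of this contract (a standalone sketch on plain Python lists; the real operator works on LoDTensors and keeps per-prefix beams), one beam-search step could keep the `beam_size` best candidates of each prefix and update the prefix scores:

```python
import heapq

def beam_search_step(topk_ids, topk_scores, generated_scores, beam_size=2):
    """topk_ids/topk_scores: per-prefix candidate lists; generated_scores: one score per prefix."""
    selected_ids, selected_scores, new_generated_scores = [], [], []
    for ids, scores, prefix_score in zip(topk_ids, topk_scores, generated_scores):
        # keep the beam_size best candidates of this prefix
        best = heapq.nlargest(beam_size, zip(scores, ids))
        selected_ids.append([i for _, i in best])
        selected_scores.append([s for s, _ in best])
        # updated score of each extended prefix (simple additive, log-prob style update)
        new_generated_scores.append([prefix_score + s for s, _ in best])
    return selected_ids, selected_scores, new_generated_scores

# toy usage: 2 prefixes, each with 3 candidates
ids = [[4, 7, 9], [1, 3, 8]]
scores = [[-0.1, -2.3, -0.7], [-0.5, -0.2, -1.9]]
prefix_scores = [-1.0, -0.4]
print(beam_search_step(ids, scores, prefix_scores))
```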
## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`

The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step,
so it is natural to store them in arrays.

Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`.

The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors.
It needs some extensions to support packing or unpacking an array of `LoDTensors`.
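
A minimal sketch of what such an LoD-aware `Pack`/`UnPack` pair could look like (illustrative numpy only, not the `TensorArray` API): concatenate the per-step tensors and record offsets marking where each step's entries begin and end, so the packed result can be split back.

```python
import numpy as np

def pack(step_tensors):
    """Concatenate per-step tensors and build offsets recording each step's span."""
    offsets = [0]
    for t in step_tensors:
        offsets.append(offsets[-1] + len(t))
    return np.concatenate(step_tensors), offsets

def unpack(packed, offsets):
    """Split the packed tensor back into the original per-step tensors."""
    return [packed[b:e] for b, e in zip(offsets[:-1], offsets[1:])]

# toy usage: selected ids produced at three decoding steps
selected_ids_per_step = [np.array([4, 7]), np.array([1, 3, 8]), np.array([2])]
packed, lod = pack(selected_ids_per_step)
assert [t.tolist() for t in unpack(packed, lod)] == [[4, 7], [1, 3, 8], [2]]
```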
