Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_bn_eq
commit 90e05a4b8c
@@ -0,0 +1,188 @@
if(NOT WITH_GPU)
  return()
endif()

set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")

######################################################################################
# A function for automatic detection of installed GPUs (if autodetection is enabled)
# Usage:
#   detect_installed_gpus(out_variable)
function(detect_installed_gpus out_variable)
  if(NOT CUDA_gpu_detect_output)
    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)

    file(WRITE ${cufile} ""
      "#include <cstdio>\n"
      "int main() {\n"
      "  int count = 0;\n"
      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
      "  if (count == 0) return -1;\n"
      "  for (int device = 0; device < count; ++device) {\n"
      "    cudaDeviceProp prop;\n"
      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
      "  }\n"
      "  return 0;\n"
      "}\n")
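    # Note: the probe above includes only <cstdio> yet calls the CUDA runtime
    # API; this compiles because nvcc implicitly includes cuda_runtime.h when
    # compiling .cu files. A hedged manual sanity check of the generated file
    # (the device output shown is illustrative, not from the source):
    #   nvcc --run detect_cuda_archs.cu   # e.g. prints "6.1 6.1" on two sm_61 cards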
    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
                    "--run" "${cufile}"
                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

    if(nvcc_res EQUAL 0)
      # Only keep the last line of nvcc_out.
      string(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
      string(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
      list(GET nvcc_out -1 nvcc_out)
      string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL
          "Returned GPU architectures from detect_installed_gpus tool" FORCE)
    endif()
  endif()

  if(NOT CUDA_gpu_detect_output)
    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
  else()
    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
  endif()
endfunction()
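
# A minimal usage sketch of the helper above (the variable name is an
# assumption for illustration, not part of this file): detected_archs ends up
# holding e.g. "6.1" on a Pascal box, or ${paddle_known_gpu_archs} on failure.
#   detect_installed_gpus(detected_archs)
#   message(STATUS "Building CUDA kernels for: ${detected_archs}")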

########################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
# Usage:
#   select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable)
  # List of arch names. ("Volta" is added here so the dispatch below can
  # actually be reached; the original list omitted it.)
  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual")
  set(archs_name_default "All")
  if(NOT CMAKE_CROSSCOMPILING)
    list(APPEND archs_names "Auto")
  endif()

  # Set CUDA_ARCH_NAME strings (so it will be shown as a drop-down list in cmake-gui).
  set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
  set_property(CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names})
  mark_as_advanced(CUDA_ARCH_NAME)

  # Verify the CUDA_ARCH_NAME value.
  if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
    string(REPLACE ";" ", " archs_names "${archs_names}")
    message(FATAL_ERROR "Only ${archs_names} architecture names are supported.")
  endif()

  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
    set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING
        "Specify 'real' GPU architectures to build binaries for; the BIN(PTX) format is supported")
    set(CUDA_ARCH_PTX "50" CACHE STRING
        "Specify 'virtual' PTX architectures to build PTX intermediate code for")
    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
  else()
    unset(CUDA_ARCH_BIN CACHE)
    unset(CUDA_ARCH_PTX CACHE)
  endif()

  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
    set(cuda_arch_bin "30 35")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
    set(cuda_arch_bin "50")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
    set(cuda_arch_bin "60 61")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
    set(cuda_arch_bin "70")
  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
    set(cuda_arch_bin ${paddle_known_gpu_archs})
  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
    detect_installed_gpus(cuda_arch_bin)
  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
    set(cuda_arch_bin ${CUDA_ARCH_BIN})
  endif()

  # Remove dots and convert to lists.
  string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
  string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
  string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
  string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}")
  list(REMOVE_DUPLICATES cuda_arch_bin)
  list(REMOVE_DUPLICATES cuda_arch_ptx)

  set(nvcc_flags "")
  set(nvcc_archs_readable "")

  # Tell NVCC which binaries (SASS) to embed for the specified GPUs.
  foreach(arch ${cuda_arch_bin})
    if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
      # User explicitly specified the PTX for this concrete BIN, e.g. "52(50)".
      list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
      list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
    else()
      # User didn't explicitly specify the PTX for this concrete BIN; assume PTX=BIN.
      list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
      list(APPEND nvcc_archs_readable sm_${arch})
    endif()
  endforeach()

  # Tell NVCC to add PTX intermediate code for the specified virtual architectures.
  foreach(arch ${cuda_arch_ptx})
    list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
    list(APPEND nvcc_archs_readable compute_${arch})
  endforeach()

  string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
  set(${out_variable} ${nvcc_flags} PARENT_SCOPE)
  set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
endfunction()
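
# To make the mapping concrete, an illustrative sketch (architecture choice
# and variable names here are assumptions, not from this file): with
# CUDA_ARCH_NAME=Pascal, cuda_arch_bin is "60 61", so the call below returns
# roughly
#   -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61
# plus the readable summary "sm_60 sm_61" in arch_flags_readable.
#   select_nvcc_arch_flags(arch_flags)
#   message(STATUS "nvcc arch flags: ${arch_flags} (${arch_flags_readable})")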

message(STATUS "CUDA detected: ${CUDA_VERSION}")
if(${CUDA_VERSION} VERSION_LESS 7.0)
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
elseif(${CUDA_VERSION} VERSION_LESS 8.0)  # CUDA 7.x
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
elseif(${CUDA_VERSION} VERSION_LESS 9.0)  # CUDA 8.x
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
  # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
  # warning for now.
  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
endif()

include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
  list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif()

# Set the nvcc arch flags.
select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")

# Do not propagate host flags; set C++11 support explicitly below.
set(CUDA_PROPAGATE_HOST_FLAGS OFF)

# Release/Debug flags (such as -O3, -g, -DNDEBUG) are already set by CMake,
# so don't set those flags here.
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
# Set --expt-relaxed-constexpr to suppress Eigen warnings.
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")

if(CMAKE_BUILD_TYPE STREQUAL "Debug")
  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()

mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
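# A throwaway diagnostic one could append while debugging the flag assembly
# above (_nvcc_flags_str is a hypothetical name, not part of the file):
#   string(REPLACE ";" " " _nvcc_flags_str "${CUDA_NVCC_FLAGS}")
#   message(STATUS "Final CUDA_NVCC_FLAGS: ${_nvcc_flags_str}")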
File diff suppressed because it is too large
@@ -1,179 +0,0 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <vector>

#include "DataFormat.pb.h"
#include "paddle/utils/Stat.h"

#include "DataProvider.h"
#include "ProtoReader.h"

namespace paddle {

/**
 * @brief Provides data from a protobuf data file, with each sample
 *        specified by a proto message.
 *
 * DataSample is defined in DataFormat.proto.
 *
 * The file format is:
 *
 *    header
 *    sample1
 *    sample2
 *    ...
 *    sampleN
 *
 * @note In the data file, each message is prefixed with its length.
 *       The read/write of the protobuf is implemented in ProtoReader.h.
 */
class ProtoDataProvider : public DataProvider {
public:
  ProtoDataProvider(const DataConfig& config,
                    bool useGpu,
                    bool loadDataAll = true);
  virtual void reset();

  /**
   * @note This size includes the sequences which are skipped because they
   *       are longer than the batch size.
   */
  virtual int64_t getSize() {
    int64_t size = sampleNums_;
    if (usageRatio_ < 1.0f) {
      size = static_cast<int64_t>(size * usageRatio_);
    }
    return size;
  }
  virtual void shuffle();

  void loadData(const std::vector<std::string>& fileList);

  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);

protected:
  /**
   * @brief Load protobuf data from a list of files.
   * @param[in] fileName  name of a file which contains a list of file names
   */
  void loadData(const std::string& fileName);

  /**
   * @brief Load protobuf data from a file.
   * @param[in] fileName  data file name
   */
  void loadDataFile(const std::string& fileName);

  /**
   * @brief Check the data header of each data sample.
   * @param[in] header  data header read from the protobuf data
   */
  void checkDataHeader(const DataHeader& header);

  /**
   * @brief Fill protobuf data into slots_, a vector of ProtoSlot in memory.
   * @param[in] sample  data sample read from the protobuf data
   */
  void fillSlots(const DataSample& sample);

  /**
   * @brief Return true if each sample is one sequence, i.e., independent
   *        of other samples.
   */
  inline bool iidData() const { return sequenceStartPositions_.empty(); }

  /**
   * @brief Check that the sample is consistent with header_.
   */
  void checkSample(const DataSample& sample);

  template <class Op>
  int64_t sequenceLoop(Op op, int64_t size);

  template <class Op>
  int64_t sampleLoop(Op op, int64_t size);

  template <class Op>
  int64_t subSampleLoop(Op op, int64_t size, int slot);

  void showDataStats();

protected:
  struct ProtoVarSlot {
    std::vector<real> data;
    std::vector<int> dims;
  };

  struct ProtoSlot {
    SlotDef::SlotType type;
    int dim;
    std::vector<int> indexData;
    std::vector<real> denseData;
    std::vector<sparse_non_value_t> sparseNonValueData;
    std::vector<sparse_float_value_t> sparseFloatValueData;
    std::vector<int64_t> indices;
    std::vector<int64_t> subIndices;

    std::vector<ProtoVarSlot> varDenseData;
    std::vector<std::vector<int>> varIndices;
    std::vector<std::string> strData;
  };
  DataHeader header_;
  int numVecSlots_;

  std::vector<ProtoSlot> slots_;
  size_t sampleNums_;

  /**
   * The starting position of each sequence in samples.
   * The last element should be the number of samples.
   * If empty, each sample is one sequence.
   */
  std::vector<size_t> sequenceStartPositions_;

  int64_t currentSequenceIndex_;

  // The size should be the number of sequences.
  std::vector<size_t> shuffledSequenceIds_;

  ThreadLocalD<DataBatch> cpuBatch_;
  ThreadLocalD<DataBatch> gpuBatch_;

  RWLock lock_;
  std::vector<StatPtr> nnzStats_;  // stats for the number of non-zero entries
};

/**
 * @brief Special use for proto data: instances should contain sparse-non-value
 *        slots and label.
 *
 * @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE.
 */
class ProtoSequenceDataProvider : public ProtoDataProvider {
public:
  ProtoSequenceDataProvider(const DataConfig& config,
                            bool useGpu,
                            bool loadDataAll = true);
  ~ProtoSequenceDataProvider() {}
  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
};

}  // namespace paddle
@@ -0,0 +1,97 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "Layer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"

namespace paddle {

/**
 * @brief A layer for computing the dot product of two vectors.
 * Input1: vector (batchSize * dim)
 * Input2: vector (batchSize * dim)
 * Output: a matrix (batchSize * 1)
 */

class DotProdLayer : public Layer {
public:
  explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}

  ~DotProdLayer() {}

  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;

  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback = nullptr) override;
};

REGISTER_LAYER(dot_prod, DotProdLayer);

bool DotProdLayer::init(const LayerMap& layerMap,
                        const ParameterMap& parameterMap) {
  Layer::init(layerMap, parameterMap);

  CHECK_EQ(inputLayers_.size(), 2U);
  CHECK_EQ(1UL, getSize())
      << "The output dimensionality of this layer should be fixed to 1.";

  return true;
}

void DotProdLayer::forward(PassType passType) {
  Layer::forward(passType);

  MatrixPtr inV0 = getInputValue(0);
  MatrixPtr inV1 = getInputValue(1);

  size_t batchSize = inV0->getHeight();
  CHECK_EQ(inV1->getHeight(), batchSize);
  CHECK_EQ(inV0->getWidth(), inV1->getWidth());

  {
    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
    reserveOutput(batchSize, 1);
  }

  MatrixPtr outV = getOutputValue();
  {
    REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
    // outV[b] = sum_d inV0[b][d] * inV1[b][d]
    outV->sumOfProducts(*inV0, *inV1, 1, 0);
  }
}

void DotProdLayer::backward(const UpdateCallback& callback) {
  MatrixPtr inV0 = getInputValue(0);
  MatrixPtr inV1 = getInputValue(1);
  MatrixPtr outG = getOutputGrad();
  MatrixPtr inG0 = getInputGrad(0);
  MatrixPtr inG1 = getInputGrad(1);

  {
    REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());

    if (inG0) {
      // dL/dinV0[b][d] += outG[b] * inV1[b][d]
      inG0->addRowScale(0, *inV1, *outG);
    }

    if (inG1) {
      // dL/dinV1[b][d] += outG[b] * inV0[b][d]
      inG1->addRowScale(0, *inV0, *outG);
    }
  }
}

}  // namespace paddle
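// A standalone C++ sketch (not Paddle code) of the arithmetic DotProdLayer
// performs; the shapes and values are made up for illustration. Forward:
// out[b] = sum_d x[b][d] * y[b][d] (sumOfProducts above). Backward: each
// input's gradient is the other input scaled row-wise by the output gradient
// (addRowScale above).
#include <cstdio>

int main() {
  const int batch = 2, dim = 3;
  float x[2][3] = {{1, 2, 3}, {4, 5, 6}};
  float y[2][3] = {{1, 0, 1}, {0.5f, 0.5f, 0.5f}};
  float outGrad[2] = {1.0f, 1.0f};  // assume dL/dout = 1 for both rows

  for (int b = 0; b < batch; ++b) {
    float out = 0;
    for (int d = 0; d < dim; ++d) out += x[b][d] * y[b][d];  // forward
    std::printf("out[%d] = %g, dL/dx[%d] =", b, out, b);
    for (int d = 0; d < dim; ++d)
      std::printf(" %g", outGrad[b] * y[b][d]);  // backward w.r.t. x
    std::printf("\n");
  }
  return 0;
}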
@@ -0,0 +1,91 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "L2DistanceLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"

namespace paddle {

REGISTER_LAYER(l2_distance, L2DistanceLayer);

bool L2DistanceLayer::init(const LayerMap& layerMap,
                           const ParameterMap& parameterMap) {
  /* Initialize the basic parent class. */
  Layer::init(layerMap, parameterMap);

  CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and "
                                     << "only two inputs.";
  CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer "
                           << "is fixed to be 1.";

  return true;
}

void L2DistanceLayer::forward(PassType passType) {
  Layer::forward(passType);

  const auto inV1 = getInputValue(0);
  const auto inV2 = getInputValue(1);

  CHECK(inV1 && inV2);
  CHECK_EQ(inV1->getHeight(), inV2->getHeight())
      << "The height of two inputs of this layer must be the same.";
  CHECK_EQ(inV1->getWidth(), inV2->getWidth())
      << "The width of two inputs of this layer must be the same.";

  int batchSize = inV1->getHeight();
  int output_dim = getSize();
  {
    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
    reserveOutput(batchSize, output_dim);
    auto outV = getOutputValue();
    CHECK(outV) << "The output matrix should not be null.";

    Matrix::resizeOrCreate(
        inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_);

    // inputSub_ = inV1 - inV2; outV = sqrt(rowSum(inputSub_ .* inputSub_)).
    inputSub_->assign(*inV1);
    inputSub_->sub(*inV2);
    outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0);
    outV->sqrt2(*outV);
  }
}

void L2DistanceLayer::backward(const UpdateCallback& callback) {
  const auto outG = getOutputGrad();
  const auto outV = getOutputValue();
  CHECK(outG && outV);

  auto inGrad1 = getInputGrad(0);
  auto inGrad2 = getInputGrad(1);

  {
    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());

    if (inGrad1 || inGrad2) {
      // Reuse outV to hold outG / distance, the scale factor shared by both
      // input gradients.
      outV->scalarDiv(*outV, 1.);
      outV->dotMul(*outG, *outV);
    }

    // dL/dinV1 = (inV1 - inV2) * outG / distance.
    if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV);

    // dL/dinV2 = -(inV1 - inV2) * outG / distance.
    if (inGrad2) {
      inputSub_->mulScalar(-1.);
      inGrad2->addRowScale(0, *inputSub_, *outV);
    }
  }
}

}  // namespace paddle
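For reference, the backward pass above is just the derivative of the Euclidean distance; a short derivation (standard calculus, not taken from the diff):

d(\mathbf{x}, \mathbf{y}) = \sqrt{\sum_{i=1}^{D}(x_i - y_i)^2}
\quad\Longrightarrow\quad
\frac{\partial d}{\partial x_i} = \frac{x_i - y_i}{d}, \qquad
\frac{\partial d}{\partial y_i} = -\frac{x_i - y_i}{d}

Hence the code computes outG / d once (scalarDiv followed by dotMul) and row-scales inputSub_ = x - y by it, flipping the sign of inputSub_ for the second input. Note the division implicitly assumes a non-zero distance.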
@@ -0,0 +1,52 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "Layer.h"
#include "paddle/math/Matrix.h"

namespace paddle {

/**
 * @brief The layer calculates the L2 distance between two input vectors.
 * \f[
 * f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D (x_i - y_i)^2}
 * \f]
 *
 * - Input1: A vector (batchSize * dataDim)
 * - Input2: A vector (batchSize * dataDim)
 * - Output: A vector (batchSize * 1)
 *
 * The configuration api is: l2_distance_layer.
 */

class L2DistanceLayer : public Layer {
public:
  explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
  ~L2DistanceLayer() {}

  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;

  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback = nullptr) override;

private:
  // Stores the result of subtracting Input2 from Input1 in the forward pass,
  // which is reused in the backward pass.
  MatrixPtr inputSub_;
};

}  // namespace paddle
Some files were not shown because too many files have changed in this diff