From 5030681c36e9e9497f3c45cdbd451c8739bdba1f Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Thu, 8 Mar 2018 20:41:31 +0800
Subject: [PATCH 01/58] add MKL for fluid static and shared library

---
 cmake/external/mklml.cmake | 2 +-
 cmake/inference_lib.cmake  | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 739a910c7c..f24cb2d11b 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -34,7 +34,7 @@ SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR "mklml")
 SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
 SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
-SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER})
+SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
 SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
 SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
 SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 6b2237b858..fb81498fd6 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -69,6 +69,12 @@ if(NOT CBLAS_FOUND)
     SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
     DSTS ${dst_dir} ${dst_dir}
   )
+else()
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
+  copy(mklml_lib
+    SRCS ${MKLML_LIB_DIR} ${MKLML_INC_DIR}
+    DSTS ${dst_dir} ${dst_dir}
+  )
 endif()

 # paddle fluid module
From bc0cfb2283633b65669be1d8f7a7f2040d6726f2 Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Thu, 8 Mar 2018 20:42:16 +0800
Subject: [PATCH 02/58] remove PADDLE_USE_ATLAS

---
 paddle/fluid/operators/math/math_function.h |  7 -------
 paddle/math/MathFunctions.cpp               | 15 ++++-----------
 paddle/math/MathFunctions.h                 |  2 +-
 3 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h
index 47e2386d05..cdbc7bfb37 100644
--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -19,13 +19,6 @@ limitations under the License. */
 #include <mkl_vml_functions.h>
 #endif

-#ifdef PADDLE_USE_ATLAS
-extern "C" {
-#include <cblas.h>
-#include <clapack.h>
-}
-#endif
-
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
 #include <lapacke.h>
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index b2ff4bc323..de404cad89 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -59,17 +59,10 @@ void* lapack_dso_handle = nullptr;
 } __name;  // struct DynLoad__##__name
 #endif

-#ifdef PADDLE_USE_ATLAS
-  #define PADDLE_SGETRF clapack_sgetrf
-  #define PADDLE_DGETRF clapack_dgetrf
-  #define PADDLE_SGETRI clapack_sgetri
-  #define PADDLE_DGETRI clapack_dgetri
-#else
-  #define PADDLE_SGETRF LAPACKE_sgetrf
-  #define PADDLE_DGETRF LAPACKE_dgetrf
-  #define PADDLE_SGETRI LAPACKE_sgetri
-  #define PADDLE_DGETRI LAPACKE_dgetri
-#endif
+#define PADDLE_SGETRF LAPACKE_sgetrf
+#define PADDLE_DGETRF LAPACKE_dgetrf
+#define PADDLE_SGETRI LAPACKE_sgetri
+#define PADDLE_DGETRI LAPACKE_dgetri

 #define LAPACK_ROUTINE_EACH(__macro) \
   __macro(PADDLE_SGETRF) \
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index f4cf6bd6c2..f3d8b1a39e 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include <mkl_vml_functions.h>
 #endif

-#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB)
+#if defined(PADDLE_USE_VECLIB)
 extern "C" {
 #include <cblas.h>
 #include <clapack.h>
From e42b8f8a11c344173c6d276fbdfdef1f13c17d19 Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Tue, 13 Mar 2018 16:03:26 +0800
Subject: [PATCH 03/58] fix mklml install path

---
 cmake/external/mklml.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index f24cb2d11b..df3f0c7f0c 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -46,7 +46,7 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
 FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
   "PROJECT(MKLML)\n"
   "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${MKLML_VER}\n"
+  "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
   "        DESTINATION ${MKLML_DST_DIR})\n")

 ExternalProject_Add(
From 45c988d86a43bf34667ce7110972fff8dcaf20de Mon Sep 17 00:00:00 2001
From: sabreshao
Date: Fri, 16 Mar 2018 17:27:19 +0800
Subject: [PATCH 04/58] Demonstration of CMake refinement for HIP support.

1. Add option WITH_AMD_GPU.
2. Add cmake/hip.cmake for the HIP toolchain.
3. Some external modules, such as eigen, may need a HIP port.
4. Add macros hip_library/hip_binary/hip_test to cmake/generic.cmake.
5. Add one HIP source, concat.hip.cu, as an example; each .cu may have its
   corresponding .hip.cu.
---
 CMakeLists.txt                             |   9 +
 cmake/configure.cmake                      |  15 +-
 cmake/external/eigen.cmake                 |  43 +++-
 cmake/generic.cmake                        |  76 ++++++
 cmake/hip.cmake                            |  46 ++++
 paddle/fluid/operators/CMakeLists.txt      |   3 +
 paddle/fluid/operators/math/CMakeLists.txt |   6 +
 paddle/fluid/operators/math/concat.hip.cu  | 281 +++++++++++++++++++++
 paddle/fluid/pybind/CMakeLists.txt         |  21 +-
 paddle/scripts/docker/build.sh             |   4 +
 10 files changed, 477 insertions(+), 27 deletions(-)
 create mode 100644 cmake/hip.cmake
 create mode 100644 paddle/fluid/operators/math/concat.hip.cu
 mode change 100644 => 100755 paddle/scripts/docker/build.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ec65bac84..399bf50748 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,7 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
+option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
 option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
 option(WITH_MKL "Compile PaddlePaddle with MKL support." 
${AVX_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) @@ -69,6 +70,9 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() +if(WITH_AMD_GPU) +endif() + if(ANDROID OR IOS) if(ANDROID) if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") @@ -180,6 +184,11 @@ if(WITH_GPU) include(cuda) endif(WITH_GPU) +if(WITH_AMD_GPU) + find_package(HIP) + include(hip) +endif(WITH_AMD_GPU) + if(WITH_MKLML) list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB}) endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 0f76f55270..f726405c47 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -57,11 +57,7 @@ if(NOT WITH_GOLANG) add_definitions(-DPADDLE_WITHOUT_GOLANG) endif(NOT WITH_GOLANG) -if(NOT WITH_GPU) - add_definitions(-DHPPL_STUB_FUNC) - - list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) -else() +if(WITH_GPU) add_definitions(-DPADDLE_WITH_CUDA) FIND_PACKAGE(CUDA REQUIRED) @@ -84,7 +80,14 @@ else() # Include cuda and cudnn include_directories(${CUDNN_INCLUDE_DIR}) include_directories(${CUDA_TOOLKIT_INCLUDE}) -endif(NOT WITH_GPU) +elseif(WITH_AMD_GPU) + add_definitions(-DPADDLE_WITH_HIP) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__") +else() + add_definitions(-DHPPL_STUB_FUNC) + list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) +endif() if (WITH_MKLML AND MKLML_IOMP_LIB) message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 6a701e076c..5d88c5a0b0 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -1,21 +1,36 @@ INCLUDE(ExternalProject) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) -SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) -INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) -ExternalProject_Add( - extern_eigen3 - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" - GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10 - PREFIX ${EIGEN_SOURCE_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) +INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) + +if(WITH_AMD_GPU) + ExternalProject_Add( + extern_eigen3 + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" + GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 + PREFIX ${EIGEN_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) +else() + ExternalProject_Add( + extern_eigen3 + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" + GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10 + PREFIX ${EIGEN_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) +endif() if (${CMAKE_VERSION} VERSION_LESS "3.3.0") set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 471e392906..c749c97f13 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -317,6 +317,82 @@ function(nv_test TARGET_NAME) endif() endfunction(nv_test) +function(hip_library TARGET_NAME) + if (WITH_AMD_GPU) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(_sources ${hip_library_SRCS}) + HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} 
HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() + if(hip_library_SRCS) + if (hip_library_SHARED OR hip_library_shared) # build *.so + add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources}) + set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) + else() + add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources}) + set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX) + target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a) + find_fluid_modules(${TARGET_NAME}) + endif() + if (hip_library_DEPS) + add_dependencies(${TARGET_NAME} ${hip_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) + endif() + # cpplint code style + foreach(source_file ${hip_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS}) + else(hip_library_SRCS) + if (hip_library_DEPS) + merge_static_libs(${TARGET_NAME} ${hip_library_DEPS}) + else() + message(FATAL "Please specify source file or library in nv_library.") + endif() + endif(hip_library_SRCS) + endif() +endfunction(hip_library) + +function(hip_binary TARGET_NAME) + if (WITH_AMD_GPU) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS}) + if(hip_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${hip_binary_DEPS}) + endif() + endif() +endfunction(hip_binary) + +function(hip_test TARGET_NAME) + if (WITH_AMD_GPU AND WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(_sources ${hip_test_SRCS}) + HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() + add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources}) + set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) + target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_test(${TARGET_NAME} ${TARGET_NAME}) + endif() +endfunction(hip_test) + function(go_library TARGET_NAME) set(options STATIC static SHARED shared) set(oneValueArgs "") diff --git a/cmake/hip.cmake b/cmake/hip.cmake new file mode 100644 index 0000000000..cd880603a7 --- /dev/null +++ b/cmake/hip.cmake @@ -0,0 +1,46 @@ +if(NOT WITH_AMD_GPU) + return() +endif() + +include_directories("/opt/rocm/include") +include_directories("/opt/rocm/hipblas/include") +include_directories("/opt/rocm/hiprand/include") +include_directories("/opt/rocm/rocrand/include") +include_directories("/opt/rocm/rccl/include") +include_directories("/opt/rocm/thrust") + +list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc") + 
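+# NOTE: a fixed ROCm install prefix (/opt/rocm) is assumed by the hard-coded
+# paths above. The HIP_HCC_FLAGS assembled below are forwarded to hcc through
+# hipcc; -fPIC is needed because these objects are linked into shared
+# libraries such as paddle_pybind.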
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" ) + +if(WITH_DSO) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO") +endif(WITH_DSO) + +if(WITH_DOUBLE) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE") +endif(WITH_DOUBLE) + +if(WITH_TESTING) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING") +endif(WITH_TESTING) + +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) +elseif(CMAKE_BUILD_TYPE STREQUAL "Release") +# Disable optimization since one eigen symbol will be removed in math_function.cu + #list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) +elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) +elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) +endif() + +if("x${HCC_HOME}" STREQUAL "x") + set(HCC_HOME "/opt/rocm/hcc") +endif() + +set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") +set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") +set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") + diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index d30124d4a3..26d1dab1e9 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -76,6 +76,9 @@ function(op_library TARGET) if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) + elseif (WITH_AMD_GPU) + hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS + ${op_library_DEPS} ${op_common_deps}) else() cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index fba1612d10..1cac62472c 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -6,6 +6,7 @@ function(math_library TARGET) # But it handle split GPU/CPU code and link some common library. set(cc_srcs) set(cu_srcs) + set(hip_srcs) set(math_common_deps device_context framework_proto) set(multiValueArgs DEPS) cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" @@ -17,10 +18,15 @@ function(math_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) + list(APPEND hip_srcs ${TARGET}.hip.cu) + endif() list(LENGTH cc_srcs cc_srcs_len) if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + elseif (WITH_AMD_GPU) + hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) elseif(${cc_srcs_len} GREATER 0) cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) endif() diff --git a/paddle/fluid/operators/math/concat.hip.cu b/paddle/fluid/operators/math/concat.hip.cu new file mode 100644 index 0000000000..91efd8ea57 --- /dev/null +++ b/paddle/fluid/operators/math/concat.hip.cu @@ -0,0 +1,281 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "hip/hip_runtime.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/operators/math/concat.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__device__ T upper_bound(const T* first, T count, T val) { + const T* orig = first; + const T* it = nullptr; + T step = 0; + while (count > 0) { + it = first; + step = count / 2; + it += step; + if (!(val < *it)) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + return first - orig; +} + +template +__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, + const int output_rows, const int output_cols, + T* output) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int segment = upper_bound(input_cols, col_size, tid_x) - 1; + + int curr_offset = input_cols[segment]; + int curr_segment = segment; + for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { + T curr_col_offset; + while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* input_ptr = inputs[curr_segment]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) + output[tid_y * output_cols + tid_x] = + input_ptr[tid_y * segment_width + local_col]; + } +} + +template +__global__ void KernelConcat(T** inputs, const int input_col, + const int output_rows, const int output_cols, + T* output) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + double inv_input_col = 1.0 / input_col; + for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { + int split = tid_x * inv_input_col; + int in_offset = tid_x - split * input_col; + T* input_ptr = inputs[split]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) { + output[tid_y * output_cols + tid_x] = + input_ptr[tid_y * input_col + in_offset]; + } + } +} + +template +__global__ void KernelConcatGrad(const T* input, const int input_row, + const int input_col, const int* output_cols, + int col_size, T** outputs) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int segment = upper_bound(output_cols, col_size, tid_x) - 1; + int curr_offset = output_cols[segment]; + int curr_segment = segment; + for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { + T curr_col_offset; + while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs[curr_segment]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input[tid_y * input_col + tid_x]; + } +} + +template +__global__ void KernelConcatGrad(const T* input, const int input_row, + const int input_col, const int output_cols, + T** outputs) { + int tid_x = 
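/* global column index for this thread; the loop below strides by
   blockDim.x * gridDim.x (a grid-stride loop) */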
blockIdx.x * blockDim.x + threadIdx.x; + double inv_input_col = 1.0 / input_col; + for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x * inv_input_col; + int in_offset = tid_x - split * input_col; + T* output_ptr = outputs[split]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * output_cols + in_offset] = + input[tid_y * input_col + tid_x]; + } +} + +/* + * All tensors' dimension should be the same and the values of + * each dimension are the same, except the axis dimension. + */ +template +class ConcatFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const std::vector& input, const int axis, + framework::Tensor* output) { + // TODO(zcd): Add input data validity checking + int num = input.size(); + int rows = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int cols = input[0].numel() / rows; + int out_rows = rows, out_cols = 0; + + framework::Vector inputs_data(num * sizeof(T*) / 2); + framework::Vector inputs_cols(num + 1); + inputs_cols[0] = 0; + T** inputs_ptr = reinterpret_cast(inputs_data.data()); + + bool sameShape = true; + for (int i = 0; i < num; ++i) { + int t_cols = input[i].numel() / rows; + if (sameShape) { + if (t_cols != cols) sameShape = false; + } + out_cols += t_cols; + inputs_cols[i + 1] = out_cols; + inputs_ptr[i] = const_cast(input[i].data()); + } + + T** ins_gpu = + reinterpret_cast(inputs_data.CUDAMutableData(context.GetPlace())); + const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace()); + + // computation + // set the thread block and grid according to CurrentDeviceId + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((out_cols + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((out_cols + block_cols - 1) / block_cols, max_blocks); + int grid_rows = + std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (sameShape) { + hipLaunchKernelGGL((KernelConcat), dim3(grid_size), dim3(block_size), 0, context.stream(), + ins_gpu, cols, out_rows, out_cols, output->data()); + } else { + hipLaunchKernelGGL((KernelConcat), dim3(grid_size), dim3(block_size), 0, context.stream(), + ins_gpu, ins_col_gpu, static_cast(inputs_cols.size()), out_rows, + out_cols, output->data()); + } + } +}; + +/* + * All tensors' dimension should be the same and the values of + * each dimension are the same, except the axis dimension. 
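 * This grad functor is the inverse of ConcatFunctor above: it splits the
 * gradient of the concatenated tensor back into the per-input tensors.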
+ */ +template +class ConcatGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, const int axis, + std::vector& outputs) { + // TODO(zcd): Add input data validity checking + int num = outputs.size(); + int input_row = 1; + auto dim_0 = outputs[0].dims(); + for (int i = 0; i < axis; ++i) { + input_row *= dim_0[i]; + } + + int output_col_0 = outputs[0].numel() / input_row; + int input_col = 0; + bool sameShape = true; + + framework::Vector outputs_data(num * sizeof(T*) / 2); + framework::Vector outputs_cols(num + 1); + outputs_cols[0] = 0; + T** outputs_ptr = reinterpret_cast(outputs_data.data()); + + for (int i = 0; i < num; ++i) { + int t_col = outputs[i].numel() / input_row; + if (sameShape) { + if (t_col != output_col_0) sameShape = false; + } + input_col += t_col; + outputs_cols[i + 1] = input_col; + outputs_ptr[i] = outputs[i].data(); + } + + T** outs_gpu = + reinterpret_cast(outputs_data.CUDAMutableData(context.GetPlace())); + const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace()); + + // computation + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((input_col + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((input_col + block_cols - 1) / block_cols, max_blocks); + int grid_rows = + std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (sameShape) { + hipLaunchKernelGGL((KernelConcatGrad), dim3(grid_size), dim3(block_size), 0, context.stream(), + input.data(), input_row, input_col, output_col_0, outs_gpu); + } else { + hipLaunchKernelGGL((KernelConcatGrad), dim3(grid_size), dim3(block_size), 0, context.stream(), + input.data(), input_row, input_col, outs_col_gpu, + static_cast(outputs_cols.size()), outs_gpu); + } + } +}; + +template class ConcatFunctor; +template class ConcatFunctor; +template class ConcatFunctor; +template class ConcatFunctor; + +template class ConcatGradFunctor; +template class ConcatGradFunctor; +template class ConcatGradFunctor; +template class ConcatGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 8942b5c943..d523ad7f73 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,9 +1,16 @@ if(WITH_PYTHON) - cc_library(paddle_pybind SHARED - SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method - ${GLOB_OP_LIB}) - if(NOT APPLE AND NOT ANDROID) - target_link_libraries(paddle_pybind rt) - endif(NOT APPLE AND NOT ANDROID) + if(WITH_AMD_GPU) + hip_library(paddle_pybind SHARED + SRCS pybind.cc exception.cc protobuf.cc const_value.cc + DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method + ${GLOB_OP_LIB}) + else() + cc_library(paddle_pybind SHARED + SRCS pybind.cc exception.cc protobuf.cc const_value.cc + DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method + ${GLOB_OP_LIB}) + if(NOT APPLE AND NOT ANDROID) + 
target_link_libraries(paddle_pybind rt) + endif(NOT APPLE AND NOT ANDROID) + endif(WITH_AMD_GPU) endif(WITH_PYTHON) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh old mode 100644 new mode 100755 index 6be2bd8fad..02f2d7ba12 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -37,6 +37,7 @@ function cmake_gen() { -DWITH_DSO=ON -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU:-OFF} + -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_MKL=${WITH_MKL:-ON} -DWITH_AVX=${WITH_AVX:-OFF} @@ -50,6 +51,7 @@ function cmake_gen() { -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} -DWITH_TESTING=${WITH_TESTING:-ON} -DWITH_FAST_BUNDLE_TEST=ON + -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ======================================== EOF @@ -62,6 +64,7 @@ EOF -DWITH_DSO=ON \ -DWITH_DOC=OFF \ -DWITH_GPU=${WITH_GPU:-OFF} \ + -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_AVX=${WITH_AVX:-OFF} \ @@ -74,6 +77,7 @@ EOF -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_FAST_BUNDLE_TEST=ON \ + -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON } From 192cc5dd3260bede2ff9cadd90f9249d853f0cf0 Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Tue, 13 Mar 2018 11:07:08 -0400 Subject: [PATCH 05/58] Implementation of MKLDNN LRN --- paddle/fluid/operators/lrn_mkldnn_op.cc | 189 ++++++++++++++++++ paddle/fluid/operators/lrn_op.cc | 55 ++++- .../fluid/tests/unittests/test_lrn_op.py | 10 + 3 files changed, 253 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/lrn_mkldnn_op.cc diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc new file mode 100644 index 0000000000..334597ab05 --- /dev/null +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/lrn_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; + +namespace { +mkldnn::algorithm LRNAlgorithm(const paddle::framework::ExecutionContext& ctx) { + mkldnn::algorithm algorithm = mkldnn::lrn_across_channels; + + std::string algorithm_str = ctx.Attr("algorithm"); + if (algorithm_str == "WITHIN_CHANNEL") { + algorithm = mkldnn::lrn_within_channel; + } + return algorithm; +} +} // namespace + +template +class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(std::is_same::value, + "MKLDNN LRN must use float data."); + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "MKLDNN LRN must use CPUPlace."); + + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto mid = ctx.Output("MidOut"); + + auto input_data = x->data(); + auto output_data = out->mutable_data(ctx.GetPlace()); + mid->mutable_data(ctx.GetPlace()); + + const std::string key = ctx.op().Output("Out"); + const std::string key_src_memory = key + "@lrn_src_memory"; + const std::string key_pd = key + "@lrn_pd"; + const std::string key_workspace_memory = key + "@lrn_workspace_memory"; + + const int n = ctx.Attr("n"); + const float alpha = ctx.Attr("alpha"); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + + auto algorithm = LRNAlgorithm(ctx); + + auto e_mid = framework::EigenTensor::From(*mid); + e_mid = e_mid.constant(k); + + auto dims = paddle::framework::vectorize2int(x->dims()); + + auto src_md = paddle::platform::MKLDNNMemDesc( + dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + auto dst_md = paddle::platform::MKLDNNMemDesc( + dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + auto forward_desc = mkldnn::lrn_forward::desc{ + mkldnn::prop_kind::forward, algorithm, src_md, n, alpha, beta, k}; + + auto forward_pd = std::make_shared( + forward_desc, mkldnn_engine); + + dev_ctx.SetBlob(key_pd, forward_pd); + + auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; + auto src_memory = std::make_shared( + src_memory_pd, static_cast(const_cast(input_data))); + + dev_ctx.SetBlob(key_src_memory, src_memory); + auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine}, + static_cast(output_data)}; + + auto workspace_md = forward_pd->workspace_primitive_desc(); + auto workspace_memory = std::make_shared(workspace_md); + + dev_ctx.SetBlob(key_workspace_memory, workspace_memory); + + auto forward_op = mkldnn::lrn_forward{*forward_pd, *src_memory, + *workspace_memory, dst_memory}; + + std::vector pipeline = {forward_op}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } +}; + +template +class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(std::is_same::value, + "MKLDNN LRN must use float data."); + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "MKLDNN LRN must use CPUPlace."); + + auto x = ctx.Input("X"); + + auto out_grad = ctx.Input(framework::GradVarName("Out")); + auto x_grad = ctx.Output(framework::GradVarName("X")); + + const std::string key 
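    /* same key the forward kernel built from this op's "Out" variable, so the
       primitive_desc, src and workspace blobs cached via SetBlob are found */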
= ctx.op().Input("Out"); + const std::string key_src_memory = key + "@lrn_src_memory"; + const std::string key_pd = key + "@lrn_pd"; + const std::string key_workspace_memory = key + "@lrn_workspace_memory"; + + const int n = ctx.Attr("n"); + const float alpha = ctx.Attr("alpha"); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto x_grad_data = x_grad->mutable_data(ctx.GetPlace()); + auto out_grad_data = out_grad->data(); + + auto dims = paddle::framework::vectorize2int(x->dims()); + + auto src_md = paddle::platform::MKLDNNMemDesc( + dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + auto diff_src_md = paddle::platform::MKLDNNMemDesc( + dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + auto diff_dst_md = paddle::platform::MKLDNNMemDesc( + dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + auto diff_dst_memory = + mkldnn::memory{{diff_dst_md, mkldnn_engine}, + static_cast(const_cast(out_grad_data))}; + + auto diff_src_memory = mkldnn::memory{{diff_src_md, mkldnn_engine}, + static_cast(x_grad_data)}; + + auto algorithm = LRNAlgorithm(ctx); + + auto backward_desc = mkldnn::lrn_backward::desc{ + algorithm, src_md, diff_src_md, n, alpha, beta, k}; + + auto forward_pd = dev_ctx.GetBlob(key_pd); + + auto backward_pd = mkldnn::lrn_backward::primitive_desc{ + backward_desc, mkldnn_engine, + *static_cast(forward_pd.get())}; + + std::shared_ptr workspace_memory = + dev_ctx.GetBlob(key_workspace_memory); + + auto src_memory = dev_ctx.GetBlob(key_src_memory); + auto backward_op = mkldnn::lrn_backward{ + backward_pd, *static_cast(src_memory.get()), + diff_dst_memory, *static_cast(workspace_memory.get()), + diff_src_memory}; + + std::vector pipeline = {backward_op}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(lrn, MKLDNN, paddle::platform::CPUPlace, + ops::LRNMKLDNNOpKernel); +REGISTER_OP_KERNEL(lrn_grad, MKLDNN, paddle::platform::CPUPlace, + ops::LRNMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 692e85dcff..6bd451a118 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/lrn_op.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -135,6 +138,24 @@ class LRNOp : public framework::OperatorWithKernel { ctx->SetOutputDim("MidOut", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + } +#endif + + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); + } }; template @@ -176,6 +197,21 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { "beta is the power number.") .SetDefault(0.75) .GreaterThan(0.0); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + AddAttr("algorithm", + "(string default ACROSS_CHANNELS" + "An optional string: \"ACROSS_CHANNELS\", " + "\"WITHIN_CHANNEL\". Used by MKLDNN library") + .SetDefault("ACROSS_CHANNELS"); AddComment(R"DOC( Local Response Normalization Operator. @@ -223,8 +259,25 @@ class LRNOpGrad : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); } -}; + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + } +#endif + + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); + } +}; } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py index eaff45cbb2..2268eafdbd 100644 --- a/python/paddle/fluid/tests/unittests/test_lrn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py @@ -87,5 +87,15 @@ class TestLRNOp(OpTest): self.check_grad(['X'], 'Out', max_relative_error=0.01) +class TestLRNMKLDNNOp(TestLRNOp): + def get_attrs(self): + attrs = TestLRNOp.get_attrs(self) + attrs['use_mkldnn'] = True + return attrs + + def test_check_output(self): + self.check_output(atol=0.002) + + if __name__ == "__main__": unittest.main() From c51c446221ce63890a0c099da7f26b9bfa41cb48 Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Fri, 16 Mar 2018 10:05:54 -0400 Subject: [PATCH 06/58] Content of GetExpectedKernelType moved to standalone function --- paddle/fluid/operators/lrn_op.cc | 54 ++++++++++++++------------------ 1 
file changed, 24 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 6bd451a118..00db09ece3 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -119,6 +119,26 @@ struct LRNGradFunctor { template struct LRNGradFunctor; template struct LRNGradFunctor; +namespace { + framework::OpKernelType GetExpectedLRNKernel( + const framework::ExecutionContext& ctx) { + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + } +#endif + + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); + } +} + class LRNOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -140,21 +160,8 @@ class LRNOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - framework::LibraryType library_{framework::LibraryType::kPlain}; -#ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && - platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; - } -#endif - - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); + const framework::ExecutionContext& ctx) const override { + return GetExpectedLRNKernel(ctx); } }; @@ -261,21 +268,8 @@ class LRNOpGrad : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - framework::LibraryType library_{framework::LibraryType::kPlain}; -#ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && - platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; - } -#endif - - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); + const framework::ExecutionContext& ctx) const override { + return GetExpectedLRNKernel(ctx); } }; } // namespace operators From 2d95527527fe3b27e06f254965c8eb4fbacb4abf Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Mon, 19 Mar 2018 06:10:27 -0400 Subject: [PATCH 07/58] Removing WITHIN_CHANNEL algorithm for lrn. 
CPU lrn operator works only with ACROSS_CHANNELS --- paddle/fluid/operators/lrn_mkldnn_op.cc | 27 ++++++-------------- paddle/fluid/operators/lrn_op.cc | 33 +++++++++++-------------- 2 files changed, 22 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 334597ab05..a2971fcd14 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -22,18 +22,6 @@ namespace operators { using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; -namespace { -mkldnn::algorithm LRNAlgorithm(const paddle::framework::ExecutionContext& ctx) { - mkldnn::algorithm algorithm = mkldnn::lrn_across_channels; - - std::string algorithm_str = ctx.Attr("algorithm"); - if (algorithm_str == "WITHIN_CHANNEL") { - algorithm = mkldnn::lrn_within_channel; - } - return algorithm; -} -} // namespace - template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -64,8 +52,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); - auto algorithm = LRNAlgorithm(ctx); - auto e_mid = framework::EigenTensor::From(*mid); e_mid = e_mid.constant(k); @@ -77,8 +63,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto dst_md = paddle::platform::MKLDNNMemDesc( dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto forward_desc = mkldnn::lrn_forward::desc{ - mkldnn::prop_kind::forward, algorithm, src_md, n, alpha, beta, k}; + auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, + mkldnn::lrn_across_channels, + src_md, + n, + alpha, + beta, + k}; auto forward_pd = std::make_shared( forward_desc, mkldnn_engine); @@ -154,10 +145,8 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto diff_src_memory = mkldnn::memory{{diff_src_md, mkldnn_engine}, static_cast(x_grad_data)}; - auto algorithm = LRNAlgorithm(ctx); - auto backward_desc = mkldnn::lrn_backward::desc{ - algorithm, src_md, diff_src_md, n, alpha, beta, k}; + mkldnn::lrn_across_channels, src_md, diff_src_md, n, alpha, beta, k}; auto forward_pd = dev_ctx.GetBlob(key_pd); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 00db09ece3..bd72f0435e 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -120,24 +120,24 @@ template struct LRNGradFunctor; template struct LRNGradFunctor; namespace { - framework::OpKernelType GetExpectedLRNKernel( - const framework::ExecutionContext& ctx) { - framework::LibraryType library_{framework::LibraryType::kPlain}; +framework::OpKernelType GetExpectedLRNKernel( + const framework::ExecutionContext& ctx) { + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && - platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; - } + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + } #endif - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); - } + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): 
enable MKLDNN layout when it's ready + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); } +} // namespace class LRNOp : public framework::OperatorWithKernel { public: @@ -214,11 +214,6 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. ") .SetDefault("AnyLayout"); - AddAttr("algorithm", - "(string default ACROSS_CHANNELS" - "An optional string: \"ACROSS_CHANNELS\", " - "\"WITHIN_CHANNEL\". Used by MKLDNN library") - .SetDefault("ACROSS_CHANNELS"); AddComment(R"DOC( Local Response Normalization Operator. From e50205e744753f5a6c93f49bd74e00aa7cc642d2 Mon Sep 17 00:00:00 2001 From: sabreshao Date: Tue, 20 Mar 2018 13:46:48 +0800 Subject: [PATCH 08/58] CMake refine for HIP support. 1. Add option WITH_AMD_GPU. 2. Add cmake/hip.cmake for HIP toolchain. 3. Some external module such as eigen may need HIP port. 4. Add macro hip_library/hip_binary/hip_test to cmake/generic.cmake. 5. Add one HIP source concat.hip.cu as an example. Each .cu may have its corresponding .hip.cu. --- CMakeLists.txt | 3 - cmake/external/eigen.cmake | 4 +- cmake/hip.cmake | 3 - paddle/fluid/operators/CMakeLists.txt | 33 ++- paddle/fluid/operators/math/concat.hip.cu | 268 +--------------------- paddle/scripts/docker/build.sh | 4 +- 6 files changed, 33 insertions(+), 282 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 399bf50748..1e11f86d0e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,9 +70,6 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() -if(WITH_AMD_GPU) -endif() - if(ANDROID OR IOS) if(ANDROID) if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 5d88c5a0b0..73d70c34dc 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -1,8 +1,8 @@ INCLUDE(ExternalProject) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) - -INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) +SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) +INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) if(WITH_AMD_GPU) ExternalProject_Add( diff --git a/cmake/hip.cmake b/cmake/hip.cmake index cd880603a7..bfe491bd6b 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -27,9 +27,6 @@ endif(WITH_TESTING) if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) -elseif(CMAKE_BUILD_TYPE STREQUAL "Release") -# Disable optimization since one eigen symbol will be removed in math_function.cu - #list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 26d1dab1e9..c0245379ac 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -12,6 +12,8 @@ function(op_library TARGET) set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) set(cc_srcs) set(cu_srcs) + set(hip_cu_srcs) + set(miopen_hip_cc_srcs) set(cu_cc_srcs) set(cudnn_cu_cc_srcs) set(CUDNN_FILE) @@ -36,10 +38,19 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) + list(APPEND 
hip_cu_srcs ${TARGET}.hip.cu) + endif() string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}") if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc) list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc) endif() + if(WITH_AMD_GPU) + string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc) + list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc) + endif() + endif() if(WITH_MKLDNN) string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc) @@ -48,10 +59,14 @@ function(op_library TARGET) endif() else() foreach(src ${op_library_SRCS}) - if (${src} MATCHES ".*\\.cu$") + if (${src} MATCHES ".*\\.hip.cu$") + list(APPEND hip_cu_srcs ${src}) + elseif (${src} MATCHES ".*\\.cu$") list(APPEND cu_srcs ${src}) elseif(${src} MATCHES ".*_cudnn_op.cu.cc$") list(APPEND cudnn_cu_cc_srcs ${src}) + elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$") + list(APPEND miopen_hip_cc_srcs ${src}) elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$") list(APPEND mkldnn_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cu.cc$") @@ -77,8 +92,8 @@ function(op_library TARGET) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) elseif (WITH_AMD_GPU) - hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS - ${op_library_DEPS} ${op_common_deps}) + hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) else() cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) @@ -91,7 +106,7 @@ function(op_library TARGET) endif() endforeach() - # The registration of USE_OP, please refer to paddle/framework/op_registry.h. + # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. # And for detail pybind information, please see generated paddle/pybind/pybind.h. 
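  # The file(READ) below loads the op's source so the checks that follow can
  # decide which USE_OP / USE_OP_DEVICE_KERNEL stub to append to ${pybind_file}.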
file(READ ${TARGET}.cc TARGET_CONTENT) @@ -117,7 +132,10 @@ function(op_library TARGET) list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) - if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) + list(LENGTH hip_cu_srcs hip_cu_srcs_len) + list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len) + if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND + ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) endif() @@ -128,6 +146,11 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() + # pybind USE_OP_DEVICE_KERNEL for MIOPEN + if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n") + endif() + # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") diff --git a/paddle/fluid/operators/math/concat.hip.cu b/paddle/fluid/operators/math/concat.hip.cu index 91efd8ea57..eacef04388 100644 --- a/paddle/fluid/operators/math/concat.hip.cu +++ b/paddle/fluid/operators/math/concat.hip.cu @@ -12,270 +12,4 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hip/hip_runtime.h" -#include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/operators/math/concat.h" -#include "paddle/fluid/platform/cuda_helper.h" - -namespace paddle { -namespace operators { -namespace math { - -template -__device__ T upper_bound(const T* first, T count, T val) { - const T* orig = first; - const T* it = nullptr; - T step = 0; - while (count > 0) { - it = first; - step = count / 2; - it += step; - if (!(val < *it)) { - first = ++it; - count -= step + 1; - } else { - count = step; - } - } - return first - orig; -} - -template -__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, - const int output_rows, const int output_cols, - T* output) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - int segment = upper_bound(input_cols, col_size, tid_x) - 1; - - int curr_offset = input_cols[segment]; - int curr_segment = segment; - for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { - T curr_col_offset; - while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) { - curr_offset = curr_col_offset; - ++curr_segment; - } - - int local_col = tid_x - curr_offset; - int segment_width = curr_col_offset - curr_offset; - T* input_ptr = inputs[curr_segment]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) - output[tid_y * output_cols + tid_x] = - input_ptr[tid_y * segment_width + local_col]; - } -} - -template -__global__ void KernelConcat(T** inputs, const int input_col, - const int output_rows, const int output_cols, - T* output) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - double inv_input_col = 1.0 / input_col; - for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { - int split = tid_x * inv_input_col; - int in_offset = tid_x - split * input_col; - T* input_ptr = inputs[split]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for 
(; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) { - output[tid_y * output_cols + tid_x] = - input_ptr[tid_y * input_col + in_offset]; - } - } -} - -template -__global__ void KernelConcatGrad(const T* input, const int input_row, - const int input_col, const int* output_cols, - int col_size, T** outputs) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - int segment = upper_bound(output_cols, col_size, tid_x) - 1; - int curr_offset = output_cols[segment]; - int curr_segment = segment; - for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { - T curr_col_offset; - while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) { - curr_offset = curr_col_offset; - ++curr_segment; - } - - int local_col = tid_x - curr_offset; - int segment_width = curr_col_offset - curr_offset; - T* output_ptr = outputs[curr_segment]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * segment_width + local_col] = - input[tid_y * input_col + tid_x]; - } -} - -template -__global__ void KernelConcatGrad(const T* input, const int input_row, - const int input_col, const int output_cols, - T** outputs) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - double inv_input_col = 1.0 / input_col; - for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { - int split = tid_x * inv_input_col; - int in_offset = tid_x - split * input_col; - T* output_ptr = outputs[split]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * output_cols + in_offset] = - input[tid_y * input_col + tid_x]; - } -} - -/* - * All tensors' dimension should be the same and the values of - * each dimension are the same, except the axis dimension. - */ -template -class ConcatFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const std::vector& input, const int axis, - framework::Tensor* output) { - // TODO(zcd): Add input data validity checking - int num = input.size(); - int rows = 1; - auto dim_0 = input[0].dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int cols = input[0].numel() / rows; - int out_rows = rows, out_cols = 0; - - framework::Vector inputs_data(num * sizeof(T*) / 2); - framework::Vector inputs_cols(num + 1); - inputs_cols[0] = 0; - T** inputs_ptr = reinterpret_cast(inputs_data.data()); - - bool sameShape = true; - for (int i = 0; i < num; ++i) { - int t_cols = input[i].numel() / rows; - if (sameShape) { - if (t_cols != cols) sameShape = false; - } - out_cols += t_cols; - inputs_cols[i + 1] = out_cols; - inputs_ptr[i] = const_cast(input[i].data()); - } - - T** ins_gpu = - reinterpret_cast(inputs_data.CUDAMutableData(context.GetPlace())); - const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace()); - - // computation - // set the thread block and grid according to CurrentDeviceId - const int kThreadsPerBlock = 1024; - int block_cols = kThreadsPerBlock; - if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. 
- block_cols = ((out_cols + 31) >> 5) << 5; - } - int block_rows = kThreadsPerBlock / block_cols; - dim3 block_size = dim3(block_cols, block_rows, 1); - - int max_threads = context.GetMaxPhysicalThreadCount(); - int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); - - int grid_cols = - std::min((out_cols + block_cols - 1) / block_cols, max_blocks); - int grid_rows = - std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); - dim3 grid_size = dim3(grid_cols, grid_rows, 1); - - if (sameShape) { - hipLaunchKernelGGL((KernelConcat), dim3(grid_size), dim3(block_size), 0, context.stream(), - ins_gpu, cols, out_rows, out_cols, output->data()); - } else { - hipLaunchKernelGGL((KernelConcat), dim3(grid_size), dim3(block_size), 0, context.stream(), - ins_gpu, ins_col_gpu, static_cast(inputs_cols.size()), out_rows, - out_cols, output->data()); - } - } -}; - -/* - * All tensors' dimension should be the same and the values of - * each dimension are the same, except the axis dimension. - */ -template -class ConcatGradFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const int axis, - std::vector& outputs) { - // TODO(zcd): Add input data validity checking - int num = outputs.size(); - int input_row = 1; - auto dim_0 = outputs[0].dims(); - for (int i = 0; i < axis; ++i) { - input_row *= dim_0[i]; - } - - int output_col_0 = outputs[0].numel() / input_row; - int input_col = 0; - bool sameShape = true; - - framework::Vector outputs_data(num * sizeof(T*) / 2); - framework::Vector outputs_cols(num + 1); - outputs_cols[0] = 0; - T** outputs_ptr = reinterpret_cast(outputs_data.data()); - - for (int i = 0; i < num; ++i) { - int t_col = outputs[i].numel() / input_row; - if (sameShape) { - if (t_col != output_col_0) sameShape = false; - } - input_col += t_col; - outputs_cols[i + 1] = input_col; - outputs_ptr[i] = outputs[i].data(); - } - - T** outs_gpu = - reinterpret_cast(outputs_data.CUDAMutableData(context.GetPlace())); - const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace()); - - // computation - const int kThreadsPerBlock = 1024; - int block_cols = kThreadsPerBlock; - if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32. 
- block_cols = ((input_col + 31) >> 5) << 5; - } - int block_rows = kThreadsPerBlock / block_cols; - dim3 block_size = dim3(block_cols, block_rows, 1); - - int max_threads = context.GetMaxPhysicalThreadCount(); - int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); - - int grid_cols = - std::min((input_col + block_cols - 1) / block_cols, max_blocks); - int grid_rows = - std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1)); - dim3 grid_size = dim3(grid_cols, grid_rows, 1); - - if (sameShape) { - hipLaunchKernelGGL((KernelConcatGrad), dim3(grid_size), dim3(block_size), 0, context.stream(), - input.data(), input_row, input_col, output_col_0, outs_gpu); - } else { - hipLaunchKernelGGL((KernelConcatGrad), dim3(grid_size), dim3(block_size), 0, context.stream(), - input.data(), input_row, input_col, outs_col_gpu, - static_cast(outputs_cols.size()), outs_gpu); - } - } -}; - -template class ConcatFunctor; -template class ConcatFunctor; -template class ConcatFunctor; -template class ConcatFunctor; - -template class ConcatGradFunctor; -template class ConcatGradFunctor; -template class ConcatGradFunctor; -template class ConcatGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle +#include diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 02f2d7ba12..a0fc391c7c 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -51,7 +51,7 @@ function cmake_gen() { -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} -DWITH_TESTING=${WITH_TESTING:-ON} -DWITH_FAST_BUNDLE_TEST=ON - -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake + -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ======================================== EOF @@ -77,7 +77,7 @@ EOF -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_FAST_BUNDLE_TEST=ON \ - -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ + -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON } From 236b7dd2bde254f83479ca632756b4dfaa1b8bdc Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 20 Mar 2018 14:28:07 +0800 Subject: [PATCH 09/58] add pinned memory --- .../fluid/memory/detail/system_allocator.cc | 41 ++++++++++++++ paddle/fluid/memory/detail/system_allocator.h | 12 +++++ paddle/fluid/memory/memory.cc | 53 ++++++++++++++++--- paddle/fluid/memory/memory.h | 12 +++-- 4 files changed, 107 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 8ac8978120..df9d28ede8 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -119,6 +119,47 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { bool GPUAllocator::UseGpu() const { return true; } +void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { + if (size <= 0) return nullptr; + void* p; + // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size + // of host fallback allocation. Allocates too much would reduce + // the amount of memory available to the underlying system for paging. 
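+  // For example (illustrative numbers, not from this patch): with a
+  // GpuMaxAllocSize() of 12 GiB and 2 GiB already pinned, any request
+  // larger than the remaining 10 GiB falls through and returns nullptr.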
+ + size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_; + + if (size > usable) return nullptr; + + cudaError_t result = cudaMallocHost(&p, size); + if (result == cudaSuccess) { + index = 1; + fallback_alloc_size_ += size; + return p; + } + + return nullptr; +} + +void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { + cudaError_t err; + PADDLE_ASSERT(index == 1); + + PADDLE_ASSERT(fallback_alloc_size_ >= size); + fallback_alloc_size_ -= size; + err = cudaFreeHost(p); + + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. + if (err != cudaErrorCudartUnloading) { + PADDLE_ENFORCE(err, "cudaFreeHost failed in GPUPinnedAllocator::Free."); + } +} + +bool CUDAPinnedAllocator::UseGpu() const { return true; } + #endif } // namespace detail diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index e93c2c1e32..3e024125fa 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -51,6 +51,18 @@ class GPUAllocator : public SystemAllocator { size_t gpu_alloc_size_ = 0; size_t fallback_alloc_size_ = 0; }; + +class CUDAPinnedAllocator : public SystemAllocator { + public: + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t gpu_alloc_size_ = + 0; // TODO(zcd): how to define the upper limit of CUDAPinnedMemory? + size_t fallback_alloc_size_ = 0; +}; #endif } // namespace detail diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc index d07f89439a..c5577587aa 100644 --- a/paddle/fluid/memory/memory.cc +++ b/paddle/fluid/memory/memory.cc @@ -38,7 +38,8 @@ BuddyAllocator* GetCPUBuddyAllocator() { } template <> -void* Alloc(platform::CPUPlace place, size_t size) { +void* Alloc(platform::CPUPlace place, size_t size, + bool use_pinned) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void* p = GetCPUBuddyAllocator()->Alloc(size); VLOG(10) << " pointer=" << p; @@ -46,7 +47,8 @@ void* Alloc(platform::CPUPlace place, size_t size) { } template <> -void Free(platform::CPUPlace place, void* p) { +void Free(platform::CPUPlace place, void* p, + bool use_pinned) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } @@ -82,15 +84,47 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { return as[gpu_id]; } +BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) { + static BuddyAllocator** as = NULL; + if (as == NULL) { + int gpu_num = platform::GetCUDADeviceCount(); + as = new BuddyAllocator*[gpu_num]; + for (int gpu = 0; gpu < gpu_num; gpu++) { + as[gpu] = nullptr; + } + } + platform::SetDeviceId(gpu_id); + if (!as[gpu_id]) { + as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); + VLOG(10) << "\n\nNOTE: each GPU device use " + << FLAGS_fraction_of_gpu_memory_to_use * 100 + << "% of GPU memory.\n" + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" + << "' to change the fraction of GPU usage.\n\n"; + } + return as[gpu_id]; +} + template <> size_t Used(platform::CUDAPlace place) { return GetGPUBuddyAllocator(place.device)->Used(); } template <> -void* 
Alloc(platform::CUDAPlace place, size_t size) { - auto* buddy_allocator = GetGPUBuddyAllocator(place.device); - auto* ptr = buddy_allocator->Alloc(size); +void* Alloc(platform::CUDAPlace place, size_t size, + bool use_pinned) { + void* ptr; + if (use_pinned) { + auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device); + ptr = buddy_allocator->Alloc(size); + } else { + auto* buddy_allocator = GetGPUBuddyAllocator(place.device); + ptr = buddy_allocator->Alloc(size); + } + if (ptr == nullptr) { int cur_dev = platform::GetCurrentDeviceId(); platform::SetDeviceId(place.device); @@ -108,8 +142,13 @@ void* Alloc(platform::CUDAPlace place, size_t size) { } template <> -void Free(platform::CUDAPlace place, void* p) { - GetGPUBuddyAllocator(place.device)->Free(p); +void Free(platform::CUDAPlace place, void* p, + bool use_pinned) { + if (use_pinned) { + GetCUDAPinnedBuddyAllocator(place.device)->Free(p); + } else { + GetGPUBuddyAllocator(place.device)->Free(p); + } } #endif diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h index 7c5db815d6..9bc48ac68f 100644 --- a/paddle/fluid/memory/memory.h +++ b/paddle/fluid/memory/memory.h @@ -33,7 +33,7 @@ namespace memory { * address is valid or not. */ template -void* Alloc(Place place, size_t size); +void* Alloc(Place place, size_t size, bool use_pinned = false); /** * \brief Free memory block in one place. @@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size); * */ template -void Free(Place place, void* ptr); +void Free(Place place, void* ptr, bool use_pinned = false); /** * \brief Total size of used memory in one place. @@ -74,11 +74,15 @@ class PODDeleter { static_assert(std::is_pod::value, "T must be POD"); public: - explicit PODDeleter(Place place) : place_(place) {} - void operator()(T* ptr) { Free(place_, static_cast(ptr)); } + explicit PODDeleter(Place place, bool use_pinned = false) + : place_(place), use_pinned_(use_pinned) {} + void operator()(T* ptr) { + Free(place_, static_cast(ptr), use_pinned_); + } private: Place place_; + bool use_pinned_; }; /** From eaa90d38ad121ae019688f024380526cf7d504c8 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 20 Mar 2018 15:12:15 +0800 Subject: [PATCH 10/58] add use_pinned --- paddle/fluid/framework/tensor.h | 32 +++++++++++++++++++--------- paddle/fluid/framework/tensor_impl.h | 23 ++++++++++++-------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 6f878541e6..aa8f44ea30 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -45,10 +45,11 @@ class Tensor { friend struct EigenVector; public: - Tensor() : offset_(0) {} + Tensor() : offset_(0), use_pinned_(false) {} /*! Constructor with place should only be used in pybind. */ - explicit Tensor(const platform::Place& place) : offset_(0) { + explicit Tensor(const platform::Place& place) + : offset_(0), use_pinned_(false) { holder_->set_place(place); } @@ -69,11 +70,12 @@ class Tensor { * @note If not exist, then allocation. */ template - inline T* mutable_data(platform::Place place); + inline T* mutable_data(platform::Place place, bool use_pinned = false); - inline void* mutable_data(platform::Place place, std::type_index type); + inline void* mutable_data(platform::Place place, std::type_index type, + bool use_pinned = false); - inline void* mutable_data(platform::Place place); + inline void* mutable_data(platform::Place place, bool use_pinned = false); /** * @brief Return a pointer to mutable memory block. 
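A minimal sketch of a call site for the overloads declared above; the
signatures come from this patch, but the example itself is hypothetical:

    // Request a pinned staging tensor, e.g. to overlap host-to-device
    // copies with computation.
    framework::Tensor staging;
    float* host_ptr = staging.mutable_data<float>(
        framework::make_ddim({8, 1024}), platform::CUDAPlace(0),
        true /*use_pinned*/);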
@@ -84,7 +86,8 @@ class Tensor { * @note If not exist, then allocation. */ template - inline T* mutable_data(DDim dims, platform::Place place); + inline T* mutable_data(DDim dims, platform::Place place, + bool use_pinned = false); /*! Return the dimensions of the memory block. */ inline const DDim& dims() const; @@ -92,6 +95,9 @@ class Tensor { /*! Return the numel of the memory block. */ inline int64_t numel() const; + /*! Return the numel of the memory block. */ + inline bool isPinned() const; + /*! Resize the dimensions of the memory block. */ inline Tensor& Resize(const DDim& dims); @@ -146,12 +152,14 @@ class Tensor { template struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(Place place, size_t size, std::type_index type) - : ptr_(static_cast(memory::Alloc(place, size)), - memory::PODDeleter(place)), + PlaceholderImpl(Place place, size_t size, std::type_index type, + bool use_pinned = false) + : ptr_(static_cast(memory::Alloc(place, size, use_pinned)), + memory::PODDeleter(place, use_pinned)), place_(place), size_(size), - type_(type) { + type_(type), + use_pinned_(use_pinned) { PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", (is_cpu_place(place_) ? "CPU" : "GPU")); } @@ -174,6 +182,9 @@ class Tensor { /* the current type of memory */ std::type_index type_; + + /*! use pinned memory or not. */ + bool use_pinned_; }; /*! holds the memory block if allocated. */ @@ -208,6 +219,7 @@ class Tensor { * PlaceHolder::ptr_ and where the tensor data really begins. */ size_t offset_; + bool use_pinned_; }; inline void Tensor::switch_place(platform::Place new_place) { diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 638bd0db9d..e882cce69e 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -101,19 +101,21 @@ inline T* Tensor::data() { } template -inline T* Tensor::mutable_data(DDim dims, platform::Place place) { +inline T* Tensor::mutable_data(DDim dims, platform::Place place, + bool use_pinned) { static_assert(std::is_pod::value, "T must be POD"); Resize(dims); - return mutable_data(place); + return mutable_data(place, use_pinned); } template -inline T* Tensor::mutable_data(platform::Place place) { +inline T* Tensor::mutable_data(platform::Place place, bool use_pinned) { static_assert(std::is_pod::value, "T must be POD"); - return reinterpret_cast(mutable_data(place, typeid(T))); + return reinterpret_cast(mutable_data(place, typeid(T), use_pinned)); } -inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { +inline void* Tensor::mutable_data(platform::Place place, std::type_index type, + bool use_pinned) { if (holder_ != nullptr) { holder_->set_type(type); } @@ -127,26 +129,27 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { holder_->size() < size + offset_) { if (platform::is_cpu_place(place)) { holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); + boost::get(place), size, type, use_pinned)); } else if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); } #else holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); + boost::get(place), size, type, use_pinned)); } #endif offset_ = 0; + use_pinned_ = use_pinned; } return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } -inline void* Tensor::mutable_data(platform::Place place) { +inline void* Tensor::mutable_data(platform::Place place, bool 
use_pinned) { PADDLE_ENFORCE(this->holder_ != nullptr, "Cannot invoke mutable data if current hold nothing"); - return mutable_data(place, holder_->type()); + return mutable_data(place, holder_->type(), use_pinned); } inline Tensor& Tensor::ShareDataWith(const Tensor& src) { @@ -188,6 +191,8 @@ inline const DDim& Tensor::dims() const { return dims_; } inline int64_t Tensor::numel() const { return product(dims_); } +inline bool Tensor::isPinned() const { return use_pinned_; } + inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { Tensor res; res.ShareDataWith(src); From 18461d093505f2b889cfae3ae99ea55c12afe540 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 21 Mar 2018 10:48:46 +0800 Subject: [PATCH 11/58] wip --- paddle/fluid/operators/listen_and_serv_op.cc | 42 ++++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index a594de67e0..bd6e25449f 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -95,6 +95,13 @@ class ListenAndServOp : public framework::OperatorBase { "server program should have at least 2 blocks"); framework::Executor executor(dev_place); + std::vector blk_ctx_list; + blk_ctx_list.push_back(nullptr); // block0 is not used. + for (int blkid = 1; blkid < num_blocks; ++blkid) { + auto *exe_ctx = executor.Prepare(*program, blkid); + VLOG(2) << "prepare ctx: " << exe_ctx; + blk_ctx_list.push_back(exe_ctx); + } // TODO(typhoonzero): change this to a while_op for every cluster-batch. bool exit_flag = false; @@ -145,23 +152,30 @@ class ListenAndServOp : public framework::OperatorBase { std::vector> fs; // block0 contains only listen_and_serv op, start run from block1. 
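+      // NOTE: blk_ctx_list was prepared once before the serving loop, so
+      // the Async closures below can reuse the contexts instead of
+      // re-creating every operator on each mini-batch.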
for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { - fs.push_back(framework::Async([&executor, &program, &recv_scope, - blkid]() { - int run_block = blkid; // thread local - try { - executor.Run(*program, &recv_scope, run_block, - false /*create_local_scope*/, false /*create_vars*/); - } catch (std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); - } - })); + fs.push_back(framework::Async( + [&executor, &program, &recv_scope, &blk_ctx_list, blkid]() { + int run_block = blkid; // thread local + try { + VLOG(2) << "run ctx: " << blk_ctx_list[run_block] + << " block: " << run_block; + executor.RunPreparedContext(blk_ctx_list[run_block], + &recv_scope, false, false); + // executor.Run(*program, &recv_scope, run_block, + // false /*create_local_scope*/, + // false /*create_vars*/); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + })); } for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait(); // Run global block at final step, or block1 if there are only 2 blocks if (num_blocks >= 2) { try { - executor.Run(*program, &recv_scope, num_blocks - 1, - false /*create_local_scope*/, false /*create_vars*/); + executor.RunPreparedContext(blk_ctx_list[num_blocks - 1], &recv_scope, + false, false); + // executor.Run(*program, &recv_scope, num_blocks - 1, + // false /*create_local_scope*/, false /*create_vars*/); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } @@ -180,6 +194,10 @@ class ListenAndServOp : public framework::OperatorBase { rpc_service_->WaitClientGet(fan_in); sparse_vars.clear(); } // while(true) + + for (int i = 0; i < num_blocks; ++i) { + delete blk_ctx_list[i]; + } } protected: From e9d815e32b7cdb6e030bfd3aa649d3327bf4f195 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 21 Mar 2018 14:46:10 +0800 Subject: [PATCH 12/58] prepare and create op before run --- paddle/fluid/operators/listen_and_serv_op.cc | 9 +-------- paddle/fluid/operators/send_op.cc | 1 + 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index bd6e25449f..da44128cdd 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -99,7 +99,6 @@ class ListenAndServOp : public framework::OperatorBase { blk_ctx_list.push_back(nullptr); // block0 is not used. for (int blkid = 1; blkid < num_blocks; ++blkid) { auto *exe_ctx = executor.Prepare(*program, blkid); - VLOG(2) << "prepare ctx: " << exe_ctx; blk_ctx_list.push_back(exe_ctx); } @@ -149,6 +148,7 @@ class ListenAndServOp : public framework::OperatorBase { // should be global ops. // NOTE: if is_gpu_place, CUDA kernels are laugched by multiple threads // and this will still work. + std::vector> fs; // block0 contains only listen_and_serv op, start run from block1. 
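+      // The prepared contexts outlive the serving loop and are deleted in
+      // bulk once the loop exits, so the closures below may hold raw
+      // pointers into blk_ctx_list safely.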
for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { @@ -156,13 +156,8 @@ class ListenAndServOp : public framework::OperatorBase { [&executor, &program, &recv_scope, &blk_ctx_list, blkid]() { int run_block = blkid; // thread local try { - VLOG(2) << "run ctx: " << blk_ctx_list[run_block] - << " block: " << run_block; executor.RunPreparedContext(blk_ctx_list[run_block], &recv_scope, false, false); - // executor.Run(*program, &recv_scope, run_block, - // false /*create_local_scope*/, - // false /*create_vars*/); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } @@ -174,8 +169,6 @@ class ListenAndServOp : public framework::OperatorBase { try { executor.RunPreparedContext(blk_ctx_list[num_blocks - 1], &recv_scope, false, false); - // executor.Run(*program, &recv_scope, num_blocks - 1, - // false /*create_local_scope*/, false /*create_vars*/); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 443f40e803..2df25ae5a6 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -66,6 +66,7 @@ class SendOp : public framework::OperatorBase { auto* client_var = scope.FindVar(client_var_name); detail::RPCClient* rpc_client = client_var->GetMutable(); + ctx.Wait(); // wait before sending for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; From 1eec9261245028b48fb0b6bc80c85e8bd87851d4 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 21 Mar 2018 14:52:16 +0800 Subject: [PATCH 13/58] updates --- paddle/fluid/operators/send_op.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 2df25ae5a6..443f40e803 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -66,7 +66,6 @@ class SendOp : public framework::OperatorBase { auto* client_var = scope.FindVar(client_var_name); detail::RPCClient* rpc_client = client_var->GetMutable(); - ctx.Wait(); // wait before sending for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; From 2a4221ac074f50a242bdc988eab49cca17414fcb Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 21 Mar 2018 20:00:29 +0800 Subject: [PATCH 14/58] split send op to send_vars and send_barrier --- paddle/fluid/operators/CMakeLists.txt | 4 + paddle/fluid/operators/send_barrier_op.cc | 103 +++++++++++++++++ paddle/fluid/operators/send_vars_op.cc | 132 ++++++++++++++++++++++ 3 files changed, 239 insertions(+) create mode 100644 paddle/fluid/operators/send_barrier_op.cc create mode 100644 paddle/fluid/operators/send_vars_op.cc diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index d30124d4a3..254f89d987 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -156,6 +156,10 @@ if(WITH_DISTRIBUTE) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) + 
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
 else()
     set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
new file mode 100644
index 0000000000..8d02a6f291
--- /dev/null
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <ostream>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#include <future>
+#include "paddle/fluid/operators/detail/grpc_client.h"
+
+namespace paddle {
+namespace operators {
+
+class SendBarrierOp : public framework::OperatorBase {
+ public:
+  SendBarrierOp(const std::string& type,
+                const framework::VariableNameMap& inputs,
+                const framework::VariableNameMap& outputs,
+                const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
+
+    auto client_var_name = Output("RPCClient");
+    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
+                            "Can not find variable '%s' in the scope.",
+                            client_var_name);
+    auto* client_var = scope.FindVar(client_var_name);
+    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+
+    // need to wait before sending send_barrier message
+    PADDLE_ENFORCE(rpc_client->Wait());
+
+    for (auto& ep : eps) {
+      VLOG(3) << "send barrier, ep: " << ep;
+      rpc_client->AsyncSendBatchBarrier(ep);
+    }
+    PADDLE_ENFORCE(rpc_client->Wait());
+  }
+};
+
+class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SendBarrierOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("RPCClient",
+              "(RPCClient) The RPC client object which is"
+              "initialized at most once.");
+    AddComment(R"DOC(
+SendBarrier operator
+
+This operator will send a send barrier signal to the listen_and_serv op, so
+that the Parameter Server will know that all variables have been sent.
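+
+A transpiled trainer program typically runs one send_barrier per mini-batch,
+after all of its send_vars ops, so that the server can aggregate gradients
+from Fanin trainers before running the optimizer blocks.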
+)DOC"); + + AddAttr>("endpoints", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints to send variables to.") + .SetDefault({"127.0.0.1:6164"}); + } +}; + +class SendBarrierOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output("RPCClient").front(); + auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto var_type = framework::proto::VarType::RAW; + out_var.SetType(var_type); + } +}; + +class SendBarrierOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(send_barrier, ops::SendBarrierOp, + paddle::framework::EmptyGradOpMaker, ops::SendBarrierOpMaker, + ops::SendBarrierOpVarTypeInference, + ops::SendBarrierOpShapeInference); diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc new file mode 100644 index 0000000000..af791bc8e2 --- /dev/null +++ b/paddle/fluid/operators/send_vars_op.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include <ostream>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#include <future>
+#include "paddle/fluid/operators/detail/grpc_client.h"
+
+namespace paddle {
+namespace operators {
+static bool NeedSend(const framework::Scope& scope,
+                     const std::string& varname) {
+  auto* var = scope.FindVar(varname);
+  PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
+                          varname);
+  if (var->IsType<framework::LoDTensor>()) {
+    return var->Get<framework::LoDTensor>().IsInitialized();
+  } else if (var->IsType<framework::SelectedRows>()) {
+    return var->Get<framework::SelectedRows>().rows().size() > 0UL;
+  } else {
+    PADDLE_THROW(
+        "Variable type in send side should be in "
+        "[LoDTensor, SelectedRows]");
+  }
+  return false;
+}
+
+class SendVarsOp : public framework::OperatorBase {
+ public:
+  SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs,
+             const framework::VariableNameMap& outputs,
+             const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    auto ins = Inputs("X");
+
+    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    int flag_wait = Attr<int>("wait");
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(place);
+
+    auto client_var_name = Output("RPCClient");
+    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
+                            "Can not find variable '%s' in the scope.",
+                            client_var_name);
+    auto* client_var = scope.FindVar(client_var_name);
+    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+
+    for (size_t i = 0; i < ins.size(); i++) {
+      if (NeedSend(scope, ins[i])) {
+        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
+        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+      } else {
+        VLOG(3) << "don't send uninitialized variable: " << ins[i];
+      }
+    }
+    if (flag_wait) {
+      rpc_client->Wait();
+    }
+  }
+};
+
+class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SendVarsOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
+        .AsDuplicable();
+    AddOutput("RPCClient",
+              "(RPCClient) The RPC client object which is"
+              "initialized at most once.");
+    AddComment(R"DOC(
+Send operator
+
+This operator will send variables to the listen_and_serv op at the parameter
+server.
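+
+Unlike the send op, this op issues no barrier: sends are asynchronous unless
+the "wait" attribute is set, and batch semantics come from the separate
+send_barrier op.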
+)DOC"); + AddAttr("wait", + "(int, default 0)" + "whether watting for all send request have been sent.") + .SetDefault(0); + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input " + "variables for mapping") + .SetDefault({"127.0.0.1:6164"}); + } +}; + +class SendVarsOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output("RPCClient").front(); + auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto var_type = framework::proto::VarType::RAW; + out_var.SetType(var_type); + } +}; + +class SendVarsOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(send_vars, ops::SendVarsOp, + paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker, + ops::SendVarsOpVarTypeInference, + ops::SendVarsOpShapeInference); From 72cc64e40e5d624bcc97bd81f144fcb446167a21 Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Wed, 21 Mar 2018 10:20:29 -0400 Subject: [PATCH 15/58] Device blobs are created only in training. Added testing attribute --- paddle/fluid/operators/lrn_mkldnn_op.cc | 71 ++++++++++++++++++------- paddle/fluid/operators/lrn_op.cc | 1 + 2 files changed, 52 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index a2971fcd14..3bead16ce4 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -22,6 +22,22 @@ namespace operators { using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; +namespace { +template +std::shared_ptr insert_to_context(const std::string& key, + const MKLDNNDeviceContext& dev_ctx, + Args&&... 
args) { + auto p = std::static_pointer_cast(dev_ctx.GetBlob(key)); + + if (!p) { + p = std::make_shared(args...); + dev_ctx.SetBlob(key, std::static_pointer_cast(p)); + } + + return p; +} +} // namespace + template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -42,15 +58,11 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto output_data = out->mutable_data(ctx.GetPlace()); mid->mutable_data(ctx.GetPlace()); - const std::string key = ctx.op().Output("Out"); - const std::string key_src_memory = key + "@lrn_src_memory"; - const std::string key_pd = key + "@lrn_pd"; - const std::string key_workspace_memory = key + "@lrn_workspace_memory"; - const int n = ctx.Attr("n"); const float alpha = ctx.Attr("alpha"); const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); + const bool is_test = ctx.Attr("is_test"); auto e_mid = framework::EigenTensor::From(*mid); e_mid = e_mid.constant(k); @@ -71,28 +83,47 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { beta, k}; - auto forward_pd = std::make_shared( - forward_desc, mkldnn_engine); - - dev_ctx.SetBlob(key_pd, forward_pd); - auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; - auto src_memory = std::make_shared( - src_memory_pd, static_cast(const_cast(input_data))); - - dev_ctx.SetBlob(key_src_memory, src_memory); auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine}, static_cast(output_data)}; - auto workspace_md = forward_pd->workspace_primitive_desc(); - auto workspace_memory = std::make_shared(workspace_md); + std::unique_ptr forward_op = nullptr; + + if (!is_test) { + const std::string key = ctx.op().Output("Out"); + const std::string key_src_memory = key + "@lrn_src_memory"; + const std::string key_pd = key + "@lrn_pd"; + const std::string key_workspace_memory = key + "@lrn_workspace_memory"; + + auto forward_pd = insert_to_context( + key_pd, dev_ctx, forward_desc, mkldnn_engine); + + auto src_memory = insert_to_context( + key_src_memory, dev_ctx, src_memory_pd); + + src_memory->set_data_handle( + static_cast(const_cast(input_data))); + + auto workspace_memory = insert_to_context( + key_workspace_memory, dev_ctx, + forward_pd->workspace_primitive_desc()); + + forward_op.reset(new mkldnn::lrn_forward{*forward_pd, *src_memory, + *workspace_memory, dst_memory}); - dev_ctx.SetBlob(key_workspace_memory, workspace_memory); + } else { + auto forward_pd = + mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; + auto src_memory = mkldnn::memory{ + src_memory_pd, static_cast(const_cast(input_data))}; + auto workspace_memory = + mkldnn::memory{forward_pd.workspace_primitive_desc()}; - auto forward_op = mkldnn::lrn_forward{*forward_pd, *src_memory, - *workspace_memory, dst_memory}; + forward_op.reset(new mkldnn::lrn_forward{forward_pd, src_memory, + workspace_memory, dst_memory}); + } - std::vector pipeline = {forward_op}; + std::vector pipeline = {*forward_op}; mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } }; diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index bd72f0435e..2b1947a187 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -214,6 +214,7 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. 
") .SetDefault("AnyLayout"); + AddAttr("is_test", "").SetDefault(false); AddComment(R"DOC( Local Response Normalization Operator. From e0ac6bc436725a7750b46a674b97b89cccdef36b Mon Sep 17 00:00:00 2001 From: sabreshao Date: Thu, 22 Mar 2018 10:48:27 +0800 Subject: [PATCH 16/58] CMake refine for HIP support. Fix CI. --- paddle/fluid/pybind/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index d523ad7f73..fe991033df 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,12 +1,12 @@ if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED - SRCS pybind.cc exception.cc protobuf.cc const_value.cc + SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method ${GLOB_OP_LIB}) else() cc_library(paddle_pybind SHARED - SRCS pybind.cc exception.cc protobuf.cc const_value.cc + SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method ${GLOB_OP_LIB}) if(NOT APPLE AND NOT ANDROID) From 5e6276edc1d92632322d6e748f281b9156251671 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 22 Mar 2018 15:17:18 +0800 Subject: [PATCH 17/58] fix transpiler bug --- paddle/fluid/operators/send_op.cc | 8 ++++---- python/paddle/fluid/distribute_transpiler.py | 7 +++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 443f40e803..a77c38f633 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -68,7 +68,7 @@ class SendOp : public framework::OperatorBase { for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + VLOG(2) << "sending " << ins[i] << " to " << epmap[i]; rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; @@ -77,20 +77,20 @@ class SendOp : public framework::OperatorBase { PADDLE_ENFORCE(rpc_client->Wait()); for (auto& ep : endpoints) { - VLOG(3) << "batch barrier, ep: " << ep; + VLOG(2) << "batch barrier, ep: " << ep; rpc_client->AsyncSendBatchBarrier(ep); } PADDLE_ENFORCE(rpc_client->Wait()); if (outs.size() > 0) { for (size_t i = 0; i < outs.size(); i++) { - VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; + VLOG(2) << "getting " << outs[i] << " from " << epmap[i]; rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); } PADDLE_ENFORCE(rpc_client->Wait()); // tell pservers that current trainer have called fetch for (auto& ep : endpoints) { - VLOG(3) << "send fetch barrier, ep: " << ep; + VLOG(2) << "send fetch barrier, ep: " << ep; rpc_client->AsyncSendFetchBarrier(ep); } PADDLE_ENFORCE(rpc_client->Wait()); diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index ad655ee96c..4c3789b99e 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -563,6 +563,8 @@ class DistributeTranspiler: orig_var_name = "" if suff_idx >= 0: orig_var_name = varname[:suff_idx] + else: + orig_var_name = varname return orig_var_name def _append_pserver_ops(self, optimize_block, opt_op, endpoint, @@ -577,7 +579,8 @@ class DistributeTranspiler: grad_block = None for g in 
self.param_grad_ep_mapping[endpoint]["grads"]: if same_or_split_var( - self._orig_varname(g.name), opt_op.input(key)[0]): + self._orig_varname(g.name), + self._orig_varname(opt_op.input(key)[0])): grad_block = g break if not grad_block: @@ -748,7 +751,7 @@ class DistributeTranspiler: param_names = [ p.name for p in self.param_grad_ep_mapping[endpoint]["params"] ] - if op.input("Param") in param_names: + if op.input("Param")[0] in param_names: return True else: for n in param_names: From a88cc462219681cbc74d2beee022e8c67d8f0de6 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 22 Mar 2018 16:14:37 +0800 Subject: [PATCH 18/58] update --- paddle/fluid/operators/detail/bytebuffer_stream.h | 5 +++-- paddle/fluid/operators/detail/grpc_server.h | 10 +++------- paddle/fluid/operators/detail/test_serde.cc | 4 ++-- paddle/fluid/operators/detail/variable_response.h | 4 ++-- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/detail/bytebuffer_stream.h index 0cbe514d04..1791a48aab 100644 --- a/paddle/fluid/operators/detail/bytebuffer_stream.h +++ b/paddle/fluid/operators/detail/bytebuffer_stream.h @@ -146,8 +146,9 @@ class GrpcByteBufferSource class GrpcByteBufferSourceWrapper : public Source { public: - GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) : source_(source) {} - virtual ::google::protobuf::io::ZeroCopyInputStream* contents() override { + explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) + : source_(source) {} + ::google::protobuf::io::ZeroCopyInputStream* contents() override { return source_; } diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 9c21a07432..10e6dd45a9 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -21,15 +21,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/simple_block_queue.h" - +#include "paddle/fluid/operators/detail/grpc_service.h" #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h" - -#include "paddle/fluid/operators/detail/grpc_service.h" - -//#include +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc index 4be5963794..494ac1d679 100644 --- a/paddle/fluid/operators/detail/test_serde.cc +++ b/paddle/fluid/operators/detail/test_serde.cc @@ -81,7 +81,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); framework::Scope scope; scope.Var("myvar"); - operators::detail::TensorResponse resp(&scope, &ctx); + operators::detail::VariableResponse resp(&scope, &ctx); EXPECT_EQ(resp.Parse(msg), 0); framework::Variable* var2 = resp.GetVar(); @@ -166,7 +166,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { // deserialize zero-copy framework::Scope scope; scope.Var("myvar"); - operators::detail::TensorResponse resp(&scope, &ctx); + operators::detail::VariableResponse resp(&scope, &ctx); if (from_type == 0) { EXPECT_EQ(resp.Parse(msg), 0); } else { diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h index c7bc7a46e7..e121ed7bce 100644 --- a/paddle/fluid/operators/detail/variable_response.h +++ b/paddle/fluid/operators/detail/variable_response.h @@ -36,9 +36,9 @@ class VariableResponse { public: VariableResponse(const framework::Scope* scope, const platform::DeviceContext* dev_ctx) - : scope_(scope), dev_ctx_(dev_ctx){}; + : scope_(scope), dev_ctx_(dev_ctx) {} - virtual ~VariableResponse(){}; + virtual ~VariableResponse() {} // return: // 0:ok. 
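For reference, the zero-copy deserialization path that the renamed class
serves looks roughly like this; a sketch assembled from the hunks above,
not a complete test:

    // Parse a ::grpc::ByteBuffer (msg) produced by the serialization side
    // into a variable that already exists in the destination scope.
    framework::Scope scope;
    scope.Var("myvar");
    operators::detail::VariableResponse resp(&scope, &ctx);
    PADDLE_ENFORCE_EQ(resp.Parse(msg), 0);  // Parse returns 0 on success
    framework::Variable* var = resp.GetVar();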
From ee7f1ecd7cb79d34a7f14a45d4c34e4e6db9b7af Mon Sep 17 00:00:00 2001 From: Yancey Date: Thu, 22 Mar 2018 19:21:43 +0800 Subject: [PATCH 19/58] Fix dist compile error (#9320) --- .../operators/detail/bytebuffer_stream.h | 5 +++-- paddle/fluid/operators/detail/grpc_server.h | 2 -- paddle/fluid/operators/detail/test_serde.cc | 21 +++++++++---------- .../operators/detail/variable_response.h | 4 ++-- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/detail/bytebuffer_stream.h index 0cbe514d04..1791a48aab 100644 --- a/paddle/fluid/operators/detail/bytebuffer_stream.h +++ b/paddle/fluid/operators/detail/bytebuffer_stream.h @@ -146,8 +146,9 @@ class GrpcByteBufferSource class GrpcByteBufferSourceWrapper : public Source { public: - GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) : source_(source) {} - virtual ::google::protobuf::io::ZeroCopyInputStream* contents() override { + explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) + : source_(source) {} + ::google::protobuf::io::ZeroCopyInputStream* contents() override { return source_; } diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 9c21a07432..5c278f0ed7 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -29,8 +29,6 @@ limitations under the License. */ #include "paddle/fluid/operators/detail/grpc_service.h" -//#include - namespace paddle { namespace operators { namespace detail { diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc index 4be5963794..99c1577223 100644 --- a/paddle/fluid/operators/detail/test_serde.cc +++ b/paddle/fluid/operators/detail/test_serde.cc @@ -81,7 +81,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); framework::Scope scope; scope.Var("myvar"); - operators::detail::TensorResponse resp(&scope, &ctx); + operators::detail::VariableResponse resp(&scope, &ctx); EXPECT_EQ(resp.Parse(msg), 0); framework::Variable* var2 = resp.GetVar(); @@ -166,7 +166,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { // deserialize zero-copy framework::Scope scope; scope.Var("myvar"); - operators::detail::TensorResponse resp(&scope, &ctx); + operators::detail::VariableResponse resp(&scope, &ctx); if (from_type == 0) { EXPECT_EQ(resp.Parse(msg), 0); } else { @@ -194,24 +194,23 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); } -TEST(LodTensor, GPU) { - platform::CUDAPlace place; +TEST(LodTensor, Run) { + platform::CPUPlace place; RunTestLodTensor(place); RunTestLodTensor(place, 1); -} - -TEST(LodTensor, CPU) { - platform::CPUPlace place; +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace place; RunTestLodTensor(place); RunTestLodTensor(place, 1); +#endif } -TEST(SelectedRows, CPU) { +TEST(SelectedRows, Run) { platform::CPUPlace place; RunSerdeTestSelectedRows(place); -} -TEST(SelectedRows, GPU) { +#ifdef PADDLE_WITH_CUDA platform::CUDAPlace place; RunSerdeTestSelectedRows(place); +#endif } diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h index c7bc7a46e7..e121ed7bce 100644 --- a/paddle/fluid/operators/detail/variable_response.h +++ b/paddle/fluid/operators/detail/variable_response.h @@ -36,9 +36,9 @@ class 
VariableResponse { public: VariableResponse(const framework::Scope* scope, const platform::DeviceContext* dev_ctx) - : scope_(scope), dev_ctx_(dev_ctx){}; + : scope_(scope), dev_ctx_(dev_ctx) {} - virtual ~VariableResponse(){}; + virtual ~VariableResponse() {} // return: // 0:ok. From e33af2414b1ae92de4c1589e3829a6bcc515dd21 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Thu, 22 Mar 2018 04:34:16 -0700 Subject: [PATCH 20/58] "fast hack" --- paddle/fluid/operators/dropout_op.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index f6c85a2a53..94382739b5 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -33,6 +33,7 @@ __global__ void RandomGenerator(const size_t n, const int seed, int idx = blockDim.x * blockIdx.x + threadIdx.x; for (; idx < n; idx += blockDim.x * gridDim.x) { + rng.discard(idx); if (dist(rng) < dropout_prob) { mask_data[idx] = static_cast(0); } else { From ba9f4c787393c57e8f29477e01a3c6b3f43e3fa2 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 22 Mar 2018 20:07:26 +0800 Subject: [PATCH 21/58] fix test_recv_op --- python/paddle/fluid/layers/io.py | 17 ++++++++--------- .../fluid/tests/unittests/test_recv_op.py | 17 +++++++++-------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index bc5e291ad8..bd7e9c30fe 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -113,9 +113,9 @@ class ListenAndServ(object): which can receive variables from clients and run a block. """ - def __init__(self, endpoint, fan_in=1, optimizer_mode=True): + def __init__(self, endpoint, inputs, fan_in=1, optimizer_mode=True): self.helper = LayerHelper("listen_and_serv") - self.inputs = [] + self.inputs = inputs self.outputs = [] self.endpoint = endpoint self.fan_in = fan_in @@ -160,18 +160,13 @@ class ListenAndServ(object): current_block = main_program.current_block() parent_block = self.parent_block() - params, grads = self.get_params_and_grads() - param_names = [p.name for p in params] - grad_names = [g.name for g in grads] parent_block.append_op( type='listen_and_serv', - inputs={}, + inputs={"X": self.inputs}, outputs={}, attrs={ 'endpoint': self.endpoint, 'Fanin': self.fan_in, - 'ParamList': param_names, - 'GradList': grad_names, 'OptimizeBlock': current_block }) @@ -196,10 +191,14 @@ def Send(endpoints, send_vars, get_vars): endpoints = list(set(epmap)) helper = LayerHelper("Send", **locals()) + rpc_client_var = default_main_program().global_block().create_var( + name="RPC_CLIENT_VAR", persistable=True, type=core.VarDesc.VarType.RAW) + helper.append_op( type="send", inputs={"X": send_vars}, - outputs={"Out": get_vars}, + outputs={"Out": get_vars, + "RPCClient": rpc_client_var}, attrs={"endpoints": endpoints, "epmap": epmap}) diff --git a/python/paddle/fluid/tests/unittests/test_recv_op.py b/python/paddle/fluid/tests/unittests/test_recv_op.py index 985d892c56..f8b7724039 100644 --- a/python/paddle/fluid/tests/unittests/test_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_recv_op.py @@ -32,20 +32,21 @@ class TestRecvOp(unittest.TestCase): time.sleep(1) self.init_client(place) # FIXME(typhoonzero): find a way to gracefully shutdown the server. 
- os.system("kill -9 %d" % p.pid) + # os.system("kill -9 %d" % p.pid) p.join() def init_serv(self, place): main = fluid.Program() with fluid.program_guard(main): - x = layers.data( - shape=[32, 32], - dtype='float32', - name="X", - append_batch_size=False) - fluid.initializer.Constant(value=1.0)(x, main.global_block()) - serv = layers.ListenAndServ("127.0.0.1:6174", optimizer_mode=False) + serv = layers.ListenAndServ( + "127.0.0.1:6174", ["X"], optimizer_mode=False) with serv.do(): + x = layers.data( + shape=[32, 32], + dtype='float32', + name="X", + append_batch_size=False) + fluid.initializer.Constant(value=1.0)(x, main.global_block()) o = layers.scale(x=x, scale=10.0) main.global_block().create_var( name=o.name, psersistable=False, dtype=o.dtype, shape=o.shape) From 6cebbd7bcb9d9a88aa482efd38ecfc3a5d4e9fa9 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 22 Mar 2018 20:16:24 +0800 Subject: [PATCH 22/58] update --- python/paddle/fluid/tests/unittests/test_recv_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_recv_op.py b/python/paddle/fluid/tests/unittests/test_recv_op.py index f8b7724039..854238c627 100644 --- a/python/paddle/fluid/tests/unittests/test_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_recv_op.py @@ -32,7 +32,7 @@ class TestRecvOp(unittest.TestCase): time.sleep(1) self.init_client(place) # FIXME(typhoonzero): find a way to gracefully shutdown the server. - # os.system("kill -9 %d" % p.pid) + os.system("kill -9 %d" % p.pid) p.join() def init_serv(self, place): From 14ba67c0ef3bcff13d95788406518bb132fe4a28 Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Thu, 22 Mar 2018 08:46:20 -0400 Subject: [PATCH 23/58] Function for running MKLDNN primitive added. Unittest added for is_test attribute --- paddle/fluid/operators/lrn_mkldnn_op.cc | 23 +++++++++++-------- paddle/fluid/operators/lrn_op.cc | 2 +- .../fluid/tests/unittests/test_lrn_op.py | 19 +++++++++++++++ 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 3bead16ce4..0a18882e81 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -36,6 +36,14 @@ std::shared_ptr insert_to_context(const std::string& key, return p; } + +template +void run_primitive(Args&&... 
args) { + auto forward_op = mkldnn::lrn_forward{args...}; + + std::vector pipeline = {forward_op}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); +} } // namespace template @@ -87,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine}, static_cast(output_data)}; - std::unique_ptr forward_op = nullptr; - if (!is_test) { const std::string key = ctx.op().Output("Out"); const std::string key_src_memory = key + "@lrn_src_memory"; @@ -108,9 +114,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { key_workspace_memory, dev_ctx, forward_pd->workspace_primitive_desc()); - forward_op.reset(new mkldnn::lrn_forward{*forward_pd, *src_memory, - *workspace_memory, dst_memory}); - + run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); } else { auto forward_pd = mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; @@ -119,12 +123,8 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto workspace_memory = mkldnn::memory{forward_pd.workspace_primitive_desc()}; - forward_op.reset(new mkldnn::lrn_forward{forward_pd, src_memory, - workspace_memory, dst_memory}); + run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); } - - std::vector pipeline = {*forward_op}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } }; @@ -136,6 +136,9 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); + PADDLE_ENFORCE( + !ctx.Attr("is_test"), + "is_test attribute should be set to False in training phase."); auto x = ctx.Input("X"); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 2b1947a187..b36b5c3a33 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -155,8 +155,8 @@ class LRNOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4."); ctx->SetOutputDim("Out", x_dim); - ctx->SetOutputDim("MidOut", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); + ctx->SetOutputDim("MidOut", x_dim); } framework::OpKernelType GetExpectedKernelType( diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py index 2268eafdbd..8fa480b9bc 100644 --- a/python/paddle/fluid/tests/unittests/test_lrn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py @@ -97,5 +97,24 @@ class TestLRNMKLDNNOp(TestLRNOp): self.check_output(atol=0.002) +class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp): + def get_attrs(self): + attrs = TestLRNMKLDNNOp.get_attrs(self) + attrs['is_test'] = True + return attrs + + def test_check_grad_normal(self): + def check_raise_is_test(): + try: + self.check_grad(['X'], 'Out', max_relative_error=0.01) + except Exception as e: + t = \ + "is_test attribute should be set to False in training phase." 
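+                # check_grad builds the backward pass, which the MKLDNN
+                # grad kernel rejects when is_test is set (see the
+                # PADDLE_ENFORCE added in lrn_mkldnn_op.cc above).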
+ if t in str(e): + raise AttributeError + + self.assertRaises(AttributeError, check_raise_is_test) + + if __name__ == "__main__": unittest.main() From ac94242ea993948e8e6bb54d961d36794c918864 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Thu, 22 Mar 2018 22:55:21 +0800 Subject: [PATCH 24/58] change boost download url to speed up download --- cmake/external/boost.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index d9cd264b49..10662fc967 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost") # So we use 1.41.0 here. set(BOOST_VER "1.41.0") set(BOOST_TAR "boost_1_41_0") -set(BOOST_URL "http://paddlepaddledeps.s3-website-us-west-1.amazonaws.com/${BOOST_TAR}.tar.gz") +set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz") set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE) From 76ae540f8ef3dc5463da6127556fc48a343698c9 Mon Sep 17 00:00:00 2001 From: Varun Arora Date: Thu, 22 Mar 2018 10:44:43 -0700 Subject: [PATCH 25/58] Move Select to concurrency.py; incorporate outputs (#9136) * Move Select to concurrency.py; incorporate outputs * CLang formatting for concurrency * Remove extra bracket - formatting fix - 3 * Comment fix --- paddle/fluid/framework/concurrency_test.cc | 10 +- paddle/fluid/operators/select_op.cc | 5 + python/paddle/fluid/concurrency.py | 182 +++++++++++++++++++- python/paddle/fluid/layers/control_flow.py | 183 +-------------------- 4 files changed, 192 insertions(+), 188 deletions(-) diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc index 25152054eb..e98e9d94bf 100644 --- a/paddle/fluid/framework/concurrency_test.cc +++ b/paddle/fluid/framework/concurrency_test.cc @@ -150,8 +150,9 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program, // Select block AddOp("select", {{"X", {dataChanName, quitChanName}}, {"case_to_execute", {"caseToExecute"}}}, - {}, {{"sub_block", casesBlock}, - {"cases", std::vector{case0Config, case1Config}}}, + {{"Out", {}}}, + {{"sub_block", casesBlock}, + {"cases", std::vector{case0Config, case1Config}}}, whileBlock); scope->Var("stepScopes"); @@ -209,9 +210,8 @@ TEST(Concurrency, Go_Op) { executor.Run(program, &scope, 0, true, true); - // After we call executor.run, the Go operator should do a channel_send to set - // the - // "result" variable to 99 + // After we call executor.run, the Go operator should do a channel_send to + // set the "result" variable to 99. 
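+  // (tensor here is the LoDTensor backing the "result" variable that the
+  // channel_send writes to.)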
auto *finalData = tensor.data(); EXPECT_EQ(finalData[0], 99); } diff --git a/paddle/fluid/operators/select_op.cc b/paddle/fluid/operators/select_op.cc index 8344a239df..c0bf0ff927 100644 --- a/paddle/fluid/operators/select_op.cc +++ b/paddle/fluid/operators/select_op.cc @@ -27,6 +27,7 @@ namespace operators { static constexpr char kX[] = "X"; static constexpr char kCaseToExecute[] = "case_to_execute"; +static constexpr char kOutputs[] = "Out"; static constexpr char kCases[] = "cases"; static constexpr char kCasesBlock[] = "sub_block"; @@ -388,6 +389,10 @@ class SelectOpMaker : public framework::OpProtoAndCheckerMaker { "(Int) The variable the sets the index of the case to execute, " "after evaluating the channels being sent to and received from") .AsDuplicable(); + AddOutput(kOutputs, + "A set of variables, which will be assigned with values " + "generated by the operators inside the cases of Select Op.") + .AsDuplicable(); AddAttr>(kCases, "(String vector) Serialized list of" "all cases in the select op. Each" diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py index 3e4292d235..d65e1a6858 100644 --- a/python/paddle/fluid/concurrency.py +++ b/python/paddle/fluid/concurrency.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from layers.control_flow import BlockGuard, Select +from layers.control_flow import BlockGuard, equal +from .framework import Operator from layer_helper import LayerHelper, unique_name from layers import fill_constant import core @@ -75,6 +76,185 @@ class Go(BlockGuard): attrs={'sub_block': go_block}) +class SelectCase(object): + DEFAULT = 0 + SEND = 1 + RECEIVE = 2 + + def __init__(self, + case_idx, + case_to_execute, + channel_action_fn=None, + channel=None, + value=None): + self.helper = LayerHelper('conditional_block') + self.main_program = self.helper.main_program + self.is_scalar_condition = True + + self.case_to_execute = case_to_execute + self.idx = case_idx + + # Since we aren't going to use the `channel_send` or `channel_recv` + # functions directly, we just need to capture the name. + self.action = (self.SEND + if channel_action_fn.__name__ == ('channel_send') else + self.RECEIVE) if channel_action_fn else self.DEFAULT + self.value = value + self.channel = channel + + def __enter__(self): + self.block = self.main_program.create_block() + + def construct_op(self): + main_program = self.helper.main_program + cases_block = main_program.current_block() + + inner_outputs = set() + input_set = set() + params = set() + + for op in self.block.ops: + # Iterate over all operators, get all the inputs + # and add as input to the SelectCase operator. + for iname in op.input_names: + for in_var_name in op.input(iname): + if in_var_name not in inner_outputs: + input_set.add(in_var_name) + + for oname in op.output_names: + for out_var_name in op.output(oname): + inner_outputs.add(out_var_name) + + param_list = [ + cases_block.var(each_name) for each_name in params + if each_name not in input_set + ] + + # Iterate over all operators, get all the outputs + # add to the output list of SelectCase operator only if + # they exist in the parent block. + out_vars = [] + for inner_out_name in inner_outputs: + if inner_out_name in cases_block.vars: + out_vars.append(cases_block.var(inner_out_name)) + + # First, create an op that will determine whether or not this is the + # conditional variable to execute. 
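+        # (equal() yields a boolean tensor: does this case's fixed index
+        # match the index the select op picked at runtime?)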
+ should_execute_block = equal( + fill_constant( + shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx), + self.case_to_execute) + + step_scope = cases_block.create_var( + type=core.VarDesc.VarType.STEP_SCOPES) + + cases_block.append_op( + type='conditional_block', + inputs={'X': [should_execute_block], + 'Params': param_list}, + outputs={'Out': out_vars, + 'Scope': [step_scope]}, + attrs={ + 'sub_block': self.block, + 'is_scalar_condition': self.is_scalar_condition + }) + + return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name + if self.channel else '', self.value.name + if self.value else '') + + def __exit__(self, exc_type, exc_val, exc_tb): + self.main_program.rollback() + if exc_type is not None: + return False # re-raise exception + return True + + +class Select(BlockGuard): + def __init__(self, name=None): + self.helper = LayerHelper('select', name=name) + self.cases = [] + + super(Select, self).__init__(self.helper.main_program) + self.case_to_execute = fill_constant( + shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1) + + def __enter__(self): + super(Select, self).__enter__() + return self + + def case(self, channel_action_fn, channel, value): + """Create a new block for this condition. + """ + select_case = SelectCase( + len(self.cases), self.case_to_execute, channel_action_fn, channel, + value) + + self.cases.append(select_case) + + return select_case + + def default(self): + """Create a default case block for this condition. + """ + default_case = SelectCase(len(self.cases), self.case_to_execute) + + self.cases.append(default_case) + + return default_case + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is not None: + return False + + # Create a select op and another block to wrap its + # case blocks. + select_block = self.helper.main_program.current_block() + parent_block = self.helper.main_program.block(select_block.parent_idx) + + # Construct each case op, inside the newly created select block. + serialized_cases = [] + for case in self.cases: + serialized_cases.append(case.construct_op()) + + intermediate = set() + params = set() + + for case_block in select_block.ops: + if case_block.attrs and 'sub_block' in case_block.attrs: + for each_op in case_block.attrs['sub_block'].ops: + assert isinstance(each_op, Operator) + for iname in each_op.input_names: + for in_var_name in each_op.input(iname): + if in_var_name not in intermediate: + params.add(in_var_name) + + for oname in each_op.output_names: + for out_var_name in each_op.output(oname): + intermediate.add(out_var_name) + + out_list = [ + parent_block.var(var_name) for var_name in parent_block.vars + if var_name in intermediate + ] + + X = [select_block.var_recursive(x_name) for x_name in params] + + # Needs to be used by `equal` inside the cases block. + X.append(self.case_to_execute) + + # Construct the select op. 
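+        # 'X' carries every variable the cases read; 'Out' carries the
+        # variables they produce, so case results become visible in the
+        # parent block (the output wiring this patch adds).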
+ parent_block.append_op( + type='select', + inputs={'X': X, + 'case_to_execute': self.case_to_execute}, + attrs={'sub_block': select_block, + 'cases': serialized_cases}, + outputs={'Out': out_list}) + + return super(Select, self).__exit__(exc_type, exc_val, exc_tb) + + def make_channel(dtype, capacity=0): """ Helps implementation of a concurrent program by creating a "channel" of diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 02cd0a05a1..1bb1aa30ee 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -16,7 +16,7 @@ import contextlib from layer_function_generator import autodoc from tensor import assign, fill_constant from .. import core -from ..framework import Program, Variable, Operator, Block +from ..framework import Program, Variable, Operator from ..layer_helper import LayerHelper, unique_name from ops import logical_and, logical_not, logical_or @@ -29,7 +29,6 @@ __all__ = [ 'WhileGuard', 'While', 'Switch', - 'Select', 'lod_rank_table', 'max_sequence_len', 'topk', @@ -1212,186 +1211,6 @@ class Switch(object): return True -class SelectCase(object): - DEFAULT = 0 - SEND = 1 - RECEIVE = 2 - - def __init__(self, - case_idx, - case_to_execute, - channel_action_fn=None, - channel=None, - value=None): - self.helper = LayerHelper('conditional_block') - self.main_program = self.helper.main_program - self.is_scalar_condition = True - - self.case_to_execute = case_to_execute - self.idx = case_idx - - # Since we aren't going to use the `channel_send` or `channel_recv` - # functions directly, we just need to capture the name. - self.action = (self.SEND - if channel_action_fn.__name__ == ('channel_send') else - self.RECEIVE) if channel_action_fn else (self.DEFAULT) - self.value = value - self.channel = channel - - def __enter__(self): - self.block = self.main_program.create_block() - - def construct_op(self): - main_program = self.helper.main_program - cases_block = main_program.current_block() - - inner_outputs = set() - input_set = set() - params = set() - - for op in self.block.ops: - # Iterate over all operators, get all the inputs - # and add as input to the SelectCase operator. - for iname in op.input_names: - for in_var_name in op.input(iname): - if in_var_name not in inner_outputs: - input_set.add(in_var_name) - - for oname in op.output_names: - for out_var_name in op.output(oname): - inner_outputs.add(out_var_name) - - param_list = [ - cases_block.var(each_name) for each_name in params - if each_name not in input_set - ] - - # Iterate over all operators, get all the outputs - # add to the output list of SelectCase operator only if - # they exist in the parent block. - out_vars = [] - for inner_out_name in inner_outputs: - if inner_out_name in cases_block.vars: - out_vars.append(cases_block.var(inner_out_name)) - - # First, create an op that will determine whether or not this is the - # conditional variable to execute. 
- should_execute_block = equal( - fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx), - self.case_to_execute) - - step_scope = cases_block.create_var( - type=core.VarDesc.VarType.STEP_SCOPES) - - cases_block.append_op( - type='conditional_block', - inputs={'X': [should_execute_block], - 'Params': param_list}, - outputs={'Out': out_vars, - 'Scope': [step_scope]}, - attrs={ - 'sub_block': self.block, - 'is_scalar_condition': self.is_scalar_condition - }) - - return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name - if self.channel else '', self.value.name - if self.value else '') - - def __exit__(self, exc_type, exc_val, exc_tb): - self.main_program.rollback() - if exc_type is not None: - return False # re-raise exception - return True - - -class Select(BlockGuard): - def __init__(self, name=None): - self.helper = LayerHelper('select', name=name) - self.cases = [] - - super(Select, self).__init__(self.helper.main_program) - self.case_to_execute = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1) - - def __enter__(self): - super(Select, self).__enter__() - return self - - def case(self, channel_action_fn, channel, value): - """Create a new block for this condition. - """ - select_case = SelectCase( - len(self.cases), self.case_to_execute, channel_action_fn, channel, - value) - - self.cases.append(select_case) - - return select_case - - def default(self): - """Create a default case block for this condition. - """ - default_case = SelectCase(len(self.cases), self.case_to_execute) - - self.cases.append(default_case) - - return default_case - - def __exit__(self, exc_type, exc_val, exc_tb): - if exc_type is not None: - return False - - # Create a select op and another block to wrap its - # case blocks. - select_block = self.helper.main_program.current_block() - parent_block = self.helper.main_program.block(select_block.parent_idx) - - # Construct each case op, inside the newly created select block. - serialized_cases = [] - for case in self.cases: - serialized_cases.append(case.construct_op()) - - intermediate = set() - params = set() - - for case_block in select_block.ops: - if case_block.attrs and 'sub_block' in case_block.attrs: - for each_op in case_block.attrs['sub_block'].ops: - assert isinstance(each_op, Operator) - for iname in each_op.input_names: - for in_var_name in each_op.input(iname): - if in_var_name not in intermediate: - params.add(in_var_name) - - for oname in each_op.output_names: - for out_var_name in each_op.output(oname): - intermediate.add(out_var_name) - - # TODO(varunarora): Figure out if defining output is needed. - out_list = [ - parent_block.var(var_name) for var_name in parent_block.vars - if var_name in intermediate - ] - - X = [select_block.var_recursive(x_name) for x_name in params] - - # Needs to be used by `equal` inside the cases block. - X.append(self.case_to_execute) - - # Construct the select op. 
- parent_block.append_op( - type='select', - inputs={'X': X, - 'case_to_execute': self.case_to_execute}, - attrs={'sub_block': select_block, - 'cases': serialized_cases}, - outputs={}) - - return super(Select, self).__exit__(exc_type, exc_val, exc_tb) - - class IfElseBlockGuard(object): def __init__(self, is_true, ifelse): if not isinstance(ifelse, IfElse): From a9a228ad8dc30e2341e0e64b6cb053dc116578e6 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Fri, 23 Mar 2018 18:40:22 +0800 Subject: [PATCH 26/58] fix dist compile --- paddle/fluid/operators/detail/grpc_server.h | 2 ++ paddle/fluid/operators/detail/test_serde.cc | 10 ++++----- paddle/fluid/operators/listen_and_serv_op.cc | 22 +++++++------------- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index f891c75dbc..787e1506e2 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -25,6 +25,8 @@ limitations under the License. */ #include "paddle/fluid/operators/detail/grpc_service.h" #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc index 99c1577223..e646c894d1 100644 --- a/paddle/fluid/operators/detail/test_serde.cc +++ b/paddle/fluid/operators/detail/test_serde.cc @@ -199,9 +199,9 @@ TEST(LodTensor, Run) { RunTestLodTensor(place); RunTestLodTensor(place, 1); #ifdef PADDLE_WITH_CUDA - platform::CUDAPlace place; - RunTestLodTensor(place); - RunTestLodTensor(place, 1); + platform::CUDAPlace gpu(0); + RunTestLodTensor(gpu); + RunTestLodTensor(gpu, 1); #endif } @@ -210,7 +210,7 @@ TEST(SelectedRows, Run) { RunSerdeTestSelectedRows(place); #ifdef PADDLE_WITH_CUDA - platform::CUDAPlace place; - RunSerdeTestSelectedRows(place); + platform::CUDAPlace gpu; + RunSerdeTestSelectedRows(gpu); #endif } diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index d8a3c45ac5..9c788108e2 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -93,12 +93,6 @@ class ListenAndServOp : public framework::OperatorBase { "server program should have at least 2 blocks"); framework::Executor executor(dev_place); - std::vector blk_ctx_list; - blk_ctx_list.push_back(nullptr); // block0 is not used. - for (int blkid = 1; blkid < num_blocks; ++blkid) { - auto *exe_ctx = executor.Prepare(*program, blkid); - blk_ctx_list.push_back(exe_ctx); - } // TODO(typhoonzero): change this to a while_op for every cluster-batch. bool exit_flag = false; @@ -150,11 +144,11 @@ class ListenAndServOp : public framework::OperatorBase { // block0 contains only listen_and_serv op, start run from block1. 
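      // NOTE: sub-blocks are now executed with Executor::Run directly; the
      // prepared-context path was removed since it broke the distributed
      // compile (hence the commented-out cleanup further down).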
for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { fs.push_back(framework::Async( - [&executor, &program, &recv_scope, &blk_ctx_list, blkid]() { + [&executor, &program, &recv_scope, blkid]() { int run_block = blkid; // thread local try { - executor.RunPreparedContext(blk_ctx_list[run_block], - &recv_scope, false, false); + executor.Run(*program, &recv_scope, run_block, + false, false); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } @@ -164,8 +158,8 @@ class ListenAndServOp : public framework::OperatorBase { // Run global block at final step, or block1 if there are only 2 blocks if (num_blocks >= 2) { try { - executor.RunPreparedContext(blk_ctx_list[num_blocks - 1], &recv_scope, - false, false); + executor.Run(*program, &recv_scope, num_blocks - 1, + false, false); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } @@ -185,9 +179,9 @@ class ListenAndServOp : public framework::OperatorBase { sparse_vars.clear(); } // while(true) - for (int i = 0; i < num_blocks; ++i) { - delete blk_ctx_list[i]; - } + // for (int i = 0; i < num_blocks; ++i) { + // delete blk_ctx_list[i]; + // } } protected: From bb815d4364eaaf6c4053fc6c2259ebfa559bca90 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Fri, 23 Mar 2018 19:13:25 +0800 Subject: [PATCH 27/58] update --- .clang_format.hook | 2 +- paddle/fluid/operators/detail/grpc_server.h | 3 +-- paddle/fluid/operators/listen_and_serv_op.cc | 10 ++++------ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/.clang_format.hook b/.clang_format.hook index 1d92821686..edec286b77 100755 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,7 +1,7 @@ #!/bin/bash set -e -readonly VERSION="3.8" +readonly VERSION="7.0" version=$(clang-format -version) diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 787e1506e2..10e6dd45a9 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -22,11 +22,10 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/detail/grpc_service.h" -#include "paddle/fluid/operators/detail/grpc_service.h" #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h" -#include "paddle/fluid/operators/detail/simple_block_queue.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 9c788108e2..08b83375dd 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -143,12 +143,11 @@ class ListenAndServOp : public framework::OperatorBase { std::vector> fs; // block0 contains only listen_and_serv op, start run from block1. 
for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { - fs.push_back(framework::Async( - [&executor, &program, &recv_scope, blkid]() { + fs.push_back( + framework::Async([&executor, &program, &recv_scope, blkid]() { int run_block = blkid; // thread local try { - executor.Run(*program, &recv_scope, run_block, - false, false); + executor.Run(*program, &recv_scope, run_block, false, false); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } @@ -158,8 +157,7 @@ class ListenAndServOp : public framework::OperatorBase { // Run global block at final step, or block1 if there are only 2 blocks if (num_blocks >= 2) { try { - executor.Run(*program, &recv_scope, num_blocks - 1, - false, false); + executor.Run(*program, &recv_scope, num_blocks - 1, false, false); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } From bf66ce04940477375d8d605dcd8ece45ae2a4b61 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Fri, 23 Mar 2018 19:15:05 +0800 Subject: [PATCH 28/58] update --- .clang_format.hook | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.clang_format.hook b/.clang_format.hook index edec286b77..1d92821686 100755 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,7 +1,7 @@ #!/bin/bash set -e -readonly VERSION="7.0" +readonly VERSION="3.8" version=$(clang-format -version) From 043f47b27fa827cd87df93027124dce6d1d22d7e Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 23 Mar 2018 18:29:15 +0800 Subject: [PATCH 29/58] fix concat op --- paddle/fluid/operators/math/concat.cu | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index 60b266f08f..aede380006 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -70,9 +70,8 @@ __global__ void KernelConcat(T** inputs, const int input_col, const int output_rows, const int output_cols, T* output) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - double inv_input_col = 1.0 / input_col; for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { - int split = tid_x * inv_input_col; + int split = tid_x * 1.0 / input_col; int in_offset = tid_x - split * input_col; T* input_ptr = inputs[split]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; @@ -110,17 +109,16 @@ __global__ void KernelConcatGrad(const T* input, const int input_row, template __global__ void KernelConcatGrad(const T* input, const int input_row, - const int input_col, const int output_cols, + const int input_col, const int output_col, T** outputs) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - double inv_input_col = 1.0 / input_col; for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { - int split = tid_x * inv_input_col; - int in_offset = tid_x - split * input_col; + int split = tid_x / output_col; + int in_offset = tid_x - split * output_col; T* output_ptr = outputs[split]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * output_cols + in_offset] = + output_ptr[tid_y * output_col + in_offset] = input[tid_y * input_col + tid_x]; } } From 9075049a2921051f1ae3d685adcd562c76f4f247 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 23 Mar 2018 20:32:48 +0800 Subject: [PATCH 30/58] add unit test --- .../fluid/tests/unittests/test_concat_op.py | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git 
a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index 558f3a4dcb..1e00d67d54 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -20,19 +20,35 @@ from op_test import OpTest class TestConcatOp(OpTest): def setUp(self): self.op_type = "concat" - x0 = np.random.random((2, 1, 4, 5)).astype('float32') - x1 = np.random.random((2, 2, 4, 5)).astype('float32') - x2 = np.random.random((2, 3, 4, 5)).astype('float32') - axis = 1 - self.inputs = {'X': [('x0', x0), ('x1', x1), ('x2', x2)]} - self.attrs = {'axis': axis} - self.outputs = {'Out': np.concatenate((x0, x1, x2), axis=axis)} + self.init_test_data() + self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} + self.attrs = {'axis': self.axis} + self.outputs = { + 'Out': np.concatenate( + (self.x0, self.x1, self.x2), axis=self.axis) + } def test_check_output(self): self.check_output() def test_check_grad(self): self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') + + def init_test_data(self): + self.x0 = np.random.random((2, 1, 4, 5)).astype('float32') + self.x1 = np.random.random((2, 2, 4, 5)).astype('float32') + self.x2 = np.random.random((2, 3, 4, 5)).astype('float32') + self.axis = 1 + + +class TestConcatOp2(OpTest): + def init_test_data(self): + self.x0 = np.random.random((2, 3, 4, 5)).astype('float32') + self.x1 = np.random.random((2, 3, 4, 5)).astype('float32') + self.x2 = np.random.random((2, 3, 4, 5)).astype('float32') + self.axis = 1 if __name__ == '__main__': From 750aff10cebd03c3a52bec28508cc5a6195ef937 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 23 Mar 2018 21:00:24 +0800 Subject: [PATCH 31/58] code refine --- paddle/fluid/operators/math/concat.cu | 148 +++++++++++++------------- 1 file changed, 74 insertions(+), 74 deletions(-) diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index aede380006..1b637446a0 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -66,60 +66,60 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, } template -__global__ void KernelConcat(T** inputs, const int input_col, - const int output_rows, const int output_cols, - T* output) { +__global__ void KernelConcat(T** inputs_data, const int fixed_in_col, + const int out_rows, const int out_cols, + T* output_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { - int split = tid_x * 1.0 / input_col; - int in_offset = tid_x - split * input_col; - T* input_ptr = inputs[split]; + for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) { + int split = tid_x * 1.0 / fixed_in_col; + int in_offset = tid_x - split * fixed_in_col; + T* input_ptr = inputs_data[split]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) { - output[tid_y * output_cols + tid_x] = - input_ptr[tid_y * input_col + in_offset]; + for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) { + output_data[tid_y * out_cols + tid_x] = + input_ptr[tid_y * fixed_in_col + in_offset]; } } } template -__global__ void KernelConcatGrad(const T* input, const int input_row, - const int input_col, const int* output_cols, - int col_size, T** outputs) { +__global__ void KernelConcatGrad(const T* input_data, const int in_row, + const int in_col, const int* 
out_cols, + int out_cols_size, T** outputs_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - int segment = upper_bound(output_cols, col_size, tid_x) - 1; - int curr_offset = output_cols[segment]; + int segment = upper_bound(out_cols, out_cols_size, tid_x) - 1; + int curr_offset = out_cols[segment]; int curr_segment = segment; - for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { T curr_col_offset; - while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) { + while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) { curr_offset = curr_col_offset; ++curr_segment; } int local_col = tid_x - curr_offset; int segment_width = curr_col_offset - curr_offset; - T* output_ptr = outputs[curr_segment]; + T* output_ptr = outputs_data[curr_segment]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) output_ptr[tid_y * segment_width + local_col] = - input[tid_y * input_col + tid_x]; + input_data[tid_y * in_col + tid_x]; } } template -__global__ void KernelConcatGrad(const T* input, const int input_row, - const int input_col, const int output_col, - T** outputs) { +__global__ void KernelConcatGrad(const T* input_data, const int in_row, + const int in_col, const int fixed_out_col, + T** outputs_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { - int split = tid_x / output_col; - int in_offset = tid_x - split * output_col; - T* output_ptr = outputs[split]; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x / fixed_out_col; + int in_offset = tid_x - split * fixed_out_col; + T* output_ptr = outputs_data[split]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * output_col + in_offset] = - input[tid_y * input_col + tid_x]; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; } } @@ -134,41 +134,40 @@ class ConcatFunctor { const std::vector& input, const int axis, framework::Tensor* output) { // TODO(zcd): Add input data validity checking - int num = input.size(); - int rows = 1; + int in_num = input.size(); + int in_row = 1; auto dim_0 = input[0].dims(); for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; + in_row *= dim_0[i]; } - int cols = input[0].numel() / rows; - int out_rows = rows, out_cols = 0; + int in_col = input[0].numel() / in_row; + int out_row = in_row, out_col = 0; - framework::Vector inputs_data(num * sizeof(T*) / 2); - framework::Vector inputs_cols(num + 1); - inputs_cols[0] = 0; + framework::Vector inputs_data(in_num * sizeof(T*) / 2); + framework::Vector inputs_col(in_num + 1); T** inputs_ptr = reinterpret_cast(inputs_data.data()); + inputs_col[0] = 0; bool sameShape = true; - for (int i = 0; i < num; ++i) { - int t_cols = input[i].numel() / rows; + for (int i = 0; i < in_num; ++i) { + int t_cols = input[i].numel() / in_row; if (sameShape) { - if (t_cols != cols) sameShape = false; + if (t_cols != in_col) sameShape = false; } - out_cols += t_cols; - inputs_cols[i + 1] = out_cols; + out_col += t_cols; + inputs_col[i + 1] = out_col; inputs_ptr[i] = const_cast(input[i].data()); } - T** ins_gpu = + T** dev_ins_data = reinterpret_cast(inputs_data.CUDAMutableData(context.GetPlace())); - const int* ins_col_gpu = 
inputs_cols.CUDAData(context.GetPlace()); // computation // set the thread block and grid according to CurrentDeviceId const int kThreadsPerBlock = 1024; int block_cols = kThreadsPerBlock; - if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. - block_cols = ((out_cols + 31) >> 5) << 5; + if (out_col < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((out_col + 31) >> 5) << 5; } int block_rows = kThreadsPerBlock / block_cols; dim3 block_size = dim3(block_cols, block_rows, 1); @@ -177,18 +176,19 @@ class ConcatFunctor { int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int grid_cols = - std::min((out_cols + block_cols - 1) / block_cols, max_blocks); + std::min((out_col + block_cols - 1) / block_cols, max_blocks); int grid_rows = - std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); + std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1)); dim3 grid_size = dim3(grid_cols, grid_rows, 1); if (sameShape) { KernelConcat<<>>( - ins_gpu, cols, out_rows, out_cols, output->data()); + dev_ins_data, in_col, out_row, out_col, output->data()); } else { + const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace()); KernelConcat<<>>( - ins_gpu, ins_col_gpu, static_cast(inputs_cols.size()), out_rows, - out_cols, output->data()); + dev_ins_data, dev_ins_col_data, static_cast(inputs_col.size()), + out_row, out_col, output->data()); } } }; @@ -204,41 +204,40 @@ class ConcatGradFunctor { const framework::Tensor& input, const int axis, std::vector& outputs) { // TODO(zcd): Add input data validity checking - int num = outputs.size(); - int input_row = 1; + int o_num = outputs.size(); + int out_row = 1; auto dim_0 = outputs[0].dims(); for (int i = 0; i < axis; ++i) { - input_row *= dim_0[i]; + out_row *= dim_0[i]; } - int output_col_0 = outputs[0].numel() / input_row; - int input_col = 0; + int out_col = outputs[0].numel() / out_row; + int in_col = 0, in_row = out_row; bool sameShape = true; - framework::Vector outputs_data(num * sizeof(T*) / 2); - framework::Vector outputs_cols(num + 1); - outputs_cols[0] = 0; + framework::Vector outputs_data(o_num * sizeof(T*) / 2); + framework::Vector outputs_cols(o_num + 1); T** outputs_ptr = reinterpret_cast(outputs_data.data()); - for (int i = 0; i < num; ++i) { - int t_col = outputs[i].numel() / input_row; + outputs_cols[0] = 0; + for (int i = 0; i < o_num; ++i) { + int t_col = outputs[i].numel() / out_row; if (sameShape) { - if (t_col != output_col_0) sameShape = false; + if (t_col != out_col) sameShape = false; } - input_col += t_col; - outputs_cols[i + 1] = input_col; + in_col += t_col; + outputs_cols[i + 1] = in_col; outputs_ptr[i] = outputs[i].data(); } - T** outs_gpu = + T** dev_out_gpu_data = reinterpret_cast(outputs_data.CUDAMutableData(context.GetPlace())); - const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace()); // computation const int kThreadsPerBlock = 1024; int block_cols = kThreadsPerBlock; - if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32. - block_cols = ((input_col + 31) >> 5) << 5; + if (in_col < kThreadsPerBlock) { // block_cols is aligned by 32. 
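+      // ((n + 31) >> 5) << 5 rounds n up to the next multiple of 32,
+      // keeping the block width warp-aligned.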
+ block_cols = ((in_col + 31) >> 5) << 5; } int block_rows = kThreadsPerBlock / block_cols; dim3 block_size = dim3(block_cols, block_rows, 1); @@ -247,18 +246,19 @@ class ConcatGradFunctor { int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int grid_cols = - std::min((input_col + block_cols - 1) / block_cols, max_blocks); + std::min((in_col + block_cols - 1) / block_cols, max_blocks); int grid_rows = - std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1)); + std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1)); dim3 grid_size = dim3(grid_cols, grid_rows, 1); if (sameShape) { KernelConcatGrad<<>>( - input.data(), input_row, input_col, output_col_0, outs_gpu); + input.data(), in_row, in_col, out_col, dev_out_gpu_data); } else { + const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace()); KernelConcatGrad<<>>( - input.data(), input_row, input_col, outs_col_gpu, - static_cast(outputs_cols.size()), outs_gpu); + input.data(), in_row, in_col, dev_outs_col_data, + static_cast(outputs_cols.size()), dev_out_gpu_data); } } }; From 4466f0bec8c23558536959d06b45a1b4c2daab70 Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Wed, 14 Mar 2018 16:10:54 +0100 Subject: [PATCH 32/58] MKLDNN Relu Tanh Sqrt Abs activations added --- paddle/fluid/framework/operator.h | 8 + paddle/fluid/operators/CMakeLists.txt | 5 + .../fluid/operators/activation_mkldnn_op.cc | 192 ++++++++++++++++++ paddle/fluid/operators/activation_op.cc | 52 ++++- paddle/fluid/operators/activation_op.h | 65 +++++- paddle/fluid/platform/mkldnn_helper.h | 1 + python/paddle/fluid/layer_helper.py | 2 + .../paddle/fluid/tests/unittests/op_test.py | 12 +- .../tests/unittests/test_activation_op.py | 67 ++++++ 9 files changed, 401 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/activation_mkldnn_op.cc diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 41214b41cb..d354714d0e 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -84,6 +84,10 @@ class OperatorBase { return boost::get(attrs_.at(name)); } + inline bool HasAttr(const std::string& name) const { + return attrs_.count(name) != 0; + } + /// if scope is not null, also show dimensions of arguments virtual std::string DebugStringEx(const Scope* scope) const; @@ -195,6 +199,10 @@ class ExecutionContext { return op_.Attr(name); } + inline bool HasAttr(const std::string& name) const { + return op_.HasAttr(name); + } + size_t InputSize(const std::string& name) const { return op_.Inputs(name).size(); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index c0245379ac..9c367dd145 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -153,7 +153,12 @@ function(op_library TARGET) # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) + # Append first implemented MKLDNN activation operator + if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") + else() file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") + endif() endif() # pybind USE_OP diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc new file mode 100644 index 0000000000..65cf2fceb7 --- /dev/null +++ b/paddle/fluid/operators/activation_mkldnn_op.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "mkldnn.hpp" +#include "paddle/fluid/operators/activation_op.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; + +namespace { +template +void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm, + const T alpha = 0, const T beta = 0) { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto &dev_ctx = ctx.template device_context(); + const auto &mkldnn_engine = dev_ctx.GetEngine(); + + // get buffers + const auto *src = ctx.template Input("X"); + const auto *src_data = src->template data(); + + auto *dst = ctx.template Output("Out"); + const T *dst_data = dst->template mutable_data(ctx.GetPlace()); + + // get memory dim + PADDLE_ENFORCE(src->dims().size() == 4, + "Input dim must be with 4, i.e. NCHW"); + std::vector src_tz = framework::vectorize2int(src->dims()); + + // create memory description + // TODO(kbinias-intel): support more formats + auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + + // create memory primitives + auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data); + auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data); + + auto forward_desc = mkldnn::eltwise_forward::desc( + mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta); + + // save prim desc into global device context to be referred in backward path + const std::string key = ctx.op().Output("Out"); + const std::string key_eltwise_pd = key + "@eltwise_pd"; + auto forward_pd = std::make_shared( + forward_desc, mkldnn_engine); + dev_ctx.SetBlob(key_eltwise_pd, forward_pd); + + auto eltwise = mkldnn::eltwise_forward(*forward_pd, src_memory, dst_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline = {eltwise}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); +} + +template +void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm, + const T alpha = 0, const T beta = 0) { + auto &dev_ctx = ctx.template device_context(); + const auto &mkldnn_engine = dev_ctx.GetEngine(); + + // get buffers + const auto *x = ctx.template Input("X"); + const auto *src = x->template data(); + + auto *dout = ctx.template Input(framework::GradVarName("Out")); + const auto *diff_dst = dout->template data(); + + auto *dx = + ctx.template Output(framework::GradVarName("X")); + const T *diff_src = dx->template mutable_data(ctx.GetPlace()); + + // get memory dim + std::vector src_tz = framework::vectorize2int(x->dims()); + + // create memory description + auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + + // create memory primitives + auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src); + auto diff_src_memory = + mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src); + auto 
diff_dst_memory = + mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst); + + auto backward_desc = + mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta); + + // retrieve eltwise primitive desc from device context + const std::string key = ctx.op().Input("Out"); + const std::string key_eltwise_pd = key + "@eltwise_pd"; + const std::shared_ptr forward_pd = dev_ctx.GetBlob(key_eltwise_pd); + PADDLE_ENFORCE(forward_pd != nullptr, + "Fail to find eltwise_pd in device context"); + auto *p_forward_pd = + static_cast(forward_pd.get()); + + auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc( + backward_desc, mkldnn_engine, *p_forward_pd); + + auto eltwise_bwd = mkldnn::eltwise_backward(eltwise_bwd_prim_desc, src_memory, + diff_dst_memory, diff_src_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline = {eltwise_bwd}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); +} +} // anonymous namespace + +template +struct MKLDNNActivationFunc : public BaseActivationFunctor { + template + void operator()(const ExecContext &ctx) const { + eltwise_forward(ctx, algorithm); + } +}; + +template +struct MKLDNNActivationGradFunc : public BaseActivationFunctor { + template + void operator()(const ExecContext &ctx) const { + eltwise_grad(ctx, algorithm); + } +}; + +template +using ReluMkldnnFunctor = + MKLDNNActivationFunc; + +template +using TanhMkldnnFunctor = + MKLDNNActivationFunc; + +template +using SqrtMkldnnFunctor = + MKLDNNActivationFunc; + +template +using AbsMkldnnFunctor = + MKLDNNActivationFunc; + +template +using ReluMkldnnGradFunctor = + MKLDNNActivationGradFunc; + +template +using TanhMkldnnGradFunctor = + MKLDNNActivationGradFunc; + +template +using SqrtMkldnnGradFunctor = + MKLDNNActivationGradFunc; + +template +using AbsMkldnnGradFunctor = + MKLDNNActivationGradFunc; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \ + REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace, \ + ops::MKLDNNActivationKernel>); \ + REGISTER_OP_KERNEL( \ + act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \ + ops::MKLDNNActivationGradKernel>); + +#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ + __macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor) \ + __macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor) \ + __macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor) \ + __macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor); + +FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index ec637658c0..ae9ca9d4ff 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -25,6 +25,11 @@ class ActivationOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return ActivationHelper().GetKernelType(ctx, *this); + } }; class ActivationOpGrad : public framework::OperatorWithKernel { @@ -34,6 +39,11 @@ class ActivationOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return ActivationHelper().GetKernelType(ctx, *this); + } }; class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,6 +97,16 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Relu operator"); AddOutput("Out", "Output of Relu operator"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); AddComment(R"DOC( Relu Activation Operator. @@ -140,6 +160,16 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker { : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Tanh operator"); AddOutput("Out", "Output of Tanh operator"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); AddComment(R"DOC( Tanh Activation Operator. @@ -193,6 +223,16 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Sqrt operator"); AddOutput("Out", "Output of Sqrt operator"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); AddComment(R"DOC( Sqrt Activation Operator. @@ -208,6 +248,16 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker { : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Abs operator"); AddOutput("Out", "Output of Abs operator"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); AddComment(R"DOC( Abs Activation Operator. 
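
A quick usage sketch may help here (not part of the patch): with these attributes in place, routing an activation through the new MKL-DNN kernels only needs a CPU place and use_mkldnn=True. Layer names and shapes below are illustrative, and it assumes conv2d forwards a use_mkldnn kwarg to LayerHelper, as the layer_helper.py change further down expects:

    import numpy as np
    import paddle.fluid as fluid

    # NCHW input; eltwise_forward above enforces a 4-D tensor.
    x = fluid.layers.data(name='x', shape=[3, 32, 32], dtype='float32')
    # act='relu' plus use_mkldnn=True should make LayerHelper append a
    # relu op carrying the use_mkldnn attribute, selecting these kernels.
    y = fluid.layers.conv2d(
        input=x, num_filters=8, filter_size=3, act='relu', use_mkldnn=True)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    out, = exe.run(feed={'x': np.random.rand(1, 3, 32, 32).astype('float32')},
                   fetch_list=[y])

Note the 4-D requirement: because eltwise_forward enforces NCHW input, the activation must follow a layer producing a 4-D tensor (e.g. conv2d), not fc.
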
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index b95e793586..084b6bace7 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,9 +17,36 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { +class ActivationHelper { + public: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx, + const framework::OperatorWithKernel& oper) const { + framework::LibraryType library{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + } +#endif + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + if (ctx.HasAttr("data_format")) { + std::string data_format = ctx.Attr("data_format"); + layout = framework::StringToDataLayout(data_format); + } + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.GetPlace(), layout, library); + } +}; + template class ActivationKernel : public framework::OpKernel { @@ -49,6 +76,27 @@ class ActivationKernel } }; +template +class MKLDNNActivationKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(!context.HasAttr("X"), + "Cannot find input tensor X, variable name = %s", + context.op().Input("X")); + PADDLE_ENFORCE(!context.HasAttr("Out"), + "Cannot find output tensor Out, variable name = %s", + context.op().Output("Out")); + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(context); + } +}; + template class ActivationGradKernel : public framework::OpKernel { @@ -77,6 +125,21 @@ class ActivationGradKernel } }; +template +class MKLDNNActivationGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(context); + } +}; + template struct BaseActivationFunctor { using ELEMENT_TYPE = T; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 90b78142b8..281d38cb8a 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -42,6 +42,7 @@ inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector& dims, } inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) { + if (!ctx.HasAttr("use_mkldnn")) return false; bool use_mkldnn = ctx.Attr("use_mkldnn"); return use_mkldnn && platform::is_cpu_place(ctx.GetPlace()); } diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 58b6682271..d771837fc5 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -403,6 +403,8 @@ class LayerHelper(object): if 'use_mkldnn' in self.kwargs: act['use_mkldnn'] = 
self.kwargs.get('use_mkldnn') act_type = act.pop('type') + if 'use_mkldnn' in self.kwargs: + act['use_mkldnn'] = self.kwargs.get('use_mkldnn') self.append_op( type=act_type, inputs={"X": [input_var]}, diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 8393f7827b..2b10f16688 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -215,7 +215,8 @@ class OpTest(unittest.TestCase): '''Fix random seeds to remove randomness from tests''' cls._np_rand_state = np.random.get_state() cls._py_rand_state = random.getstate() - + cls.use_mkldnn = False + cls.data_format = 'AnyLayout' np.random.seed(123) random.seed(124) @@ -340,7 +341,14 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has different lod at " + str(place)) + def fill_attrs(self): + attrs = self.attrs if hasattr(self, "attrs") else dict() + attrs["use_mkldnn"] = self.use_mkldnn + attrs["data_format"] = self.data_format + return attrs + def check_output(self, atol=1e-5): + self.attrs = self.fill_attrs() places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) @@ -348,6 +356,7 @@ class OpTest(unittest.TestCase): self.check_output_with_place(place, atol) def check_output_customized(self, checker): + self.attrs = self.fill_attrs() places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) @@ -383,6 +392,7 @@ class OpTest(unittest.TestCase): in_place=False, max_relative_error=0.005, user_defined_grads=None): + self.attrs = self.fill_attrs() places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 1e3decfbaf..c6c86a5969 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -506,5 +506,72 @@ class TestSwish(OpTest): self.check_grad(['X'], 'Out', max_relative_error=0.008) +#--------------------test MKLDNN-------------------- +class TestMKLDNNRelu(OpTest): + def setUp(self): + self.op_type = "relu" + x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + self.inputs = {'X': x} + self.outputs = {'Out': np.maximum(self.inputs['X'], 0)} + self.use_mkldnn = True + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + +class TestMKLDNNTanh(OpTest): + def setUp(self): + self.op_type = "tanh" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") + } + self.outputs = {'Out': np.tanh(self.inputs['X'])} + self.use_mkldnn = True + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + +class TestMKLDNNSqrt(OpTest): + def setUp(self): + self.op_type = "sqrt" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") + } + self.outputs = {'Out': np.sqrt(self.inputs['X'])} + self.use_mkldnn = True + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + +class TestMKLDNNAbs(OpTest): + def setUp(self): + 
self.op_type = "abs" + x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + self.inputs = {'X': x} + self.outputs = {'Out': np.abs(self.inputs['X'])} + self.use_mkldnn = True + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + if __name__ == "__main__": unittest.main() From a64b312e3a922ea1e0520d59950e81189748c7f4 Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Tue, 20 Mar 2018 11:22:12 +0100 Subject: [PATCH 33/58] Correcting for PR comments --- paddle/fluid/framework/operator.h | 8 --- .../fluid/operators/activation_mkldnn_op.cc | 11 ++-- paddle/fluid/operators/activation_op.cc | 28 -------- paddle/fluid/operators/activation_op.h | 40 ------------ paddle/fluid/operators/mkldnn_activation_op.h | 64 +++++++++++++++++++ paddle/fluid/platform/mkldnn_helper.h | 1 - .../paddle/fluid/tests/unittests/op_test.py | 12 +--- .../tests/unittests/test_activation_op.py | 8 +-- 8 files changed, 75 insertions(+), 97 deletions(-) create mode 100644 paddle/fluid/operators/mkldnn_activation_op.h diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index d354714d0e..41214b41cb 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -84,10 +84,6 @@ class OperatorBase { return boost::get(attrs_.at(name)); } - inline bool HasAttr(const std::string& name) const { - return attrs_.count(name) != 0; - } - /// if scope is not null, also show dimensions of arguments virtual std::string DebugStringEx(const Scope* scope) const; @@ -199,10 +195,6 @@ class ExecutionContext { return op_.Attr(name); } - inline bool HasAttr(const std::string& name) const { - return op_.HasAttr(name); - } - size_t InputSize(const std::string& name) const { return op_.Inputs(name).size(); } diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc index 65cf2fceb7..6ff363d766 100644 --- a/paddle/fluid/operators/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/activation_mkldnn_op.cc @@ -13,6 +13,7 @@ limitations under the License. 
*/ #include "mkldnn.hpp" +#include "mkldnn_activation_op.h" #include "paddle/fluid/operators/activation_op.h" namespace paddle { @@ -183,10 +184,10 @@ namespace ops = paddle::operators; act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \ ops::MKLDNNActivationGradKernel>); -#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ - __macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor) \ - __macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor) \ - __macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor) \ - __macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor); +#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ + __macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \ + __macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \ + __macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \ + __macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index ae9ca9d4ff..043ffb01fc 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -100,13 +100,6 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); - AddAttr( - "data_format", - "(string, default NCHW) Only used in " - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\". Specify the data format of the output data, " - "the input will be transformed automatically. ") - .SetDefault("AnyLayout"); AddComment(R"DOC( Relu Activation Operator. @@ -163,13 +156,6 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); - AddAttr( - "data_format", - "(string, default NCHW) Only used in " - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\". Specify the data format of the output data, " - "the input will be transformed automatically. ") - .SetDefault("AnyLayout"); AddComment(R"DOC( Tanh Activation Operator. @@ -226,13 +212,6 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); - AddAttr( - "data_format", - "(string, default NCHW) Only used in " - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\". Specify the data format of the output data, " - "the input will be transformed automatically. ") - .SetDefault("AnyLayout"); AddComment(R"DOC( Sqrt Activation Operator. @@ -251,13 +230,6 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); - AddAttr( - "data_format", - "(string, default NCHW) Only used in " - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\". Specify the data format of the output data, " - "the input will be transformed automatically. ") - .SetDefault("AnyLayout"); AddComment(R"DOC( Abs Activation Operator. 
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 084b6bace7..e607a5554f 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -37,10 +37,6 @@ class ActivationHelper { } #endif framework::DataLayout layout = framework::DataLayout::kAnyLayout; - if (ctx.HasAttr("data_format")) { - std::string data_format = ctx.Attr("data_format"); - layout = framework::StringToDataLayout(data_format); - } return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), layout, library); @@ -76,27 +72,6 @@ class ActivationKernel } }; -template -class MKLDNNActivationKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE(!context.HasAttr("X"), - "Cannot find input tensor X, variable name = %s", - context.op().Input("X")); - PADDLE_ENFORCE(!context.HasAttr("Out"), - "Cannot find output tensor Out, variable name = %s", - context.op().Output("Out")); - Functor functor; - - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = context.Attr(attr.first); - } - functor(context); - } -}; - template class ActivationGradKernel : public framework::OpKernel { @@ -125,21 +100,6 @@ class ActivationGradKernel } }; -template -class MKLDNNActivationGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - Functor functor; - - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = context.Attr(attr.first); - } - functor(context); - } -}; - template struct BaseActivationFunctor { using ELEMENT_TYPE = T; diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn_activation_op.h new file mode 100644 index 0000000000..976e362911 --- /dev/null +++ b/paddle/fluid/operators/mkldnn_activation_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class MKLDNNActivationKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(context.Input("X") != nullptr, + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + PADDLE_ENFORCE(context.Output("Out") != nullptr, + "Cannot find output tensor Out, variable name = %s", + context.op().Output("Out")); + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(context); + } +}; + +template +class MKLDNNActivationGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(context); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 281d38cb8a..90b78142b8 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -42,7 +42,6 @@ inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector& dims, } inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) { - if (!ctx.HasAttr("use_mkldnn")) return false; bool use_mkldnn = ctx.Attr("use_mkldnn"); return use_mkldnn && platform::is_cpu_place(ctx.GetPlace()); } diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 2b10f16688..8393f7827b 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -215,8 +215,7 @@ class OpTest(unittest.TestCase): '''Fix random seeds to remove randomness from tests''' cls._np_rand_state = np.random.get_state() cls._py_rand_state = random.getstate() - cls.use_mkldnn = False - cls.data_format = 'AnyLayout' + np.random.seed(123) random.seed(124) @@ -341,14 +340,7 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has different lod at " + str(place)) - def fill_attrs(self): - attrs = self.attrs if hasattr(self, "attrs") else dict() - attrs["use_mkldnn"] = self.use_mkldnn - attrs["data_format"] = self.data_format - return attrs - def check_output(self, atol=1e-5): - self.attrs = self.fill_attrs() places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) @@ -356,7 +348,6 @@ class OpTest(unittest.TestCase): self.check_output_with_place(place, atol) def check_output_customized(self, checker): - self.attrs = self.fill_attrs() places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) @@ -392,7 +383,6 @@ class OpTest(unittest.TestCase): in_place=False, max_relative_error=0.005, user_defined_grads=None): - self.attrs = self.fill_attrs() places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 
c6c86a5969..1d53737ac1 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -515,7 +515,7 @@ class TestMKLDNNRelu(OpTest): x[np.abs(x) < 0.005] = 0.02 self.inputs = {'X': x} self.outputs = {'Out': np.maximum(self.inputs['X'], 0)} - self.use_mkldnn = True + self.attrs = {"use_mkldnn": True} def test_check_output(self): self.check_output() @@ -531,7 +531,7 @@ class TestMKLDNNTanh(OpTest): 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") } self.outputs = {'Out': np.tanh(self.inputs['X'])} - self.use_mkldnn = True + self.attrs = {"use_mkldnn": True} def test_check_output(self): self.check_output() @@ -547,7 +547,7 @@ class TestMKLDNNSqrt(OpTest): 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") } self.outputs = {'Out': np.sqrt(self.inputs['X'])} - self.use_mkldnn = True + self.attrs = {"use_mkldnn": True} def test_check_output(self): self.check_output() @@ -564,7 +564,7 @@ class TestMKLDNNAbs(OpTest): x[np.abs(x) < 0.005] = 0.02 self.inputs = {'X': x} self.outputs = {'Out': np.abs(self.inputs['X'])} - self.use_mkldnn = True + self.attrs = {"use_mkldnn": True} def test_check_output(self): self.check_output() From d8bd436fc16497e1f29de2b1f4c2d6f59abb80de Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Wed, 21 Mar 2018 15:48:26 +0100 Subject: [PATCH 34/58] Fixed tests --- paddle/fluid/operators/activation_op.cc | 27 ++++------- paddle/fluid/operators/activation_op.h | 19 -------- paddle/fluid/operators/mkldnn_activation_op.h | 47 +++++++++++++++++++ 3 files changed, 56 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 043ffb01fc..979115eee0 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/mkldnn_activation_op.h" namespace paddle { namespace operators { @@ -25,11 +26,6 @@ class ActivationOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return ActivationHelper().GetKernelType(ctx, *this); - } }; class ActivationOpGrad : public framework::OperatorWithKernel { @@ -39,11 +35,6 @@ class ActivationOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); } - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return ActivationHelper().GetKernelType(ctx, *this); - } }; class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { @@ -546,11 +537,11 @@ REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker, REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, ops::ActivationOpGrad); -REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad, - ops::ActivationOpGrad); +REGISTER_OP(relu, ops::ActivationWithMKLDNNOp, ops::ReluOpMaker, relu_grad, + ops::ActivationWithMKLDNNOpGrad); -REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, - ops::ActivationOpGrad); +REGISTER_OP(tanh, ops::ActivationWithMKLDNNOp, ops::TanhOpMaker, tanh_grad, + ops::ActivationWithMKLDNNOpGrad); REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, tanh_shrink_grad, ops::ActivationOpGrad); @@ -558,11 +549,11 @@ REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker, softshrink_grad, ops::ActivationOpGrad); -REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, - ops::ActivationOpGrad); +REGISTER_OP(sqrt, ops::ActivationWithMKLDNNOp, ops::SqrtOpMaker, sqrt_grad, + ops::ActivationWithMKLDNNOpGrad); -REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, - ops::ActivationOpGrad); +REGISTER_OP(abs, ops::ActivationWithMKLDNNOp, ops::AbsOpMaker, abs_grad, + ops::ActivationWithMKLDNNOpGrad); REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad, ops::ActivationOpGrad); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index e607a5554f..4c575b4a7b 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -24,25 +24,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -class ActivationHelper { - public: - framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx, - const framework::OperatorWithKernel& oper) const { - framework::LibraryType library{framework::LibraryType::kPlain}; -#ifdef PADDLE_WITH_MKLDNN - if (library == framework::LibraryType::kPlain && - platform::CanMKLDNNBeUsed(ctx)) { - library = framework::LibraryType::kMKLDNN; - } -#endif - framework::DataLayout layout = framework::DataLayout::kAnyLayout; - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.GetPlace(), layout, library); - } -}; - template class ActivationKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn_activation_op.h index 976e362911..083d03ebe6 100644 --- a/paddle/fluid/operators/mkldnn_activation_op.h +++ b/paddle/fluid/operators/mkldnn_activation_op.h @@ -60,5 +60,52 @@ class MKLDNNActivationGradKernel } }; +namespace { +framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx, + const framework::OperatorWithKernel& oper) { + framework::LibraryType library{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + } +#endif + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.GetPlace(), layout, library); +} +} // anonymous namespace + +class ActivationWithMKLDNNOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this); + } +}; + +class ActivationWithMKLDNNOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this); + } +}; + } // namespace operators } // namespace paddle From 6461e800a5404762e6105a4080625bee64b1c2b0 Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Thu, 22 Mar 2018 15:47:02 +0100 Subject: [PATCH 35/58] Inheritance added for MKLDNN tests --- .../tests/unittests/test_activation_op.py | 50 ++++++------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 1d53737ac1..4a2b35322d 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -507,58 +507,46 @@ class TestSwish(OpTest): #--------------------test MKLDNN-------------------- -class TestMKLDNNRelu(OpTest): +class TestMKLDNNRelu(TestRelu): def setUp(self): - self.op_type = "relu" + super(TestMKLDNNRelu, self).setUp() + x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") # The same reason with TestAbs x[np.abs(x) < 0.005] = 0.02 - 
self.inputs = {'X': x} - self.outputs = {'Out': np.maximum(self.inputs['X'], 0)} - self.attrs = {"use_mkldnn": True} - - def test_check_output(self): - self.check_output() + out = np.maximum(x, 0) - def test_check_grad(self): - self.check_grad(['X'], 'Out', max_relative_error=0.007) + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + self.attrs = {"use_mkldnn": True} -class TestMKLDNNTanh(OpTest): +class TestMKLDNNTanh(TestTanh): def setUp(self): - self.op_type = "tanh" + super(TestMKLDNNTanh, self).setUp() + self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") } self.outputs = {'Out': np.tanh(self.inputs['X'])} self.attrs = {"use_mkldnn": True} - def test_check_output(self): - self.check_output() - def test_check_grad(self): - self.check_grad(['X'], 'Out', max_relative_error=0.007) - - -class TestMKLDNNSqrt(OpTest): +class TestMKLDNNSqrt(TestSqrt): def setUp(self): - self.op_type = "sqrt" + super(TestMKLDNNSqrt, self).setUp() + self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") } self.outputs = {'Out': np.sqrt(self.inputs['X'])} self.attrs = {"use_mkldnn": True} - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out', max_relative_error=0.007) - -class TestMKLDNNAbs(OpTest): +class TestMKLDNNAbs(TestAbs): def setUp(self): - self.op_type = "abs" + super(TestMKLDNNAbs, self).setUp() + x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") # The same reason with TestAbs x[np.abs(x) < 0.005] = 0.02 @@ -566,12 +554,6 @@ class TestMKLDNNAbs(OpTest): self.outputs = {'Out': np.abs(self.inputs['X'])} self.attrs = {"use_mkldnn": True} - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out', max_relative_error=0.007) - if __name__ == "__main__": unittest.main() From 30c750ebb99cd5fda477457679f3b3b39fd04f84 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Fri, 23 Mar 2018 10:27:36 -0700 Subject: [PATCH 36/58] Fix links to english docs --- doc/v2/howto/index_en.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/v2/howto/index_en.rst b/doc/v2/howto/index_en.rst index bf2320a169..35ef197f58 100644 --- a/doc/v2/howto/index_en.rst +++ b/doc/v2/howto/index_en.rst @@ -6,32 +6,32 @@ PaddlePaddle provides the users the ability to flexibly set various command line .. toctree:: :maxdepth: 1 - cmd_parameter/index_cn.rst + cmd_parameter/index_en.rst PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to: .. toctree:: :maxdepth: 1 - cluster/index_cn.rst + cluster/index_en.rst PaddlePaddle provides a C-API for inference. We provide the following guidelines for using the C-API: .. toctree:: :maxdepth: 1 - capi/index_cn.rst + capi/index_en.rst PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to: .. toctree:: :maxdepth: 1 - rnn/index_cn.rst + rnn/index_en.rst How to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to: .. 
toctree::
   :maxdepth: 1

-  optimization/gpu_profiling_cn.rst
+  optimization/gpu_profiling_en.rst

From 8090eb627273d88aad55966755c138dcde2feb93 Mon Sep 17 00:00:00 2001
From: Darcy 
Date: Sat, 24 Mar 2018 02:51:45 -0700
Subject: [PATCH 37/58] added proto_desc to device_tracer's dep list (#9342)

---
 paddle/fluid/platform/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 7eec6ab657..686c088914 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)

-cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
+cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)

From cffe1a91124b2b8aa45463ddbe8445c23023ece3 Mon Sep 17 00:00:00 2001
From: gongweibao 
Date: Sat, 24 Mar 2018 22:55:28 +0800
Subject: [PATCH 38/58] Profiler can get elapsed time of `sendop` (#9345)

---
 paddle/fluid/operators/send_op.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index a77c38f633..fdf3c06ef0 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include 

 #include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
@@ -59,6 +60,9 @@ class SendOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);

+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
+
     auto client_var_name = Output("RPCClient");
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
                             "Can not find variable '%s' in the scope.",

From 081b7824349f5a38e0437aae218392014f9f20c0 Mon Sep 17 00:00:00 2001
From: Yancey1989 
Date: Sun, 25 Mar 2018 11:18:49 +0800
Subject: [PATCH 39/58] update by comment

---
 paddle/fluid/operators/send_vars_op.cc | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc
index af791bc8e2..523e9e2780 100644
--- a/paddle/fluid/operators/send_vars_op.cc
+++ b/paddle/fluid/operators/send_vars_op.cc
@@ -53,7 +53,7 @@ class SendVarsOp : public framework::OperatorBase {
     auto ins = Inputs("X");

     std::vector epmap = Attr>("epmap");
-    int flag_wait = Attr("wait");
+    int sync_send = Attr("sync_send");

     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
@@ -68,12 +68,14 @@ class SendVarsOp : public framework::OperatorBase {
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
         VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
+        // TODO(Yancey1989): we need to use an IO threadpool which has
+        // a larger number of threads than the computing threadpool.
        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
       } else {
         VLOG(3) << "don't send non-initialized variable: " << ins[i];
       }
     }
-    if (flag_wait) {
+    if (sync_send) {
       rpc_client->Wait();
     }
   }
@@ -86,16 +88,16 @@ class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
         .AsDuplicable();
     AddOutput("RPCClient",
-              "(RPCClient) The RPC client object which is"
+              "(RPCClient) The RPC client object which will be "
               "initialized at most once.");
     AddComment(R"DOC(
Send operator

This operator will send variables to listen_and_serve op at the parameter server.
)DOC");
-    AddAttr("wait",
+    AddAttr("sync_send",
             "(int, default 0)"
-            "whether watting for all send request have been sent.")
+            "sync send or async send.")
         .SetDefault(0);
     AddAttr>("epmap",
                                       "(string vector, default 127.0.0.1:6164)"

From 904fa05f4692eebdcebd8b3966a09c162ccd1da4 Mon Sep 17 00:00:00 2001
From: Xin Pan 
Date: Sun, 25 Mar 2018 02:29:02 -0700
Subject: [PATCH 40/58] Improve layer_norm speed

For transformer on a single device, step time reduces from 0.157 to 0.125.

---
 paddle/fluid/operators/layer_norm_op.h | 137 +++++++++++++++++++++----
 1 file changed, 116 insertions(+), 21 deletions(-)

diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h
index 605b5c258c..63561aaa31 100644
--- a/paddle/fluid/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.h
@@ -22,6 +22,99 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+// Wrap RowwiseMean and ColwiseSum.
+// Reuse the CPU code and replace the GPU code with cublas_gemv, which is
+// significantly faster. Unlike RowwiseMean and ColwiseSum, the
+// implementation only considers 2D.
+template 
+struct RowwiseMean2D {
+  RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx);
+
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* vec);
+};
+
+template 
+class RowwiseMean2D {
+ public:
+  RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx)
+      : left_(left), right_(right) {
+    framework::DDim ones_dim({right_});
+    divisor_.mutable_data(ones_dim, dev_ctx.GetPlace());
+    math::set_constant(dev_ctx, &divisor_, 1.0 / right);
+  }
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    math::gemv(
+        context, false, left_, right_, 1., input.data(), divisor_.data(),
+        0., out->data());
+  }
+
+ private:
+  int left_;
+  int right_;
+  framework::Tensor divisor_;
+};
+
+template 
+class RowwiseMean2D {
+ public:
+  RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
+
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    row_mean_(context, input, out);
+  }
+
+ private:
+  math::RowwiseMean row_mean_;
+};
+
+template 
+struct ColwiseSum2D {
+  ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx);
+
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* vec);
+};
+
+template 
+class ColwiseSum2D {
+ public:
+  ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx)
+      : left_(left), right_(right) {
+    framework::DDim ones_dim({left_});
+    divisor_.mutable_data(ones_dim, dev_ctx.GetPlace());
+    math::set_constant(dev_ctx, &divisor_, 1.0);
+  }
+
+  void operator()(const platform::CUDADeviceContext& context,
+                  const
framework::Tensor& input, framework::Tensor* out) { + math::gemv( + context, true, left_, right_, 1., input.data(), divisor_.data(), + 0., out->data()); + } + + private: + int left_; + int right_; + framework::Tensor divisor_; +}; + +template +class ColwiseSum2D { + public: + ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {} + + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + col_wise_(context, input, out); + } + + private: + math::ColwiseSum col_wise_; +}; + template struct SubAndSquareFunctor { inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } @@ -67,15 +160,15 @@ using DataLayout = framework::DataLayout; template class LayerNormKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext &ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { const float epsilon = ctx.Attr("epsilon"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); auto x = *ctx.Input("X"); - auto *y = ctx.Output("Y"); - auto *mean = ctx.Output("Mean"); - auto *var = ctx.Output("Variance"); + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* var = ctx.Output("Variance"); const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); const auto x_dims = x.dims(); @@ -94,8 +187,8 @@ class LayerNormKernel : public framework::OpKernel { out.ShareDataWith(*y); out.Resize(matrix_shape); - auto &dev_ctx = ctx.template device_context(); - math::RowwiseMean row_mean; + auto& dev_ctx = ctx.template device_context(); + RowwiseMean2D row_mean(left, right, ctx.device_context()); // get mean row_mean(dev_ctx, x, mean); @@ -126,31 +219,32 @@ class LayerNormKernel : public framework::OpKernel { template class LayerNormGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext &ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { const float epsilon = ctx.Attr("epsilon"); auto x = *ctx.Input("X"); - auto *y = ctx.Input("Y"); - auto *mean = ctx.Input("Mean"); - auto *var = ctx.Input("Variance"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); + auto* y = ctx.Input("Y"); + auto* mean = ctx.Input("Mean"); + auto* var = ctx.Input("Variance"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); auto d_y = *ctx.Input(framework::GradVarName("Y")); const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_scale = ctx.Output(framework::GradVarName("Scale")); + auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - const auto &x_dims = x.dims(); + const auto& x_dims = x.dims(); auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); int left = static_cast(matrix_dim[0]); int right = static_cast(matrix_dim[1]); framework::DDim matrix_shape({left, right}); d_y.Resize(matrix_shape); - auto &dev_ctx = ctx.template device_context(); - math::ColwiseSum colwise_sum; + auto& dev_ctx = ctx.template device_context(); + ColwiseSum2D colwise_sum(left, right, + ctx.device_context()); Tensor temp; Tensor temp_norm; @@ -190,7 +284,8 @@ class LayerNormGradKernel : 
public framework::OpKernel { Tensor temp_vec; temp_vec.mutable_data(vec_shape, ctx.GetPlace()); - math::RowwiseMean row_mean; + RowwiseMean2D row_mean(left, right, + ctx.device_context()); if (d_scale) { // dy_dx From 1a4be55a476e2d02dc35fc945220f9aa9c205808 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 25 Mar 2018 02:46:59 -0700 Subject: [PATCH 41/58] Pass cpu build --- paddle/fluid/operators/layer_norm_op.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index 63561aaa31..7b84ba0a7d 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -34,6 +34,7 @@ struct RowwiseMean2D { const framework::Tensor& input, framework::Tensor* vec); }; +#ifdef PADDLE_WITH_CUDA template class RowwiseMean2D { public: @@ -55,6 +56,7 @@ class RowwiseMean2D { int right_; framework::Tensor divisor_; }; +#endif template class RowwiseMean2D { @@ -78,6 +80,7 @@ struct ColwiseSum2D { const framework::Tensor& input, framework::Tensor* vec); }; +#ifdef PADDLE_WITH_CUDA template class ColwiseSum2D { public: @@ -100,6 +103,7 @@ class ColwiseSum2D { int right_; framework::Tensor divisor_; }; +#endif template class ColwiseSum2D { From f96f2860f9ca88a9967c73179c7d3f198ea778a7 Mon Sep 17 00:00:00 2001 From: wanglun Date: Mon, 26 Mar 2018 09:42:07 +0800 Subject: [PATCH 42/58] Fix typo of Softmax document --- python/paddle/trainer_config_helpers/activations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py index 00efc01c05..3683968262 100644 --- a/python/paddle/trainer_config_helpers/activations.py +++ b/python/paddle/trainer_config_helpers/activations.py @@ -77,7 +77,7 @@ class SoftmaxActivation(BaseActivation): .. math:: - P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_j} } + P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_k} } """ def __init__(self): From 30f1bd6a6497f05e6e966bdca9af3569e08c0f68 Mon Sep 17 00:00:00 2001 From: Burness Duan Date: Mon, 26 Mar 2018 10:05:15 +0800 Subject: [PATCH 43/58] add the recordio in creator.py and change the " to \' (#9358) --- python/paddle/v2/reader/creator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py index 421f6c933d..fda5246d74 100644 --- a/python/paddle/v2/reader/creator.py +++ b/python/paddle/v2/reader/creator.py @@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could be used in user program. 
""" -__all__ = ['np_array', 'text_file', "cloud_reader"] +__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader'] def np_array(x): From 8ccc61f33490ae2136d234b16c8e64578f9efeee Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 26 Mar 2018 10:05:38 +0800 Subject: [PATCH 44/58] support empty tensor (#9338) * support empty tensor --- paddle/fluid/framework/tensor_impl.h | 8 ++++---- paddle/fluid/memory/memory_test.cc | 4 ++-- .../fluid/tests/unittests/test_tensor.py | 20 ++++++++++++++++++- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 638bd0db9d..7a48390440 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -117,10 +117,10 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { if (holder_ != nullptr) { holder_->set_type(type); } - PADDLE_ENFORCE_GT( - numel(), 0, - "When calling this method, the Tensor's numel must be larger than zero. " - "Please check Tensor::Resize has been called first."); + PADDLE_ENFORCE_GE(numel(), 0, + "When calling this method, the Tensor's numel must be " + "equal or larger than zero. " + "Please check Tensor::Resize has been called first."); int64_t size = numel() * SizeOfType(type); /* some versions of boost::variant don't have operator!= */ if (holder_ == nullptr || !(holder_->place() == place) || diff --git a/paddle/fluid/memory/memory_test.cc b/paddle/fluid/memory/memory_test.cc index ae98d0d525..eb27a52b25 100644 --- a/paddle/fluid/memory/memory_test.cc +++ b/paddle/fluid/memory/memory_test.cc @@ -59,7 +59,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { EXPECT_EQ(total_size, 0UL); for (auto size : - {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { ps[paddle::memory::Alloc(cpu, size)] = size; // Buddy Allocator doesn't manage too large memory chunk @@ -117,7 +117,7 @@ TEST(BuddyAllocator, GPUMultAlloc) { EXPECT_EQ(total_size, 0UL); for (auto size : - {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { ps[paddle::memory::Alloc(gpu, size)] = size; // Buddy Allocator doesn't manage too large memory chunk diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py index a369783245..379081c328 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor.py @@ -126,7 +126,6 @@ class TestTensor(unittest.TestCase): def test_lod_tensor_gpu_init(self): if not core.is_compiled_with_cuda(): return - scope = core.Scope() place = core.CUDAPlace(0) lod_py = [[0, 2, 5], [0, 2, 4, 5]] lod_tensor = core.LoDTensor() @@ -144,6 +143,25 @@ class TestTensor(unittest.TestCase): self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) self.assertListEqual(lod_py, lod_tensor.lod()) + def test_empty_tensor(self): + place = core.CPUPlace() + scope = core.Scope() + var = scope.var("test_tensor") + + tensor = var.get_tensor() + + tensor.set_dims([0, 1]) + tensor.alloc_float(place) + + tensor_array = numpy.array(tensor) + self.assertEqual((0, 1), tensor_array.shape) + + if core.is_compiled_with_cuda(): + gpu_place = core.CUDAPlace(0) + tensor.alloc_float(gpu_place) + tensor_array = numpy.array(tensor) + self.assertEqual((0, 1), tensor_array.shape) + if __name__ == '__main__': unittest.main() From ebbb428db99ab68dca496ec908442d26a47d2dfd Mon 
From ebbb428db99ab68dca496ec908442d26a47d2dfd Mon Sep 17 00:00:00 2001
From: Yancey1989 
Date: Mon, 26 Mar 2018 10:46:01 +0800
Subject: [PATCH 45/58] fix ci

---
 paddle/fluid/operators/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 9a8f52b232..035ecd0948 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -188,7 +188,7 @@ if(WITH_DISTRIBUTE)
     set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
 else()
-    set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
+    set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
 endif()

 op_library(cond_op DEPS framework_proto tensor net_op)

From 4f522fa8d543715d9fcc633e79714302f496439c Mon Sep 17 00:00:00 2001
From: Qiao Longfei 
Date: Mon, 26 Mar 2018 11:38:06 +0800
Subject: [PATCH 46/58] fix compile send_op on mac (#9360)

---
 paddle/fluid/operators/detail/grpc_client.cc | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index eb19685aa6..e73bbe7537 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -49,9 +49,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
     s->Prepare(var_h, time_out);
     s->response_call_back_ = NULL;

-    auto call = std::move(s->stub_g_.PrepareUnaryCall(
-        s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req,
-        &cq_));
+    auto call = s->stub_g_.PrepareUnaryCall(
+        s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, (void*)s);
   });
@@ -107,8 +106,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
     ::grpc::ByteBuffer buf;
     RequestToByteBuffer(req, &buf);

-    auto call = std::move(s->stub_g_.PrepareUnaryCall(
-        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_));
+    auto call = s->stub_g_.PrepareUnaryCall(
+        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, (void*)s);
   });

From d573195dde9dfe64724b536654760e2f954f42b3 Mon Sep 17 00:00:00 2001
From: Luo Tao 
Date: Mon, 26 Mar 2018 12:46:50 +0800
Subject: [PATCH 47/58] rm libmklml_gnu.so

---
 cmake/inference_lib.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index fb81498fd6..0323cd9698 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -69,11 +69,11 @@ if(NOT CBLAS_FOUND)
             SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
             DSTS ${dst_dir} ${dst_dir}
     )
-else()
+elseif (WITH_MKLML)
     set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
     copy(mklml_lib
-            SRCS ${MKLML_LIB_DIR} ${MKLML_INC_DIR}
-            DSTS ${dst_dir} ${dst_dir}
+            SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
+            DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
     )
 endif()

From 54a85b7bfd1836585ed6f257ed67651e0d516557 Mon Sep 17 00:00:00 2001
From: dragonwarrior 
Date: Mon, 26 Mar 2018 13:24:10 +0800
Subject: [PATCH 48/58] Add lrn layer (#9157)

* add LRN layer for fluid

* add LRN layer for fluid

* add documentation for LRN layer

* add paper reference for LRN layer

* add separate documentation for LRN layer

* rm lrn.py in doc/fluid/dev/src

* change code style in lrn

* fix style of comments in lrn
---
 python/paddle/fluid/layers/nn.py | 71 +++++++++++++++++++
 .../fluid/tests/unittests/test_layers.py | 7 ++
 2 files changed, 78 insertions(+)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 679de6ce2a..2db4e5d27d 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -74,6 +74,7 @@ __all__ = [
     'one_hot',
     'autoincreased_step_counter',
     'lod_reset',
+    'lrn',
 ]

@@ -3410,3 +3411,73 @@ def lod_reset(x, y=None, target_lod=None):
         raise ValueError("y and target_lod should not be both None.")

     return out
+
+
+def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
+    """
+    Local Response Normalization Layer. This layer performs a type of
+    "lateral inhibition" by normalizing over local input regions.
+
+    The formula is as follows:
+
+    .. math::
+
+        Output(i, x, y) = Input(i, x, y) / \left(
+        k + \alpha \sum\limits^{\min(C, i + n/2)}_{j = \max(0, i - n/2)}
+        (Input(j, x, y))^2 \right)^{\beta}
+
+    In the above equation:
+
+    * :math:`n`: The number of channels to sum over.
+    * :math:`k`: The offset (avoid being divided by 0).
+    * :math:`alpha`: The scaling parameter.
+    * :math:`beta`: The exponent parameter.
+
+    Refer to `ImageNet Classification with Deep Convolutional Neural Networks
+    `_
+
+    Args:
+        input (Variable): The input tensor of this layer, and the dimension of input tensor must be 4.
+        n (int, default 5): The number of channels to sum over.
+        k (float, default 1.0): An offset (usually positive to avoid dividing by 0).
+        alpha (float, default 1e-4): The scaling parameter.
+        beta (float, default 0.75): The exponent.
+        name (str, default None): A name for this operation.
+
+    Raises:
+        ValueError: If rank of the input tensor is not 4.
+
+    Returns:
+        A tensor variable storing the transformation result.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name="data", shape=[3, 112, 112], dtype="float32")
+          lrn = fluid.layers.lrn(input=data)
+    """
+    helper = LayerHelper('lrn', **locals())
+    dtype = helper.input_dtype()
+    input_shape = input.shape
+    dims = len(input_shape)
+
+    if dims != 4:
+        raise ValueError(
+            "dims of input must be 4 (not %d), and its order must be NCHW" %
+            (dims))
+
+    mid_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    lrn_out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="lrn",
+        inputs={"X": input},
+        outputs={
+            "Out": lrn_out,
+            "MidOut": mid_out,
+        },
+        attrs={"n": n,
+               "k": k,
+               "alpha": alpha,
+               "beta": beta})
+
+    return lrn_out
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index b5fd59cf3a..2179826d81 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -231,6 +231,13 @@ class TestBook(unittest.TestCase):
         self.assertIsNotNone(layers.softmax(hid))
         print(str(program))

+    def test_lrn(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name='data', shape=[6, 2, 2], dtype='float32')
+            self.assertIsNotNone(layers.lrn(data))
+        print(str(program))
+
     def test_get_places(self):
         program = Program()
         with program_guard(program):
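For reference, a minimal sketch of calling the new layer with its documented defaults; it mirrors the docstring example above and only spells out the hyperparameters, so nothing here goes beyond what the patch itself documents:

```python
import paddle.fluid as fluid

# NCHW input: 3 channels, 112x112 spatial size (the batch dimension is
# added implicitly by layers.data, giving the required rank of 4).
data = fluid.layers.data(name="data", shape=[3, 112, 112], dtype="float32")

# Sum over n=5 neighboring channels; k, alpha, and beta are the defaults.
norm = fluid.layers.lrn(input=data, n=5, k=1.0, alpha=1e-4, beta=0.75)
```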
From 39004080f4f5358890dc7dcf1be1339ba0efd7b4 Mon Sep 17 00:00:00 2001
From: chengduoZH 
Date: Mon, 26 Mar 2018 16:52:30 +0800
Subject: [PATCH 49/58] replace use_pinned with is_pinned

---
 paddle/fluid/framework/tensor.h | 24 +++++++++----------
 paddle/fluid/framework/tensor_impl.h | 22 ++++++++---------
 .../fluid/memory/detail/system_allocator.cc | 7 +++---
 paddle/fluid/memory/memory.cc | 12 +++++-----
 paddle/fluid/memory/memory.h | 14 +++++------
 5 files changed, 39 insertions(+), 40 deletions(-)

diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index aa8f44ea30..f7a6b5ba84 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -45,11 +45,11 @@ class Tensor {
   friend struct EigenVector;

  public:
-  Tensor() : offset_(0), use_pinned_(false) {}
+  Tensor() : offset_(0), is_pinned_(false) {}

   /*! Constructor with place should only be used in pybind. */
   explicit Tensor(const platform::Place& place)
-      : offset_(0), use_pinned_(false) {
+      : offset_(0), is_pinned_(false) {
     holder_->set_place(place);
   }

@@ -70,12 +70,12 @@ class Tensor {
    * @note If not exist, then allocation.
    */
   template 
-  inline T* mutable_data(platform::Place place, bool use_pinned = false);
+  inline T* mutable_data(platform::Place place, bool is_pinned = false);

   inline void* mutable_data(platform::Place place, std::type_index type,
-                            bool use_pinned = false);
+                            bool is_pinned = false);

-  inline void* mutable_data(platform::Place place, bool use_pinned = false);
+  inline void* mutable_data(platform::Place place, bool is_pinned = false);

   /**
    * @brief   Return a pointer to mutable memory block.
@@ -87,7 +87,7 @@ class Tensor {
    */
   template 
   inline T* mutable_data(DDim dims, platform::Place place,
-                         bool use_pinned = false);
+                         bool is_pinned = false);

   /*! Return the dimensions of the memory block.
*/ inline const DDim& dims() const; @@ -153,13 +153,13 @@ class Tensor { template struct PlaceholderImpl : public Placeholder { PlaceholderImpl(Place place, size_t size, std::type_index type, - bool use_pinned = false) - : ptr_(static_cast(memory::Alloc(place, size, use_pinned)), - memory::PODDeleter(place, use_pinned)), + bool is_pinned = false) + : ptr_(static_cast(memory::Alloc(place, size, is_pinned)), + memory::PODDeleter(place, is_pinned)), place_(place), size_(size), type_(type), - use_pinned_(use_pinned) { + is_pinned_(is_pinned) { PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", (is_cpu_place(place_) ? "CPU" : "GPU")); } @@ -184,7 +184,7 @@ class Tensor { std::type_index type_; /*! use pinned memory or not. */ - bool use_pinned_; + bool is_pinned_; }; /*! holds the memory block if allocated. */ @@ -219,7 +219,7 @@ class Tensor { * PlaceHolder::ptr_ and where the tensor data really begins. */ size_t offset_; - bool use_pinned_; + bool is_pinned_; }; inline void Tensor::switch_place(platform::Place new_place) { diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index e882cce69e..08e2f1a95b 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -102,20 +102,20 @@ inline T* Tensor::data() { template inline T* Tensor::mutable_data(DDim dims, platform::Place place, - bool use_pinned) { + bool is_pinned) { static_assert(std::is_pod::value, "T must be POD"); Resize(dims); - return mutable_data(place, use_pinned); + return mutable_data(place, is_pinned); } template -inline T* Tensor::mutable_data(platform::Place place, bool use_pinned) { +inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) { static_assert(std::is_pod::value, "T must be POD"); - return reinterpret_cast(mutable_data(place, typeid(T), use_pinned)); + return reinterpret_cast(mutable_data(place, typeid(T), is_pinned)); } inline void* Tensor::mutable_data(platform::Place place, std::type_index type, - bool use_pinned) { + bool is_pinned) { if (holder_ != nullptr) { holder_->set_type(type); } @@ -129,27 +129,27 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type, holder_->size() < size + offset_) { if (platform::is_cpu_place(place)) { holder_.reset(new PlaceholderImpl( - boost::get(place), size, type, use_pinned)); + boost::get(place), size, type, is_pinned)); } else if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); } #else holder_.reset(new PlaceholderImpl( - boost::get(place), size, type, use_pinned)); + boost::get(place), size, type, is_pinned)); } #endif offset_ = 0; - use_pinned_ = use_pinned; + is_pinned_ = is_pinned; } return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } -inline void* Tensor::mutable_data(platform::Place place, bool use_pinned) { +inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) { PADDLE_ENFORCE(this->holder_ != nullptr, "Cannot invoke mutable data if current hold nothing"); - return mutable_data(place, holder_->type(), use_pinned); + return mutable_data(place, holder_->type(), is_pinned); } inline Tensor& Tensor::ShareDataWith(const Tensor& src) { @@ -191,7 +191,7 @@ inline const DDim& Tensor::dims() const { return dims_; } inline int64_t Tensor::numel() const { return product(dims_); } -inline bool Tensor::isPinned() const { return use_pinned_; } +inline bool Tensor::isPinned() const { return is_pinned_; } inline Tensor ReshapeToMatrix(const 
Tensor& src, int num_col_dims) { Tensor res; diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index df9d28ede8..62a75c8196 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -123,8 +123,9 @@ void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { if (size <= 0) return nullptr; void* p; // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size - // of host fallback allocation. Allocates too much would reduce + // of host pinned allocation. Allocates too much would reduce // the amount of memory available to the underlying system for paging. + // Because the memory is in CPU side, other device can access it too. size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_; @@ -149,10 +150,10 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { err = cudaFreeHost(p); // Purposefully allow cudaErrorCudartUnloading, because - // that is returned if you ever call cudaFree after the + // that is returned if you ever call cudaFreeHost after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if - // cudaFree succeeds. + // cudaFreeHost succeeds. if (err != cudaErrorCudartUnloading) { PADDLE_ENFORCE(err, "cudaFreeHost failed in GPUPinnedAllocator::Free."); } diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc index c5577587aa..f2d5f250bf 100644 --- a/paddle/fluid/memory/memory.cc +++ b/paddle/fluid/memory/memory.cc @@ -39,7 +39,7 @@ BuddyAllocator* GetCPUBuddyAllocator() { template <> void* Alloc(platform::CPUPlace place, size_t size, - bool use_pinned) { + bool is_pinned) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void* p = GetCPUBuddyAllocator()->Alloc(size); VLOG(10) << " pointer=" << p; @@ -48,7 +48,7 @@ void* Alloc(platform::CPUPlace place, size_t size, template <> void Free(platform::CPUPlace place, void* p, - bool use_pinned) { + bool is_pinned) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } @@ -115,9 +115,9 @@ size_t Used(platform::CUDAPlace place) { template <> void* Alloc(platform::CUDAPlace place, size_t size, - bool use_pinned) { + bool is_pinned) { void* ptr; - if (use_pinned) { + if (is_pinned) { auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device); ptr = buddy_allocator->Alloc(size); } else { @@ -143,8 +143,8 @@ void* Alloc(platform::CUDAPlace place, size_t size, template <> void Free(platform::CUDAPlace place, void* p, - bool use_pinned) { - if (use_pinned) { + bool is_pinned) { + if (is_pinned) { GetCUDAPinnedBuddyAllocator(place.device)->Free(p); } else { GetGPUBuddyAllocator(place.device)->Free(p); diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h index 9bc48ac68f..062bfc880e 100644 --- a/paddle/fluid/memory/memory.h +++ b/paddle/fluid/memory/memory.h @@ -33,7 +33,7 @@ namespace memory { * address is valid or not. */ template -void* Alloc(Place place, size_t size, bool use_pinned = false); +void* Alloc(Place place, size_t size, bool is_pinned = false); /** * \brief Free memory block in one place. @@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size, bool use_pinned = false); * */ template -void Free(Place place, void* ptr, bool use_pinned = false); +void Free(Place place, void* ptr, bool is_pinned = false); /** * \brief Total size of used memory in one place. 
@@ -74,15 +74,13 @@ class PODDeleter { static_assert(std::is_pod::value, "T must be POD"); public: - explicit PODDeleter(Place place, bool use_pinned = false) - : place_(place), use_pinned_(use_pinned) {} - void operator()(T* ptr) { - Free(place_, static_cast(ptr), use_pinned_); - } + explicit PODDeleter(Place place, bool is_pinned = false) + : place_(place), is_pinned_(is_pinned) {} + void operator()(T* ptr) { Free(place_, static_cast(ptr), is_pinned_); } private: Place place_; - bool use_pinned_; + bool is_pinned_; }; /** From 9e99446e250e071c3d086e0c945374c4498e5aeb Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 26 Mar 2018 18:19:24 +0800 Subject: [PATCH 50/58] Add note for cudaMallocHost --- paddle/fluid/memory/detail/system_allocator.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 62a75c8196..71d28dcbad 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -119,18 +119,20 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { bool GPUAllocator::UseGpu() const { return true; } +// PINNED memory allows direct DMA transfers by the GPU to and from system +// memory. It’s locked to a physical address. void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { if (size <= 0) return nullptr; void* p; // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size // of host pinned allocation. Allocates too much would reduce // the amount of memory available to the underlying system for paging. - // Because the memory is in CPU side, other device can access it too. size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_; if (size > usable) return nullptr; + // PINNED memory is visible to all CUDA contexts. cudaError_t result = cudaMallocHost(&p, size); if (result == cudaSuccess) { index = 1; From f3dc3112cce45bbe30d292ffcc9103105222f05c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 26 Mar 2018 20:17:16 +0800 Subject: [PATCH 51/58] add split ids op (#9370) * add split_ids_op * add TestSplitIdsOp * fix comment * add test for empty tensor * clean code * rm unused code --- paddle/fluid/operators/split_ids_op.cc | 76 +++++++++++++++++++ paddle/fluid/operators/split_ids_op.h | 65 ++++++++++++++++ .../tests/unittests/test_split_ids_op.py | 35 +++++++++ 3 files changed, 176 insertions(+) create mode 100644 paddle/fluid/operators/split_ids_op.cc create mode 100644 paddle/fluid/operators/split_ids_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_split_ids_op.py diff --git a/paddle/fluid/operators/split_ids_op.cc b/paddle/fluid/operators/split_ids_op.cc new file mode 100644 index 0000000000..a54f8a2878 --- /dev/null +++ b/paddle/fluid/operators/split_ids_op.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/operators/split_ids_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SplitIdsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitIdsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
+    AddOutput("Out", "(LoDTensor) The outputs of the input Ids.")
+        .AsDuplicable();
+
+    AddComment(R"DOC(
+Split a LoDTensor of Ids into multiple LoDTensors; the number of outputs is the number of pservers.
+Example:
+  Input:
+    X = [1,2,3,4,5,6]
+
+  Out (3 outputs):
+    out0 = [3, 6]
+    out1 = [1, 4]
+    out2 = [2, 5]
+)DOC");
+  }
+};
+
+class SplitIdsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Ids"), "SplitIdsOp must have input Ids.");
+    PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must have output Out.");
+
+    auto ids_var_type = ctx->GetInputsVarType("Ids").front();
+    PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR);
+
+    auto ids_dims = ctx->GetInputDim("Ids");
+    PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+    PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+  }
+};
+
+class SplitIdsOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    for (auto &out_var : op_desc.Output("Out")) {
+      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
+                  ops::SplitIdsOpInferVarType);
+REGISTER_OP_CPU_KERNEL(
+    split_ids, ops::SplitIdsOpKernel);
diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
new file mode 100644
index 0000000000..3e750ed2d1
--- /dev/null
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include 
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+template 
+class SplitIdsOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto place = ctx.GetPlace();
+    if (!platform::is_cpu_place(place)) {
+      PADDLE_THROW("SplitIds does not support GPU kernel");
+    }
+
+    const auto* ids_t = ctx.Input("Ids");
+    auto& ids_dims = ids_t->dims();
+    auto outs = ctx.MultiOutput("Out");
+
+    const T* ids = ids_t->data();
+
+    const size_t shard_num = outs.size();
+
+    std::vector> out_ids;
+    out_ids.resize(outs.size());
+
+    // Split the ids by their shard_num.
+    for (size_t i = 0; i < ids_dims[0]; ++i) {
+      T id = ids[i];
+      size_t shard_id = static_cast(id) % shard_num;
+      out_ids[shard_id].push_back(id);
+    }
+
+    // Create a tensor for each shard; each shard is later sent to a
+    // parameter server.
+    for (size_t i = 0; i < out_ids.size(); ++i) {
+      auto* shard_t = outs[i];
+      std::vector ids = out_ids[i];
+      auto* shard_data = shard_t->mutable_data(
+          framework::make_ddim({static_cast(ids.size()), 1}), place);
+      for (size_t i = 0; i < ids.size(); ++i) {
+        shard_data[i] = ids[i];
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
new file mode 100644
index 0000000000..e9f0a06a56
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestSplitIdsOp(OpTest):
+    def setUp(self):
+        self.op_type = "split_ids"
+        ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
+        out0 = np.array([[0], [3], [6]]).astype('int64')
+        out1 = np.array([[]]).astype('int64')
+        out2 = np.array([[2], [2], [5], [5]]).astype('int64')
+        self.inputs = {'Ids': ids}
+        self.outputs = {'Out': [('out0', out0), ('out1', out1), ('out2', out2)]}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
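The routing rule the kernel implements (shard_id = id % shard_num) can be sketched in plain NumPy. `split_ids` below is a hypothetical helper written for illustration, not part of the patch:

```python
import numpy as np


def split_ids(ids, shard_num):
    """Route each id in a [batch, 1] array to shard id % shard_num."""
    shards = [[] for _ in range(shard_num)]
    for (id_val, ) in ids:
        shards[int(id_val) % shard_num].append(id_val)
    return [np.array(s, dtype=ids.dtype).reshape(-1, 1) for s in shards]


ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
out0, out1, out2 = split_ids(ids, shard_num=3)
# out0 -> [[0], [3], [6]]; out1 -> empty; out2 -> [[2], [2], [5], [5]],
# matching the expected outputs in TestSplitIdsOp above.
```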
From 6a97c02e56120893ed0c4ca0dfbd45c1a358935e Mon Sep 17 00:00:00 2001
From: legend06hvl 
Date: Tue, 27 Mar 2018 02:41:41 +0800
Subject: [PATCH 52/58] Update index_en.rst (#9321)

* Update index_en.rst

New file

* Update index_en.rst

Fix refer to suggestions

---
 doc/v2/dev/index_en.rst | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/doc/v2/dev/index_en.rst b/doc/v2/dev/index_en.rst
index 549f5fa9aa..36516b7953 100644
--- a/doc/v2/dev/index_en.rst
+++ b/doc/v2/dev/index_en.rst
@@ -1,9 +1,27 @@
 Development
 ------------
+
+PaddlePaddle adheres to the following three sets of code and documentation specifications.
+
+
+PaddlePaddle uses Git for version control, and Docker is used for the build and test environment. The code includes CUDA, C++, Python, Shell, and other programming languages, which comply with the Google C++ Style Guide and PEP 8; the code base includes style checking by an automatic inspection tool. Code comments need to follow the Doxygen specification. Code that does not meet the style requirements will fail to compile. We provide the following guidelines for the use of Git, building and testing, and code development.

 .. toctree::
   :maxdepth: 1

   contribute_to_paddle_en.md
+
+
+PaddlePaddle is well documented in English and Chinese. We recommend using the English version of the documents and problem descriptions. The design documents focus on problem descriptions and backgrounds, and are followed by solutions.
From 6a97c02e56120893ed0c4ca0dfbd45c1a358935e Mon Sep 17 00:00:00 2001
From: legend06hvl
Date: Tue, 27 Mar 2018 02:41:41 +0800
Subject: [PATCH 52/58] Update index_en.rst (#9321)

* Update index_en.rst

New file

* Update index_en.rst

Fix according to review suggestions
---
 doc/v2/dev/index_en.rst | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/doc/v2/dev/index_en.rst b/doc/v2/dev/index_en.rst
index 549f5fa9aa..36516b7953 100644
--- a/doc/v2/dev/index_en.rst
+++ b/doc/v2/dev/index_en.rst
@@ -1,9 +1,27 @@
 Development
 ------------
+
+PaddlePaddle adheres to the code and documentation specifications described in the following three sections.
+
+
+PaddlePaddle uses Git for version control and Docker for its build and test environment. The code base includes CUDA, C++, Python, Shell and other programming languages, which must comply with the Google C++ Style Guide and PEP 8; style conformance is checked by an automatic inspection tool, and code that does not meet the style requirements will fail to compile. Code comments need to follow the Doxygen specification. We provide the following guidelines on using Git, building and testing, and code development.

 .. toctree::
   :maxdepth: 1

   contribute_to_paddle_en.md
+
+
+PaddlePaddle documentation is available in English and Chinese. We recommend using the English version for documents and problem descriptions. Each design document starts with the problem description and background and is followed by the solution. Because the documentation is generated by Sphinx, code comments should comply with the Sphinx documentation standard. We recommend using the paddlepaddle.org tool to build and preview the documentation locally. Please refer to:
+
+.. toctree::
+  :maxdepth: 1
+
+  write_docs_en.rst
+
+PaddlePaddle V2 defines new operations by adding new layers. You can implement various complex layers by combining the basic APIs, which satisfies most applications. If you want to write a custom layer, please refer to the following guide; patches are welcome.
+
+.. toctree::
+  :maxdepth: 1
+
+  new_layer_en.rst
From ab5a3560dcda21c3886a1aebc83e3967de35ab4e Mon Sep 17 00:00:00 2001
From: Thuan Nguyen
Date: Mon, 26 Mar 2018 17:17:40 -0700
Subject: [PATCH 53/58] Create go_op design doc (#9389)

* Create go_op design doc
---
 doc/fluid/design/concurrent/go_op.md | 231 +++++++++++++++++++++
 1 file changed, 231 insertions(+)
 create mode 100644 doc/fluid/design/concurrent/go_op.md

diff --git a/doc/fluid/design/concurrent/go_op.md b/doc/fluid/design/concurrent/go_op.md
new file mode 100644
index 0000000000..c18b788e80
--- /dev/null
+++ b/doc/fluid/design/concurrent/go_op.md
@@ -0,0 +1,231 @@
+# go_op Design
+
+## Introduction
+
+The **go_op** allows users of PaddlePaddle to run program blocks on a detached
+thread. It works in conjunction with CSP operators (channel_send,
+channel_receive, channel_open, channel_close, and select) to allow users to
+concurrently process data and communicate easily between different threads.
+
+## How to use it
+
+```
+channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+with fluid.Go():
+    # Send a tensor of value 99 to "channel" on a detached thread
+    tensor = fill_constant(shape=[1], dtype='int', value=99)
+    tensor.stop_gradient = True
+    fluid.channel_send(channel, tensor)
+
+# Receive the sent tensor from "channel" on the main thread
+result = fill_constant(shape=[1], dtype='int', value=-1)
+fluid.channel_recv(channel, result)
+```
+
+The go operator can be accessed by using the fluid.Go() control flow. This
+will create a new sub block, where the user can add additional operators
+to be run on the thread.
+
+**Note:** Since back propagation is currently not supported in the go_op, users
+should ensure that operators in the go block do not require gradient
+calculations.
+
+## How it Works
+
+Similar to other control blocks, go_op will create a sub block and add it
+as a child to the current block. Operators and variables defined in this
+block will be added to the go sub_block.
+
+In addition, the go operator will create a new child scope whose parent is
+the global scope. Please refer to [block captures](#block-captures) for more
+information.
+
+When the Paddle executor runs go_op, go_op will take the sub_block and pass it
+to the executor.run method (along with a newly created local scope) on a
+detached thread.
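+
+In Python-flavored pseudo-code (an illustrative sketch only, not the actual
+C++ implementation; the helper names here are made up for exposition), that
+execution step looks like:
+
+```
+import threading
+
+def run_go_op(executor, go_sub_block, global_scope):
+    # New child scope whose parent is the global scope (see above).
+    local_scope = global_scope.new_scope()
+    # Run the captured sub block on a detached thread; go_op returns
+    # immediately without joining the thread.
+    t = threading.Thread(target=executor.run,
+                         args=(go_sub_block, local_scope))
+    t.daemon = True
+    t.start()
+```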
+
+An example of the generated program description is shown below. Take note of
+the **go_op** in particular. It is added as an operator in the current
+block (in this example, block0). The **go_op** contains a `sub_block`
+attribute, which points to the id of the block that will be executed in a
+detached thread.
+
+```
+blocks {
+  idx: 0
+  parent_idx: -1
+  vars {
+    name: "return_value"
+    type {
+      type: LOD_TENSOR
+      lod_tensor {
+        tensor {
+          data_type: INT64
+        }
+      }
+    }
+  }
+  vars {
+    name: "status_recv"
+    type {
+      type: LOD_TENSOR
+      lod_tensor {
+        tensor {
+          data_type: BOOL
+        }
+      }
+    }
+  }
+  ...
+
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "channel"
+    }
+    type: "channel_create"
+    attrs {
+      name: "data_type"
+      type: INT
+      i: 7
+    }
+    attrs {
+      name: "capacity"
+      type: INT
+      i: 0
+    }
+  }
+  ops {
+    inputs {
+      parameter: "X"
+      arguments: "channel"
+    }
+    type: "go"
+    attrs {
+      name: "sub_block"
+      type: BLOCK
+      block_idx: 1
+    }
+  }
+  ops {
+    inputs {
+      parameter: "Channel"
+      arguments: "channel"
+    }
+    outputs {
+      parameter: "Out"
+      arguments: "return_value"
+    }
+    outputs {
+      parameter: "Status"
+      arguments: "status_recv"
+    }
+    type: "channel_recv"
+  }
+  ...
+}
+
+blocks {
+  idx: 1
+  parent_idx: 0
+  vars {
+    name: "status"
+    type {
+      type: LOD_TENSOR
+      lod_tensor {
+        tensor {
+          data_type: BOOL
+        }
+      }
+    }
+  }
+  ...
+
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_1.tmp_0"
+    }
+    type: "fill_constant"
+    attrs {
+      name: "force_cpu"
+      type: BOOLEAN
+      b: false
+    }
+    attrs {
+      name: "value"
+      type: FLOAT
+      f: 99.0
+    }
+    attrs {
+      name: "shape"
+      type: INTS
+      ints: 1
+    }
+    attrs {
+      name: "dtype"
+      type: INT
+      i: 3
+    }
+  }
+  ops {
+    inputs {
+      parameter: "Channel"
+      arguments: "channel"
+    }
+    inputs {
+      parameter: "X"
+      arguments: "fill_constant_1.tmp_0"
+    }
+    outputs {
+      parameter: "Status"
+      arguments: "status"
+    }
+    type: "channel_send"
+    attrs {
+      name: "copy"
+      type: BOOLEAN
+      b: false
+    }
+  }
+```
+
+## Current Limitations
+
+#### Scopes and block captures:
+
+Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a
+block. When a block is executed, a new local scope is created from the parent
+scope (i.e., the scope derived from the parent block) and associated with the
+new child block. After the block finishes executing, the local scope and
+all associated variables in the scope are deleted.
+
+This works well in a single-threaded scenario; however, with the introduction
+of go_op, a child block may continue to execute even after the parent block
+has exited. If the go_op tries to access variables located in the parent
+block's scope, it may receive a segmentation fault because the parent scope
+may have been deleted.
+
+We need to implement block closures in order to prevent access to parent
+scope variables from causing a segmentation fault. As a temporary workaround,
+please ensure that all variables accessed in the go block are not destructed
+before they are accessed. Currently, the go_op will explicitly enforce
+this requirement and raise an exception if a variable cannot be found in
+the scope.
+
+Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502)
+for more details.
+
+#### Green Threads
+
+Golang utilizes `green threads`, which is a mechanism for the runtime library
+to manage multiple threads (instead of natively by the OS). Green threads
+usually allow for faster thread creation and switching, as there is less
+overhead when spawning these threads. For the first version of CSP, we only
+support OS threads.
+
+#### Backward Propagation:
+
+go_op currently does not support backward propagation. Please use go_op with
+non-training operators.
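+
+As a concrete illustration of the block-capture workaround above, the sketch
+below (reusing only the fluid API from the usage section) creates every
+variable touched by the go block in the enclosing block, so the variables
+outlive the detached thread:
+
+```
+channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+# Created in the parent block, so it is not destructed while the
+# detached thread may still be reading it.
+tensor = fill_constant(shape=[1], dtype='int', value=99)
+tensor.stop_gradient = True
+
+with fluid.Go():
+    fluid.channel_send(channel, tensor)
+
+result = fill_constant(shape=[1], dtype='int', value=-1)
+fluid.channel_recv(channel, result)
+```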
From 65534c47625239ce68b5e5c02ae72c3bb1532214 Mon Sep 17 00:00:00 2001
From: Abhinav Arora
Date: Mon, 26 Mar 2018 19:11:54 -0700
Subject: [PATCH 54/58] Fluid channels should match the semantics of Go
 Channels (#9265)

* Fluid Channel should match Go Channel in Semantics

* Fix Python channel_send

* Address code review feedback

* Fix open_files_op.cc

* Add description to Channel Asserts
---
 paddle/fluid/framework/channel.h              | 93 +++++++++++--------
 paddle/fluid/framework/channel_impl.h         | 35 ++++---
 paddle/fluid/framework/channel_test.cc        | 93 +++++++++++++++----
 paddle/fluid/operators/channel_send_op.cc     | 25 +----
 .../operators/concurrency/channel_util.cc     | 14 +--
 .../operators/concurrency/channel_util.h      |  2 +-
 .../reader/create_double_buffer_reader_op.cc  |  4 +-
 .../fluid/operators/reader/open_files_op.cc   |  9 +-
 python/paddle/fluid/concurrency.py            | 15 +--
 9 files changed, 172 insertions(+), 118 deletions(-)

diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index adfaba26ac..019bea600f 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -34,7 +34,7 @@ class Channel {
  public:
   virtual bool CanSend() = 0;
   virtual bool CanReceive() = 0;
-  virtual bool Send(T*) = 0;
+  virtual void Send(T*) = 0;
   virtual bool Receive(T*) = 0;
   virtual size_t Cap() = 0;
   virtual void Lock() = 0;
@@ -84,69 +84,81 @@ class ChannelHolder {
   }
 
   template <typename T>
-  bool Send(T* data) {
-    if (!IsInitialized()) return false;
-    PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T)));
+  void Send(T* data) {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    PADDLE_ENFORCE_EQ(
+        holder_->Type(), std::type_index(typeid(T)),
+        "Channel type is not same as the type of the data being sent");
     // Static cast should be safe because we have ensured that types are same
     Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    return channel != nullptr ? channel->Send(data) : false;
+    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
+    channel->Send(data);
   }
 
   template <typename T>
   bool Receive(T* data) {
-    if (!IsInitialized()) return false;
-    PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T)));
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    PADDLE_ENFORCE_EQ(
+        holder_->Type(), std::type_index(typeid(T)),
+        "Channel type is not same as the type of the data being sent");
     Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    return channel != nullptr ?
channel->Receive(data) : false; + PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null."); + return channel->Receive(data); } bool IsClosed() { - if (IsInitialized()) { - return holder_->IsClosed(); - } - return false; + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + return holder_->IsClosed(); } bool CanSend() { - if (IsInitialized()) { - return holder_->CanSend(); - } - return false; + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + return holder_->CanSend(); } bool CanReceive() { - if (IsInitialized()) { - return holder_->CanReceive(); - } - return false; + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + return holder_->CanReceive(); } void close() { - if (IsInitialized()) holder_->Close(); + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + holder_->Close(); } size_t Cap() { - if (IsInitialized()) return holder_->Cap(); - return -1; + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + return holder_->Cap(); } void Lock() { - if (IsInitialized()) holder_->Lock(); + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + holder_->Lock(); } void Unlock() { - if (IsInitialized()) holder_->Unlock(); + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + holder_->Unlock(); } template void AddToSendQ(const void* referrer, T* data, std::shared_ptr cond, std::function cb) { - if (IsInitialized()) { - Channel* channel = static_cast*>(holder_->Ptr()); - if (channel != nullptr) { - channel->AddToSendQ(referrer, data, cond, cb); - } + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + Channel* channel = static_cast*>(holder_->Ptr()); + if (channel != nullptr) { + channel->AddToSendQ(referrer, data, cond, cb); } } @@ -154,26 +166,31 @@ class ChannelHolder { void AddToReceiveQ(const void* referrer, T* data, std::shared_ptr cond, std::function cb) { - if (IsInitialized()) { - Channel* channel = static_cast*>(holder_->Ptr()); - if (channel != nullptr) { - channel->AddToReceiveQ(referrer, data, cond, cb); - } + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + Channel* channel = static_cast*>(holder_->Ptr()); + if (channel != nullptr) { + channel->AddToReceiveQ(referrer, data, cond, cb); } } void RemoveFromSendQ(const void* referrer) { - if (IsInitialized()) holder_->RemoveFromSendQ(referrer); + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + holder_->RemoveFromSendQ(referrer); } void RemoveFromReceiveQ(const void* referrer) { - if (IsInitialized()) holder_->RemoveFromReceiveQ(referrer); + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + holder_->RemoveFromReceiveQ(referrer); } inline bool IsInitialized() const { return holder_ != nullptr; } inline const std::type_index Type() { - PADDLE_ENFORCE_EQ(IsInitialized(), true); + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); return holder_->Type(); } diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h index 457abbf373..378a0bab1c 100644 --- a/paddle/fluid/framework/channel_impl.h +++ b/paddle/fluid/framework/channel_impl.h @@ -31,7 +31,7 @@ class ChannelImpl : public paddle::framework::Channel { public: virtual bool CanSend(); virtual bool CanReceive(); - virtual bool Send(T *); + virtual 
void Send(T *); virtual bool Receive(T *); virtual size_t Cap() { return cap_; } virtual void Lock(); @@ -76,10 +76,9 @@ class ChannelImpl : public paddle::framework::Channel { } }; - bool send_return(bool value) { + void send_return() { send_ctr--; destructor_cond_.notify_all(); - return value; } bool recv_return(bool value) { @@ -118,15 +117,15 @@ bool ChannelImpl::CanReceive() { } template -bool ChannelImpl::Send(T *item) { +void ChannelImpl::Send(T *item) { send_ctr++; std::unique_lock lock{mu_}; - // If channel is closed, do nothing + // If channel is closed, throw exception if (closed_) { lock.unlock(); - // TODO(abhinavarora) Should panic on closed channel - return send_return(false); + send_return(); + PADDLE_THROW("Cannot send on closed channel"); } // If there is a receiver, directly pass the value we want @@ -143,7 +142,7 @@ bool ChannelImpl::Send(T *item) { if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND); if (do_send) *(m->data) = std::move(*item); - else + else { // We cannot do the data transfer because // this QueueMessage was added by Select // and some other case was executed. @@ -151,12 +150,17 @@ bool ChannelImpl::Send(T *item) { // We do not care about notifying other // because they would have been notified // by the executed select case. - return send_return(Send(item)); + lock.unlock(); + Send(item); + send_return(); + return; + } // Wake up the blocked process and unlock m->Notify(); lock.unlock(); - return send_return(true); + send_return(); + return; } // Unbuffered channel will always bypass this @@ -167,7 +171,8 @@ bool ChannelImpl::Send(T *item) { buf_.push_back(std::move(*item)); // Release lock and return true lock.unlock(); - return send_return(true); + send_return(); + return; } // Block on channel, because some receiver will complete @@ -175,8 +180,12 @@ bool ChannelImpl::Send(T *item) { auto m = std::make_shared(item); sendq.push_back(m); m->Wait(lock); - // TODO(abhinavarora) Should panic on closed channel - return send_return(!m->chan_closed); + if (m->chan_closed) { + lock.unlock(); + send_return(); + PADDLE_THROW("Cannot send on closed channel"); + } + send_return(); } template diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc index 73be5cdbe2..e2380bb54b 100644 --- a/paddle/fluid/framework/channel_test.cc +++ b/paddle/fluid/framework/channel_test.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include - #include "gtest/gtest.h" using paddle::framework::Channel; @@ -41,7 +40,7 @@ void RecevingOrderEqualToSendingOrder(Channel *ch) { unsigned sum_send = 0; std::thread t([&]() { for (int i = 0; i < 5; i++) { - EXPECT_EQ(ch->Send(&i), true); + ch->Send(&i); sum_send += i; } }); @@ -61,7 +60,7 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) { const size_t buffer_size = 10; auto ch = MakeChannel(buffer_size); for (size_t i = 0; i < buffer_size; ++i) { - EXPECT_EQ(ch->Send(&i), true); // should not block + ch->Send(&i); } size_t out; @@ -82,7 +81,7 @@ void SendReceiveWithACloseChannelShouldPanic(Channel *ch) { const size_t data = 5; std::thread send_thread{[&]() { size_t i = data; - EXPECT_EQ(ch->Send(&i), true); // should not block + ch->Send(&i); // should not block }}; std::thread recv_thread{[&]() { @@ -94,12 +93,18 @@ void SendReceiveWithACloseChannelShouldPanic(Channel *ch) { send_thread.join(); recv_thread.join(); - // After closing send should return false. Receive should - // also return false as there is no data in queue. + // After closing send should panic. 
Receive should + // also false as there is no data in queue. CloseChannel(ch); send_thread = std::thread{[&]() { size_t i = data; - EXPECT_EQ(ch->Send(&i), false); // should return false + bool is_exception = false; + try { + ch->Send(&i); + } catch (paddle::platform::EnforceNotMet e) { + is_exception = true; + } + EXPECT_EQ(is_exception, true); }}; recv_thread = std::thread{[&]() { size_t i; @@ -129,7 +134,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { auto ch = MakeChannel(buffer_size); for (size_t i = 0; i < buffer_size; ++i) { - EXPECT_EQ(ch->Send(&i), true); // sending should not block + ch->Send(&i); // sending should not block } size_t out; @@ -160,9 +165,16 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { // Try to write more than buffer size. for (size_t i = 0; i < 2 * buffer_size; ++i) { if (i < buffer_size) - EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations - else - EXPECT_EQ(ch->Send(&i), false); + ch->Send(&i); // should block after 10 iterations + else { + bool is_exception = false; + try { + ch->Send(&i); + } catch (paddle::platform::EnforceNotMet e) { + is_exception = true; + } + EXPECT_EQ(is_exception, true); + } } }); std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec @@ -231,7 +243,13 @@ void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) { t[i] = std::thread( [&](bool *ended, bool *success) { int data = 10; - *success = ch->Send(&data); + bool is_exception = false; + try { + ch->Send(&data); + } catch (paddle::platform::EnforceNotMet e) { + is_exception = true; + } + *success = !is_exception; *ended = true; }, &thread_ended[i], &send_success[i]); @@ -316,8 +334,11 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) { // Try to send more number of times // than receivers for (int i = 0; i < 4; i++) { - ch->Send(&i); - sum_send += i; + try { + ch->Send(&i); + sum_send += i; + } catch (paddle::platform::EnforceNotMet e) { + } } }); for (int i = 0; i < 3; i++) { @@ -382,7 +403,13 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) { t[i] = std::thread( [&](bool *ended, bool *success) { int data = 10; - *success = ch->Send(&data); + bool is_exception = false; + try { + ch->Send(&data); + } catch (paddle::platform::EnforceNotMet e) { + is_exception = true; + } + *success = !is_exception; *ended = true; }, &thread_ended[i], &send_success[i]); @@ -508,7 +535,7 @@ void ChannelHolderSendReceive(ChannelHolder *ch) { unsigned sum_send = 0; std::thread t([&]() { for (int i = 0; i < 5; i++) { - EXPECT_EQ(ch->Send(&i), true); + ch->Send(&i); sum_send += i; } }); @@ -541,8 +568,22 @@ TEST(ChannelHolder, ChannelUninitializedTest) { ChannelHolder *ch = new ChannelHolder(); EXPECT_EQ(ch->IsInitialized(), false); int i = 10; - EXPECT_EQ(ch->Send(&i), false); - EXPECT_EQ(ch->Receive(&i), false); + bool send_exception = false; + try { + ch->Send(&i); + } catch (paddle::platform::EnforceNotMet e) { + send_exception = true; + } + EXPECT_EQ(send_exception, true); + + bool recv_exception = false; + try { + ch->Receive(&i); + } catch (paddle::platform::EnforceNotMet e) { + recv_exception = true; + } + EXPECT_EQ(recv_exception, true); + bool is_exception = false; try { ch->Type(); @@ -669,7 +710,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { t[i] = std::thread( [&](bool *ended, bool *success) { int data = 10; - *success = ch->Send(&data); + bool is_exception = false; + try { + ch->Send(&data); + } catch 
(paddle::platform::EnforceNotMet e) { + is_exception = true; + } + *success = !is_exception; *ended = true; }, &thread_ended[i], &send_success[i]); @@ -760,7 +807,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { t[i] = std::thread( [&](bool *ended, bool *success) { int data = 10; - *success = ch->Send(&data); + bool is_exception = false; + try { + ch->Send(&data); + } catch (paddle::platform::EnforceNotMet e) { + is_exception = true; + } + *success = !is_exception; *ended = true; }, &thread_ended[i], &send_success[i]); diff --git a/paddle/fluid/operators/channel_send_op.cc b/paddle/fluid/operators/channel_send_op.cc index 47cf7d7efc..66d33617ed 100644 --- a/paddle/fluid/operators/channel_send_op.cc +++ b/paddle/fluid/operators/channel_send_op.cc @@ -23,21 +23,10 @@ limitations under the License. */ static constexpr char Channel[] = "Channel"; static constexpr char X[] = "X"; -static constexpr char Status[] = "Status"; -static constexpr char copy[] = "copy"; namespace paddle { namespace operators { -void SetSendStatus(const platform::Place &dev_place, - framework::Variable &status_var, bool status) { - auto cpu = platform::CPUPlace(); - auto status_tensor = - status_var.GetMutable()->mutable_data({1}, - cpu); - status_tensor[0] = status; -} - class ChannelSendOp : public framework::OperatorBase { public: ChannelSendOp(const std::string &type, @@ -51,9 +40,6 @@ class ChannelSendOp : public framework::OperatorBase { "Input(Channel) of ChannelSendOp should not be null."); PADDLE_ENFORCE(ctx->HasInput(X), "Input(X) of ChannelSendOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput(Status), - "Output(Status) of ChannelSendOp should not be null."); - ctx->SetOutputDim("Status", {1}); } private: @@ -65,10 +51,7 @@ class ChannelSendOp : public framework::OperatorBase { auto input_var = scope.FindVar(Input(X)); // Send the input data through the channel. - bool ok = concurrency::ChannelSend(ch, input_var); - - // Set the status output of the `ChannelSend` call. - SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok); + concurrency::ChannelSend(ch, input_var); } }; @@ -82,12 +65,6 @@ class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable(); AddInput(X, "(Variable) The value which gets sent by the channel.") .AsDuplicable(); - AddOutput(Status, - "(Tensor) An LoD Tensor that returns a boolean status of the" - "result of the send operation.") - .AsDuplicable(); - AddAttr(copy, "(bool, default false) Should copy before send") - .SetDefault(false); AddComment(R"DOC( )DOC"); } diff --git a/paddle/fluid/operators/concurrency/channel_util.cc b/paddle/fluid/operators/concurrency/channel_util.cc index a483af7aff..246c99489c 100644 --- a/paddle/fluid/operators/concurrency/channel_util.cc +++ b/paddle/fluid/operators/concurrency/channel_util.cc @@ -17,20 +17,20 @@ limitations under the License. 
*/ namespace poc = paddle::operators::concurrency; -bool poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) { +void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) { auto type = framework::ToVarType(var->Type()); if (type == framework::proto::VarType_Type_LOD_TENSOR) - return ch->Send(var->GetMutable()); + ch->Send(var->GetMutable()); else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) - return ch->Send(var->GetMutable()); + ch->Send(var->GetMutable()); else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) - return ch->Send(var->GetMutable()); + ch->Send(var->GetMutable()); else if (type == framework::proto::VarType_Type_SELECTED_ROWS) - return ch->Send(var->GetMutable()); + ch->Send(var->GetMutable()); else if (type == framework::proto::VarType_Type_READER) - return ch->Send(var->GetMutable()); + ch->Send(var->GetMutable()); else if (type == framework::proto::VarType_Type_CHANNEL) - return ch->Send(var->GetMutable()); + ch->Send(var->GetMutable()); else PADDLE_THROW("ChannelSend:Unsupported type"); } diff --git a/paddle/fluid/operators/concurrency/channel_util.h b/paddle/fluid/operators/concurrency/channel_util.h index c3674bd981..cd18ca78c6 100644 --- a/paddle/fluid/operators/concurrency/channel_util.h +++ b/paddle/fluid/operators/concurrency/channel_util.h @@ -21,7 +21,7 @@ namespace paddle { namespace operators { namespace concurrency { -bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var); +void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var); bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var); void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 76cdb794cc..141a3eb935 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -166,7 +166,9 @@ void DoubleBufferReader::PrefetchThreadFunc() { std::swap(gpu_batch, batch.payloads_); } - if (!buffer_->Send(&batch)) { + try { + buffer_->Send(&batch); + } catch (paddle::platform::EnforceNotMet e) { VLOG(5) << "WARNING: The double buffer channel has been closed. The " "prefetch thread will terminate."; break; diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index 414c76fea0..b6ac7b21d5 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -146,14 +146,19 @@ void MultipleReader::PrefetchThreadFunc(std::string file_name, while (reader->HasNext()) { std::vector ins; reader->ReadNext(&ins); - if (!buffer_->Send(&ins)) { + try { + buffer_->Send(&ins); + } catch (paddle::platform::EnforceNotMet e) { VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch " "thread of file '" << file_name << "' will terminate."; break; } } - if (!available_thread_idx_->Send(&thread_idx)) { + + try { + available_thread_idx_->Send(&thread_idx); + } catch (paddle::platform::EnforceNotMet e) { VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. 
" "Fail to send thread_idx."; } diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py index d65e1a6858..a0f5ef2329 100644 --- a/python/paddle/fluid/concurrency.py +++ b/python/paddle/fluid/concurrency.py @@ -339,11 +339,6 @@ def channel_send(channel, value, is_copy=False): main_program = helper.main_program channel_send_block = main_program.current_block() - status = helper.create_variable( - name=unique_name.generate('status'), - type=core.VarDesc.VarType.LOD_TENSOR, - dtype=core.VarDesc.VarType.BOOL) - X = value if is_copy is True: @@ -359,15 +354,11 @@ def channel_send(channel, value, is_copy=False): type="assign_op", inputs={"X": value}, outputs={"Out": copied_X}) X = copied_X - channel_send_op = channel_send_block.append_op( - type="channel_send", - inputs={ + channel_send_block.append_op( + type="channel_send", inputs={ "Channel": channel, "X": X, - }, - outputs={"Status": status}) - - return status + }) def channel_recv(channel, return_value): From c7bf77d0e14ca1ec8caac53badb4f80adb8b02d1 Mon Sep 17 00:00:00 2001 From: Thuan Nguyen Date: Mon, 26 Mar 2018 19:18:21 -0700 Subject: [PATCH 55/58] Add in is_copy attribute to SelectCase. (#9393) This is a temporary solution to allowing for variables to be copied during a channel send operations. Also fixed issue with is_copy for "channel_send" method, and also updated unit tests. --- python/paddle/fluid/concurrency.py | 41 ++++++++++++++----- python/paddle/fluid/tests/test_concurrency.py | 23 ++--------- 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py index a0f5ef2329..470dd0df52 100644 --- a/python/paddle/fluid/concurrency.py +++ b/python/paddle/fluid/concurrency.py @@ -82,11 +82,14 @@ class SelectCase(object): RECEIVE = 2 def __init__(self, + select, case_idx, case_to_execute, channel_action_fn=None, channel=None, - value=None): + value=None, + is_copy=False): + self.select = select self.helper = LayerHelper('conditional_block') self.main_program = self.helper.main_program self.is_scalar_condition = True @@ -99,7 +102,24 @@ class SelectCase(object): self.action = (self.SEND if channel_action_fn.__name__ == ('channel_send') else self.RECEIVE) if channel_action_fn else self.DEFAULT - self.value = value + + X = value + if self.action == self.SEND and is_copy: + # We create of copy of the data we want to send + copied_X = self.select.parent_block.create_var( + name=unique_name.generate(value.name + '_copy'), + type=value.type, + dtype=value.dtype, + shape=value.shape, + lod_level=value.lod_level, + capacity=value.capacity + if hasattr(value, 'capacity') else None, ) + + self.select.parent_block.append_op( + type="assign", inputs={"X": value}, outputs={"Out": copied_X}) + X = copied_X + + self.value = X self.channel = channel def __enter__(self): @@ -173,6 +193,7 @@ class SelectCase(object): class Select(BlockGuard): def __init__(self, name=None): self.helper = LayerHelper('select', name=name) + self.parent_block = self.helper.main_program.current_block() self.cases = [] super(Select, self).__init__(self.helper.main_program) @@ -183,12 +204,12 @@ class Select(BlockGuard): super(Select, self).__enter__() return self - def case(self, channel_action_fn, channel, value): + def case(self, channel_action_fn, channel, value, is_copy=False): """Create a new block for this condition. 
""" - select_case = SelectCase( - len(self.cases), self.case_to_execute, channel_action_fn, channel, - value) + select_case = SelectCase(self, + len(self.cases), self.case_to_execute, + channel_action_fn, channel, value, is_copy) self.cases.append(select_case) @@ -197,7 +218,7 @@ class Select(BlockGuard): def default(self): """Create a default case block for this condition. """ - default_case = SelectCase(len(self.cases), self.case_to_execute) + default_case = SelectCase(self, len(self.cases), self.case_to_execute) self.cases.append(default_case) @@ -341,17 +362,17 @@ def channel_send(channel, value, is_copy=False): X = value - if is_copy is True: + if is_copy: copied_X = helper.create_variable( name=unique_name.generate(value.name + '_copy'), type=value.type, dtype=value.dtype, shape=value.shape, lod_level=value.lod_level, - capacity=value.capacity) + capacity=value.capacity if hasattr(value, 'capacity') else None) assign_op = channel_send_block.append_op( - type="assign_op", inputs={"X": value}, outputs={"Out": copied_X}) + type="assign", inputs={"X": value}, outputs={"Out": copied_X}) X = copied_X channel_send_block.append_op( diff --git a/python/paddle/fluid/tests/test_concurrency.py b/python/paddle/fluid/tests/test_concurrency.py index 924895a9af..e8f6cfb4a9 100644 --- a/python/paddle/fluid/tests/test_concurrency.py +++ b/python/paddle/fluid/tests/test_concurrency.py @@ -173,16 +173,10 @@ class TestRoutineOp(unittest.TestCase): with while_op.block(): result2 = fill_constant( shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) - x_to_send_tmp = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) - - # TODO(abhinav): Need to perform copy when doing a channel send. - # Once this is complete, we can remove these lines - assign(input=x, output=x_to_send_tmp) with fluid.Select() as select: - with select.case(fluid.channel_send, channel, - x_to_send_tmp): + with select.case( + fluid.channel_send, channel, x, is_copy=True): assign(input=x, output=x_tmp) assign(input=y, output=x) assign(elementwise_add(x=x_tmp, y=y), output=y) @@ -230,21 +224,12 @@ class TestRoutineOp(unittest.TestCase): core.VarDesc.VarType.LOD_TENSOR, core.VarDesc.VarType.FP64) - pong_result = self._create_tensor('pong_return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.FP64) - def ping(ch, message): - message_to_send_tmp = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.FP64, value=0) - - assign(input=message, output=message_to_send_tmp) - fluid.channel_send(ch, message_to_send_tmp) + fluid.channel_send(ch, message, is_copy=True) def pong(ch1, ch2): fluid.channel_recv(ch1, ping_result) - assign(input=ping_result, output=pong_result) - fluid.channel_send(ch2, pong_result) + fluid.channel_send(ch2, ping_result, is_copy=True) pings = fluid.make_channel( dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) From e0b5691e41f8dd28bdbf8d4ca7140824f918bec8 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 27 Mar 2018 11:10:53 +0800 Subject: [PATCH 56/58] Add drop_out_op unit test (#9364) --- paddle/fluid/operators/CMakeLists.txt | 1 + paddle/fluid/operators/dropout_op.cu | 5 +- paddle/fluid/operators/dropout_op_test.cc | 96 +++++++++++++++++++++++ 3 files changed, 99 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/dropout_op_test.cc diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9a11e1be70..8341170d68 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -264,3 +264,4 
@@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memor cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) +nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index 94382739b5..184c095e48 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -55,9 +55,6 @@ class GPUDropoutKernel : public framework::OpKernel { y->mutable_data(context.GetPlace()); float dropout_prob = context.Attr("dropout_prob"); - auto X = EigenMatrix::Reshape(*x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); - auto& place = *context.template device_context().eigen_device(); if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); @@ -76,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel { T><<>>( size, seed, dropout_prob, x_data, mask_data, y_data); } else { + auto X = EigenMatrix::Reshape(*x, 1); + auto Y = EigenMatrix::Reshape(*y, 1); Y.device(place) = X * static_cast(1.0f - dropout_prob); } } diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc new file mode 100644 index 0000000000..db97ba4f64 --- /dev/null +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(dropout); + +void Compare(f::Scope& scope, p::DeviceContext& ctx) { + // init + auto var = scope.Var("X"); + auto tensor = var->GetMutable(); + tensor->Resize({10, 10}); + + std::vector init; + for (int64_t i = 0; i < 10 * 10; ++i) { + init.push_back(1.0); + } + + TensorFromVector(init, ctx, tensor); + + auto place = ctx.GetPlace(); + auto out_var = scope.Var("Out"); + auto out_tensor = out_var->GetMutable(); + out_tensor->Resize({10, 10}); + out_tensor->mutable_data(place); // allocate + + auto mask_var = scope.Var("Mask"); + auto mask_tensor = mask_var->GetMutable(); + mask_tensor->Resize({10, 10}); + mask_tensor->mutable_data(place); // allocate + + // run + f::AttributeMap attrs; + float dropout_prob = 0.5; + attrs.insert({"fix_seed", 1}); + attrs.insert({"seed", 3}); + attrs.insert({"dropout_prob", dropout_prob}); + auto dropout_op = f::OpRegistry::CreateOp( + "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs); + + dropout_op->Run(scope, place); + + std::vector out_vec; + TensorToVector(*out_tensor, ctx, &out_vec); + + std::vector std_out = { + 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, + 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, + 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1}; + + EXPECT_EQ(out_vec.size(), std_out.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], std_out[i]); + } +} + +TEST(Dropout, CPUDense) { + f::Scope scope; + p::CPUPlace place; + p::CPUDeviceContext ctx(place); + Compare(scope, ctx); +} + +TEST(Dropout, GPUDense) { + f::Scope scope; + p::CUDAPlace place; + p::CUDADeviceContext ctx(place); + Compare(scope, ctx); +} From 123cf165fb031e8e0e9170c17ba59deb95e9dc76 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Tue, 27 Mar 2018 11:11:24 +0800 Subject: [PATCH 57/58] Set stop_gradient=True for some variables in SSD API. 
(#9396) --- python/paddle/fluid/layers/detection.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index cd519e1ee0..3e649dc5fd 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -134,6 +134,7 @@ def detection_output(loc, scores = nn.softmax(input=scores) scores = ops.reshape(x=scores, shape=old_shape) scores = nn.transpose(scores, perm=[0, 2, 1]) + scores.stop_gradient = True nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) helper.append_op( type="multiclass_nms", @@ -148,6 +149,7 @@ def detection_output(loc, 'score_threshold': score_threshold, 'nms_eta': 1.0 }) + nmsed_outs.stop_gradient = True return nmsed_outs @@ -837,4 +839,6 @@ def multi_box_head(inputs, mbox_locs_concat = tensor.concat(mbox_locs, axis=1) mbox_confs_concat = tensor.concat(mbox_confs, axis=1) + box.stop_gradient = True + var.stop_gradient = True return mbox_locs_concat, mbox_confs_concat, box, var From 25317bd312124cb3f26a2248c04215591d4e8446 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Tue, 27 Mar 2018 16:32:31 +0800 Subject: [PATCH 58/58] Make the first device share data with the global scope in parallel_do_op. (#9398) --- paddle/fluid/operators/parallel_do_op.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index 4001b9a130..b28c16b13f 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -144,7 +144,12 @@ class ParallelDoOp : public framework::OperatorBase { PADDLE_ENFORCE(scope.FindVar(param)->IsType(), "Only support parameter type as LoDTensor"); auto &src = scope.FindVar(param)->Get(); - for (size_t i = 0; i < sub_scopes.size(); ++i) { + + auto *sub_scope0 = sub_scopes[0]; + auto *dst0 = sub_scope0->Var(param)->GetMutable(); + dst0->ShareDataWith(src); + + for (size_t i = 1; i < sub_scopes.size(); ++i) { auto &place = places[i]; auto *sub_scope = sub_scopes[i]; auto *dst = sub_scope->Var(param)->GetMutable();