try to fix imperative orc unittest error; test=develop

347 changed files with 4767 additions and 12699 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License

-cmake_minimum_required(VERSION 3.15)
-cmake_policy(VERSION 3.10)
+cmake_minimum_required(VERSION 3.10)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
@ -39,6 +38,11 @@ endif()
 if (WITH_GPU  AND WITH_ASCEND)
    message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
 endif()
+# CMake 3.12-3.14 forward gcc link options to nvcc, which does not recognize them, so GPU builds reject those releases.
+if(WITH_GPU AND NOT (CMAKE_VERSION VERSION_LESS 3.12) AND (CMAKE_VERSION VERSION_LESS 3.15))
+    message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
+       "You can use cmake 3.16 (recommended), 3.10, 3.11, 3.15 or 3.17. Please refer to the install document: https://cmake.org/install/")
+endif()

 if(WITH_GPU AND NOT APPLE)
    enable_language(CUDA)
@ -57,7 +61,6 @@ if(WITH_MUSL)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
 endif()

-
 if(WIN32)
    option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)

@ -69,13 +72,6 @@ if(WIN32)
    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj")
    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj")

-    if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
-        set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /Zc:inline")
-        set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /Zc:inline")
-        set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline")
-        set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline")
-    endif()
-
    if (MSVC_STATIC_CRT)
        message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019")
        set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /MTd")
@ -92,7 +88,7 @@ if(WIN32)
            endif()
        endforeach(flag_var)
    endif()
-    
+
    # NOTE(Avin0323): Less parallel count result in faster compilation.
    math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
    # windows build turn off warnings, use parallel compiling.
@ -120,10 +116,6 @@ if(WIN32)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838")

-    foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS)
-        set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221")
-    endforeach(flag_var)
-
    if (WITH_WIN_DUMP_DBG)
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi")
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@ -74,7 +74,7 @@ endfunction()
 #   select_nvcc_arch_flags(out_variable)
 function(select_nvcc_arch_flags out_variable)
  # List of arch names
-  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "All" "Manual")
+  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
  set(archs_name_default "Auto")
  list(APPEND archs_names "Auto")

@ -91,7 +91,7 @@ function(select_nvcc_arch_flags out_variable)

  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
    set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
-    set(CUDA_ARCH_PTX ""                        CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
+    set(CUDA_ARCH_PTX "50"                     CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
  else()
    unset(CUDA_ARCH_BIN CACHE)
@ -108,8 +108,6 @@ function(select_nvcc_arch_flags out_variable)
    set(cuda_arch_bin "70")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
    set(cuda_arch_bin "75")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
-    set(cuda_arch_bin "80")
  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
    set(cuda_arch_bin ${paddle_known_gpu_archs})
  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@ -177,22 +175,14 @@ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) # CUDA 9.x
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
 elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.2) # CUDA 11.0/11.1
+elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs11})
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+
-  set(paddle_known_gpu_archs "${paddle_known_gpu_archs11} 86")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
 endif()

 if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
@ -208,11 +198,14 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}")
 message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}")

-# Set C++14 support
+# Set C++11 support
 set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
-set(CMAKE_CUDA_STANDARD 14)
+if(NOT WIN32)
+  # MSVC 2015 supports C++11 natively and does not recognize -std=c++11 / -fPIC (CMake adds -Xcompiler itself), so the CUDA standard is only requested off Windows.
+  set(CMAKE_CUDA_STANDARD 11)
+endif()

 # (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w
 # So replace /W[1-4] with /W0
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@ -94,7 +94,7 @@ macro(find_cudnn_version cudnn_header_file)
                "${CUDNN_MAJOR_VERSION} * 1000 +
                 ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
            message(STATUS "Current cuDNN header is ${cudnn_header_file} "
-              "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. ")
+              "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ")
        endif()
    endif()
 endmacro()
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@ -14,15 +14,11 @@

 INCLUDE(ExternalProject)

-IF(WITH_ROCM)
-    add_definitions(-DWARPCTC_WITH_HIP)
-ENDIF()
-
 SET(WARPCTC_PREFIX_DIR  ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_SOURCE_DIR  ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 set(WARPCTC_REPOSITORY  ${GIT_URL}/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG         c690fc5755abbdbdc98ef78d51ec10a6748a8cd1)
+set(WARPCTC_TAG         95a461eddeabd51099ef059dcfada1117eb1bfb8)

 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
    CACHE PATH "Warp-ctc Directory" FORCE)
@ -53,15 +49,14 @@ ExternalProject_Add(
    BUILD_ALWAYS    1
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_C_FLAGS=$<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>
-                    -DCMAKE_C_FLAGS_DEBUG=$<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>
-                    -DCMAKE_C_FLAGS_RELEASE=$<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>
-                    -DCMAKE_CXX_FLAGS=$<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>
-                    -DCMAKE_CXX_FLAGS_RELEASE=$<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>
-                    -DCMAKE_CXX_FLAGS_DEBUG=$<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                    -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
                    -DWITH_GPU=${WITH_GPU}
-                    -DWITH_ROCM=${WITH_ROCM}
                    -DWITH_OMP=${USE_OMP}
                    -DWITH_TORCH=OFF
                    -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@ -13,7 +13,7 @@ if(NOT XPU_SDK_ROOT)
  elseif(WITH_SUNWAY)
      SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
  else()
-      SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_03_30.tar.gz" CACHE STRING "" FORCE)
+      SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_02_27.tar.gz" CACHE STRING "" FORCE)
  endif()

  SET(XPU_SOURCE_DIR              "${THIRD_PARTY_PATH}/xpu")
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@ -4,10 +4,10 @@ include(CheckCCompilerFlag)
 include(CheckCXXSymbolExists)
 include(CheckTypeSize)

-function(CheckCompilerCXX14Flag)
+function(CheckCompilerCXX11Flag)
    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4)
-            message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.")
+        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
+            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
        elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2)
            message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2")
        endif()
@ -20,15 +20,23 @@ function(CheckCompilerCXX14Flag)
                message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.")
            endif()
        else()
-            if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4)
-                message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.")
+            if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
+                message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
            endif()
        endif()
    endif()
 endfunction()

-CheckCompilerCXX14Flag()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+CheckCompilerCXX11Flag()
+# Choose the host C++ standard: C++14 when building with CUDA 11.0 or newer,
+# C++11 for older CUDA toolkits and for builds without GPU support.
+# The version is quoted so an unset CMAKE_CUDA_COMPILER_VERSION falls back to
+# C++11 instead of producing a malformed if() at configure time.
+if (WITH_GPU AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_GREATER_EQUAL 11.0)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+endif()
 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -492,8 +492,10 @@ function(nv_library TARGET_NAME)
        message(FATAL "Please specify source file or library in nv_library.")
      endif()
    endif(nv_library_SRCS)
-    if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
-      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+    # Attach the custom props file only on Windows with CUDA older than 11.0 and MSVC_VERSION <= 1900;
+    # quoted operands keep the test well-formed when either version variable is unset.
+    if (WIN32 AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS 11.0 AND "${MSVC_VERSION}" LESS_EQUAL 1900)
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+    endif()
  endif()
 endfunction(nv_library)
@ -510,7 +512,7 @@ function(nv_binary TARGET_NAME)
      add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
      common_link(${TARGET_NAME})
    endif()
-    if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
+    if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
    endif()
  endif()
@ -537,7 +539,7 @@ function(nv_test TARGET_NAME)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-    if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
+    if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
    endif()
  endif()
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -192,15 +192,6 @@ include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
 copy(inference_lib_dist
        SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include/*
        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex64.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)

 # CAPI inference library for only inference
 set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING
--- a/cmake/init.cmake
+++ b/cmake/init.cmake
@ -18,10 +18,6 @@ if(NOT WIN32)
    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
    set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
 else()
-    # It has not been used now, it can specify CUDA compile flag manualy,
-    # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous
-    # because CUDA will update by nvidia, then error will occur.
-    # Now, it's used in CUDA:[10.0, 10.2]
    set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
 endif()

--- a/cmake/paddle_win.props
+++ b/cmake/paddle_win.props
@ -15,7 +15,7 @@
            <Warning>InheritFromHost</Warning>

            <BaseCommandLineTemplate>-ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions]</BaseCommandLineTemplate>
-            <BuildCommandLineTemplate>--use-local-env $(CudaClVersion)</BuildCommandLineTemplate>
+            <BuildCommandLineTemplate>--use-local-env --cl-version $(CudaClVersion)</BuildCommandLineTemplate>
            <BuildDynamicCommandLineTemplate>[CodeGeneration]</BuildDynamicCommandLineTemplate>
            <CleanCommandLineTemplate>-clean</CleanCommandLineTemplate>
            <!-- <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] $(CudaForceSynchronousPdbWrites) /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->
--- a/go/README_cn.md
+++ b/go/README_cn.md
@ -50,7 +50,6 @@ output_data := value.Interface().([][]float32)

 运行
 ```bash
-go mod init github.com/paddlepaddle
 export LD_LIBRARY_PATH=`pwd`/paddle_c/paddle/lib:$LD_LIBRARY_PATH
 go run ./demo/mobilenet.go
 ```
--- a/go/demo/mobilenet.go
+++ b/go/demo/mobilenet.go
@ -13,7 +13,7 @@
 // limitations under the License.
 package main

-import "github.com/paddlepaddle/paddle"
+import "../paddle"
 import "strings"
 import "io/ioutil"
 import "strconv"
--- a/go/paddle/common.go
+++ b/go/paddle/common.go
@ -15,7 +15,7 @@
 package paddle

 // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
+// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
 // #include <stdbool.h>
 // #include <paddle_c_api.h>
 import "C"
--- a/go/paddle/config.go
+++ b/go/paddle/config.go
@ -15,7 +15,7 @@
 package paddle

 // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
+// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
 // #include <stdbool.h>
 // #include <stdlib.h>
 // #include <paddle_c_api.h>
--- a/go/paddle/predictor.go
+++ b/go/paddle/predictor.go
@ -15,7 +15,7 @@
 package paddle

 // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
+// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
 // #include <stdbool.h>
 // #include "paddle_c_api.h"
 import "C"
@ -88,7 +88,7 @@ func (predictor *Predictor) GetInputNames() []string {
 }

 func (predictor *Predictor) GetOutputNames() []string {
-	names := make([]string, predictor.GetOutputNum())
+	names := make([]string, predictor.GetOutputNum()) // one slot per output; the loop below fills it via GetOutputName(i)
 	for i := 0; i < len(names); i++ {
 		names[i] = predictor.GetOutputName(i)
 	}
--- a/go/paddle/tensor.go
+++ b/go/paddle/tensor.go
@ -15,7 +15,7 @@
 package paddle

 // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
+// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
 // #include <stdbool.h>
 // #include <stdlib.h>
 // #include <string.h>
@ -209,7 +209,7 @@ func DecodeTensor(r *bytes.Reader, shape []int32, t reflect.Type, ptr reflect.Va
 		value := reflect.Indirect(ptr)
 		value.Set(reflect.MakeSlice(t, int(shape[0]), int(shape[0])))
 		if len(shape) == 1 && value.Len() > 0 {
-			switch value.Index(0).Kind() {
+			switch value.Index(0).Kind() { // only Len() > 0 is guaranteed here, so element 0 is the one safe to probe
 			case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32:
 				binary.Read(r, Endian(), value.Interface())
 				return
--- a/paddle/fluid/extension/include/ext_dispatch.h
+++ b/paddle/fluid/extension/include/ext_dispatch.h
@ -47,22 +47,6 @@ namespace paddle {
    }                                                                     \
  }()

-#define PD_DISPATCH_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...)                   \
-  [&] {                                                                        \
-    const auto& __dtype__ = TYPE;                                              \
-    switch (__dtype__) {                                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float,           \
-                           __VA_ARGS__)                                        \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double,          \
-                           __VA_ARGS__)                                        \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT16, paddle::float16, \
-                           __VA_ARGS__)                                        \
-      default:                                                                 \
-        PD_THROW("function " #NAME " is not implemented for data type `",      \
-                 ::paddle::ToString(__dtype__), "`");                          \
-    }                                                                          \
-  }()
-
 ///////// Integral Dispatch Marco ///////////

 #define PD_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...)                           \
@ -84,22 +68,6 @@ namespace paddle {
    }                                                                         \
  }()

-///////// Complex Dispatch Marco ///////////
-
-#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...)                         \
-  [&] {                                                                    \
-    const auto& __dtype__ = TYPE;                                          \
-    switch (__dtype__) {                                                   \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64,            \
-                           ::paddle::complex64, __VA_ARGS__)               \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128,           \
-                           ::paddle::complex128, __VA_ARGS__)              \
-      default:                                                             \
-        PD_THROW("function " #NAME " is not implemented for data type `" + \
-                 ::paddle::ToString(__dtype__) + "`");                     \
-    }                                                                      \
-  }()
-
 ///////// Floating and Integral Dispatch Marco ///////////

 #define PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...)              \
@ -125,55 +93,6 @@ namespace paddle {
    }                                                                         \
  }()

-///////// Floating and Complex Dispatch Marco ///////////
-
-#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...)            \
-  [&] {                                                                    \
-    const auto& __dtype__ = TYPE;                                          \
-    switch (__dtype__) {                                                   \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float,       \
-                           __VA_ARGS__)                                    \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double,      \
-                           __VA_ARGS__)                                    \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64,            \
-                           ::paddle::complex64, __VA_ARGS__)               \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128,           \
-                           ::paddle::complex128, __VA_ARGS__)              \
-      default:                                                             \
-        PD_THROW("function " #NAME " is not implemented for data type `" + \
-                 ::paddle::ToString(__dtype__) + "`");                     \
-    }                                                                      \
-  }()
-
-///////// Floating, Integral and Complex Dispatch Marco ///////////
-
-#define PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...)  \
-  [&] {                                                                       \
-    const auto& __dtype__ = TYPE;                                             \
-    switch (__dtype__) {                                                      \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float,          \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double,         \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t,          \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t,            \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t,          \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t,          \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64,               \
-                           ::paddle::complex64, __VA_ARGS__)                  \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128,              \
-                           ::paddle::complex128, __VA_ARGS__)                 \
-      default:                                                                \
-        PD_THROW("function " #NAME " is not implemented for data type `" +    \
-                 ::paddle::ToString(__dtype__) + "`");                        \
-    }                                                                         \
-  }()
-
 // TODO(chenweihang): Add more Marcos in the future if needed

 }  // namespace paddle
--- a/paddle/fluid/extension/include/ext_dtype.h
+++ b/paddle/fluid/extension/include/ext_dtype.h
@ -16,17 +16,10 @@ limitations under the License. */
 #include <cstdint>
 #include <string>

-#include "complex128.h"     // NOLINT
-#include "complex64.h"      // NOLINT
 #include "ext_exception.h"  // NOLINT
-#include "float16.h"        // NOLINT

 namespace paddle {

-using complex64 = paddle::platform::complex64;
-using complex128 = paddle::platform::complex128;
-using float16 = paddle::platform::float16;
-
 enum class DataType {
  BOOL,
  INT8,
@ -34,11 +27,8 @@ enum class DataType {
  INT16,
  INT32,
  INT64,
-  FLOAT16,
  FLOAT32,
  FLOAT64,
-  COMPLEX64,
-  COMPLEX128,
  // TODO(JiabinYang) support more data types if needed.
 };

@ -56,33 +46,24 @@ inline std::string ToString(DataType dtype) {
      return "int32_t";
    case DataType::INT64:
      return "int64_t";
-    case DataType::FLOAT16:
-      return "float16";
    case DataType::FLOAT32:
      return "float";
    case DataType::FLOAT64:
      return "double";
-    case DataType::COMPLEX64:
-      return "complex64";
-    case DataType::COMPLEX128:
-      return "complex128";
    default:
      PD_THROW("Unsupported paddle enum data type.");
  }
 }

-#define PD_FOR_EACH_DATA_TYPE(_)    \
-  _(bool, DataType::BOOL)           \
-  _(int8_t, DataType::INT8)         \
-  _(uint8_t, DataType::UINT8)       \
-  _(int16_t, DataType::INT16)       \
-  _(int, DataType::INT32)           \
-  _(int64_t, DataType::INT64)       \
-  _(float16, DataType::FLOAT16)     \
-  _(float, DataType::FLOAT32)       \
-  _(double, DataType::FLOAT64)      \
-  _(complex64, DataType::COMPLEX64) \
-  _(complex128, DataType::COMPLEX128)
+#define PD_FOR_EACH_DATA_TYPE(_) \
+  _(bool, DataType::BOOL)        \
+  _(int8_t, DataType::INT8)      \
+  _(uint8_t, DataType::UINT8)    \
+  _(int16_t, DataType::INT16)    \
+  _(int, DataType::INT32)        \
+  _(int64_t, DataType::INT64)    \
+  _(float, DataType::FLOAT32)    \
+  _(double, DataType::FLOAT64)

 template <paddle::DataType T>
 struct DataTypeToCPPType;
--- a/paddle/fluid/extension/include/ext_op_meta_info.h
+++ b/paddle/fluid/extension/include/ext_op_meta_info.h
--- a/paddle/fluid/extension/include/ext_tensor.h
+++ b/paddle/fluid/extension/include/ext_tensor.h
@ -52,9 +52,6 @@ class PD_DLL_DECL Tensor {
  /// \brief Construct a Tensor on target Place for CustomOp.
  /// Generally it's only used for user to create Tensor.
  explicit Tensor(const PlaceType& place);
-  /// \brief Construct a Tensor on target Place with shape for CustomOp.
-  /// Generally it's only used for user to create Tensor.
-  Tensor(const PlaceType& place, const std::vector<int64_t>& shape);
  /// \brief Reset the shape of the tensor.
  /// Generally it's only used for the input tensor.
  /// Reshape must be called before calling
--- a/paddle/fluid/extension/src/ext_tensor.cc
+++ b/paddle/fluid/extension/src/ext_tensor.cc
@ -13,16 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/extension/include/ext_tensor.h"
-
 #include <utility>
-
 #include "paddle/fluid/framework/custom_tensor_utils.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/complex128.h"
-#include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/transform.h"

 namespace paddle {
@ -102,32 +97,13 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,

 void Tensor::reshape(const std::vector<int64_t> &shape) {
  GET_CASTED_TENSOR
-  auto new_dim = framework::make_ddim(shape);
-  if (tensor->numel() != framework::product(new_dim)) {
-    LOG(WARNING) << "Custom Op: Calling reshape to a new shape which is bigger "
-                    "or smaller"
-                 << "than original shape will not change your tensor's memory "
-                    "Please call"
-                 << "paddle::Tensor::mutable_data<T>() after to reallocate "
-                    "your tensor's size."
-                 << std::endl;
-  }
-  tensor->Resize(new_dim);
+  tensor->Resize(framework::make_ddim(shape));
 }

 Tensor::Tensor(const PlaceType &place)
    : tensor_(std::make_shared<framework::LoDTensor>()),
      place_(place),
      stream_(StreamWrapper()) {}
-
-Tensor::Tensor(const PlaceType &place, const std::vector<int64_t> &shape)
-    : tensor_(std::make_shared<framework::LoDTensor>()),
-      place_(place),
-      stream_(StreamWrapper()) {
-  GET_CASTED_TENSOR
-  tensor->Resize(framework::make_ddim(shape));
-}
-
 template <typename T>
 T *Tensor::mutable_data(const PlaceType &place) {
  place_ = place;
@ -186,12 +162,6 @@ DataType Tensor::type() const {
    return DataType::FLOAT64;
  } else if (type == framework::proto::VarType::BOOL) {
    return DataType::BOOL;
-  } else if (type == framework::proto::VarType::COMPLEX64) {
-    return DataType::COMPLEX64;
-  } else if (type == framework::proto::VarType::COMPLEX128) {
-    return DataType::COMPLEX128;
-  } else if (type == framework::proto::VarType::FP16) {
-    return DataType::FLOAT16;
  }
  // TODO(JiabinYang) Support more dtype here
  return DataType::FLOAT32;
@ -247,12 +217,6 @@ template PD_DLL_DECL Tensor
 Tensor::copy_to<int16_t>(const PlaceType &target_place) const;
 template PD_DLL_DECL Tensor
 Tensor::copy_to<bool>(const PlaceType &target_place) const;
-template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex64>(
-    const PlaceType &target_place) const;
-template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex128>(
-    const PlaceType &target_place) const;
-template PD_DLL_DECL Tensor
-Tensor::copy_to<paddle::platform::float16>(const PlaceType &target_place) const;

 template PD_DLL_DECL float *Tensor::data<float>() const;
 template PD_DLL_DECL double *Tensor::data<double>() const;
@ -262,12 +226,6 @@ template PD_DLL_DECL uint8_t *Tensor::data<uint8_t>() const;
 template PD_DLL_DECL int8_t *Tensor::data<int8_t>() const;
 template PD_DLL_DECL int16_t *Tensor::data<int16_t>() const;
 template PD_DLL_DECL bool *Tensor::data<bool>() const;
-template PD_DLL_DECL paddle::platform::complex64 *
-Tensor::data<paddle::platform::complex64>() const;
-template PD_DLL_DECL paddle::platform::complex128 *
-Tensor::data<paddle::platform::complex128>() const;
-template PD_DLL_DECL paddle::platform::float16 *
-Tensor::data<paddle::platform::float16>() const;

 template PD_DLL_DECL float *Tensor::mutable_data<float>();
 template PD_DLL_DECL double *Tensor::mutable_data<double>();
@ -277,12 +235,6 @@ template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>();
 template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>();
 template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>();
 template PD_DLL_DECL bool *Tensor::mutable_data<bool>();
-template PD_DLL_DECL paddle::platform::complex64 *
-Tensor::mutable_data<paddle::platform::complex64>();
-template PD_DLL_DECL paddle::platform::complex128 *
-Tensor::mutable_data<paddle::platform::complex128>();
-template PD_DLL_DECL paddle::platform::float16 *
-Tensor::mutable_data<paddle::platform::float16>();

 template PD_DLL_DECL float *Tensor::mutable_data<float>(const PlaceType &place);
 template PD_DLL_DECL double *Tensor::mutable_data<double>(
@ -298,12 +250,6 @@ template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>(
 template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>(
    const PlaceType &place);
 template PD_DLL_DECL bool *Tensor::mutable_data<bool>(const PlaceType &place);
-template PD_DLL_DECL paddle::platform::complex64 *
-Tensor::mutable_data<paddle::platform::complex64>(const PlaceType &place);
-template PD_DLL_DECL paddle::platform::complex128 *
-Tensor::mutable_data<paddle::platform::complex128>(const PlaceType &place);
-template PD_DLL_DECL paddle::platform::float16 *
-Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);

 std::vector<int64_t> Tensor::shape() const {
  GET_CASTED_TENSOR
@ -364,21 +310,6 @@ Tensor Tensor::cast(const DataType &target_type) const {
      framework::VisitDataType(
          dst_type, CastDataType<uint8_t>(*tensor, rlt_tensor_, ctx));
      break;
-    case framework::proto::VarType::COMPLEX64:
-      framework::VisitDataType(
-          dst_type,
-          CastDataType<paddle::platform::complex64>(*tensor, rlt_tensor_, ctx));
-      break;
-    case framework::proto::VarType::COMPLEX128:
-      framework::VisitDataType(dst_type,
-                               CastDataType<paddle::platform::complex128>(
-                                   *tensor, rlt_tensor_, ctx));
-      break;
-    case framework::proto::VarType::FP16:
-      framework::VisitDataType(
-          dst_type,
-          CastDataType<paddle::platform::float16>(*tensor, rlt_tensor_, ctx));
-      break;
    // TODO(JiabinYang) Support more dtype here
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -346,25 +346,57 @@ message(STATUS "branch: ${PADDLE_BRANCH}")

 configure_file(commit.h.in commit.h)

-# Adapt to custom op mechanism: Include the header files related to the data type
-# to avoid exposing the path of the underlying file
-include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include)
-
 cc_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce)
 cc_library(op_meta_info SRCS ../extension/src/ext_op_meta_info.cc DEPS custom_tensor)
 cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper custom_tensor op_meta_info)
 cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog)

+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include)
+
 set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator)

 cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})

+# Old custom op extension mechanism related, will be removed in 2.1.0.
+# Builds the framework as a shared library whose output name is
+# `paddle_framework`, from executor.cc, operator.cc, the C API and
+# imperative/layer.cc.
+cc_library(paddle_framework_shared
+    SHARED SRCS executor.cc operator.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/c/c_api.cc
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc
+    DEPS ${FLUID_FRAMEWORK_MODULES})
+get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+set_target_properties(paddle_framework_shared PROPERTIES OUTPUT_NAME paddle_framework)
+# NOTE(review): keyword-less signature kept on purpose; cc_library presumably
+# links with the plain signature as well, and the two forms cannot be mixed on
+# one target -- confirm before adding PRIVATE/PUBLIC here.
+target_link_libraries(paddle_framework_shared ${os_dependency_modules})
+
+# The variables below cache (INTERNAL) the expected on-disk location of the
+# library built above, one branch per platform.
+if (LINUX)
+  set(FLUID_FRAMEWORK_SHARED_LIB
+      ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.so
+      CACHE INTERNAL "Fluid framework lib")
+endif()
+
+if (WIN32)
+  # Ninja writes artifacts directly into the binary dir; other generators are
+  # expected to use a per-configuration subdirectory.
+  # NOTE(review): the non-Ninja path relies on CMAKE_BUILD_TYPE being set,
+  # which multi-config generators do not guarantee -- confirm.
+  if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
+    set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR})
+  else()
+    set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE})
+  endif()
+  set(FLUID_FRAMEWORK_IMPORT_LIB
+      ${paddle_framework_lib_path}/paddle_framework.lib
+      CACHE INTERNAL "Fluid framework lib")
+  set(FLUID_FRAMEWORK_SHARED_LIB
+      ${paddle_framework_lib_path}/paddle_framework.dll
+      CACHE INTERNAL "Fluid framework dll")
+endif()
+
+if(APPLE)
+  set(FLUID_FRAMEWORK_SHARED_LIB
+      ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dylib
+      CACHE INTERNAL "Fluid framework lib")
+endif()
 if(WITH_TESTING AND TEST selected_rows_test)
  set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120)
 endif()

-##### 2.0 New custom op extension mechanism related #####
+# New custom op extension mechanism related

 # if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_
 set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer)
--- a/paddle/fluid/framework/c/c_api.cc
+++ b/paddle/fluid/framework/c/c_api.cc
@ -0,0 +1,53 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/c/c_api.h"
+
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
+extern "C" {
+
+// Hands out the OpInfoMap singleton obtained from OpInfoMap::Instance().
+paddle::framework::OpInfoMap &PD_GetOpInfoMap() {
+  auto &op_info_map = paddle::framework::OpInfoMap::Instance();
+  return op_info_map;
+}
+
+// Forwards `pool` to DeviceContextPool::SetPool.
+void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool) {
+  using paddle::platform::DeviceContextPool;
+  DeviceContextPool::SetPool(pool);
+}
+
+// Runs the grad op maker registered for `op_desc`'s type and returns every
+// produced grad op desc serialized to a string, in the order the maker
+// returned them. Yields an empty vector when the op type has no grad op
+// maker; raises an Unavailable error if any desc fails to serialize.
+std::vector<std::string> PD_GetGradOpDescStrs(
+    const paddle::framework::OpDesc &op_desc,
+    const std::unordered_set<std::string> &no_grad_set,
+    std::unordered_map<std::string, std::string> *grad_to_var,
+    const std::vector<paddle::framework::BlockDesc *> &grad_block) {
+  auto &info = PD_GetOpInfoMap().Get(op_desc.Type());
+  std::vector<std::string> serialized;
+  if (!info.grad_op_maker_) {
+    // Nothing to differentiate: no maker registered for this op type.
+    return serialized;
+  }
+
+  auto grad_descs =
+      info.grad_op_maker_(op_desc, no_grad_set, grad_to_var, grad_block);
+  const size_t desc_count = grad_descs.size();
+  serialized.resize(desc_count);
+  for (size_t idx = 0; idx < desc_count; ++idx) {
+    // Each desc is serialized straight into its slot of the result.
+    PADDLE_ENFORCE_EQ(
+        grad_descs[idx]->Proto()->SerializePartialToString(&serialized[idx]),
+        true, paddle::platform::errors::Unavailable(
+                  "Cannot serialize operator desc message."));
+  }
+  return serialized;
+}
+
+}  // end extern "C"
--- a/paddle/fluid/framework/c/c_api.h
+++ b/paddle/fluid/framework/c/c_api.h
@ -0,0 +1,55 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/platform/device_context.h"
+
+// Forward declarations of the two framework types named in the signatures
+// below.
+namespace paddle {
+namespace framework {
+class OpInfoMap;
+}  // namespace framework
+namespace platform {
+class DeviceContextPool;
+}  // namespace platform
+}  // namespace paddle
+
+// The functions below are declared with C linkage, but their signatures use
+// C++ types (references, std::vector, std::string), so they can only be
+// called from C++ code.
+// NOTE(review): presumably C linkage is used so the symbols can be looked up
+// by their plain names from a dynamically loaded library -- confirm against
+// callers.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// C-API to get global OpInfo map.
+// Returns a reference to the map, not a copy.
+paddle::framework::OpInfoMap &PD_GetOpInfoMap();
+
+// C-API to init global DeviceContextPool from outside.
+// `pool` is passed on to DeviceContextPool::SetPool.
+void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool);
+
+// C-API to serialize the grad op protocol message to a binary string.
+// Returns one string per grad op desc produced for `op_desc`; the result is
+// empty when the op type has no grad op maker. `no_grad_set`, `grad_to_var`
+// and `grad_block` are handed to the grad op maker unchanged.
+std::vector<std::string> PD_GetGradOpDescStrs(
+    const paddle::framework::OpDesc &op_desc,
+    const std::unordered_set<std::string> &no_grad_set,
+    std::unordered_map<std::string, std::string> *grad_to_var,
+    const std::vector<paddle::framework::BlockDesc *> &grad_block);
+
+#ifdef __cplusplus
+}
+#endif
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@ -28,6 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/extension/include/ext_tensor.h"
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/custom_tensor_utils.h"
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/op_meta_info_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@ -177,7 +178,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
          "Unsupported `%s` type value as custom attribute now. "
          "Supported data types include `bool`, `int`, `float`, "
          "`int64_t`, `std::string`, `std::vector<int>`, "
-          "`std::vector<float>`, `std::vector<int64_t>`, "
+          "`std::vector<float>`, `std::vector<int64_t>`, "
          "`std::vector<std::string>`, Please check whether "
          "the attribute data type and data type string are matched.",
          attr_type_str));
@ -326,7 +327,7 @@ class CustomOpMaker : public OpProtoAndCheckerMaker {
            "Unsupported `%s` type value as custom attribute now. "
            "Supported data types include `bool`, `int`, `float`, "
            "`int64_t`, `std::string`, `std::vector<int>`, "
-            "`std::vector<float>`, `std::vector<int64_t>`, "
+            "`std::vector<float>`, `std::vector<int64_t>`, "
            "`std::vector<std::string>`, Please check whether "
            "the attribute data type and data type string are matched.",
            attr_type_str));
@ -580,7 +581,7 @@ void RegisterOperatorWithMetaInfo(
      ctx->ShareDim(op_inputs[0], op_outputs[0]);
    };
  } else {
-    info.infer_shape_ = [op_inputs, op_outputs, op_attrs,
+    info.infer_shape_ = [op_inputs, op_outputs,
                         infer_shape_func](InferShapeContext* ctx) {
      std::vector<std::vector<int64_t>> input_shapes;
      std::vector<std::vector<std::vector<int64_t>>> vec_input_shapes;
@ -605,50 +606,8 @@ void RegisterOperatorWithMetaInfo(
        }
      }

-      std::vector<boost::any> custom_attrs;
-      for (auto& attr_str : op_attrs) {
-        auto attr_name_and_type = detail::ParseAttrStr(attr_str);
-        auto attr_name = attr_name_and_type[0];
-        auto attr_type_str = attr_name_and_type[1];
-        if (attr_type_str == "bool") {
-          custom_attrs.emplace_back(ctx->Attrs().Get<bool>(attr_name));
-        } else if (attr_type_str == "int") {
-          custom_attrs.emplace_back(ctx->Attrs().Get<int>(attr_name));
-        } else if (attr_type_str == "float") {
-          custom_attrs.emplace_back(ctx->Attrs().Get<float>(attr_name));
-        } else if (attr_type_str == "int64_t") {
-          custom_attrs.emplace_back(ctx->Attrs().Get<int64_t>(attr_name));
-        } else if (attr_type_str == "std::string") {
-          custom_attrs.emplace_back(ctx->Attrs().Get<std::string>(attr_name));
-        } else if (attr_type_str == "std::vector<int>") {
-          custom_attrs.emplace_back(
-              ctx->Attrs().Get<std::vector<int>>(attr_name));
-        } else if (attr_type_str == "std::vector<float>") {
-          custom_attrs.emplace_back(
-              ctx->Attrs().Get<std::vector<float>>(attr_name));
-        } else if (attr_type_str == "std::vector<int64_t>") {
-          // NOTE(chenweihang): InferShape can't support std::vector<int64_t>
-          // attr type, because the input type is std::vector<int64_t>, only
-          // can use one rule to parse std::vector<int64_t> parameter
-          continue;
-        } else if (attr_type_str == "std::vector<std::string>") {
-          custom_attrs.emplace_back(
-              ctx->Attrs().Get<std::vector<std::string>>(attr_name));
-        } else {
-          PADDLE_THROW(platform::errors::Unimplemented(
-              "Unsupported `%s` type value as custom attribute now. "
-              "Supported data types include `bool`, `int`, `float`, "
-              "`int64_t`, `std::string`, `std::vector<int>`, "
-              "`std::vector<float>`, `std::vector<std::string>`, "
-              "Please check whether the attribute data type and "
-              "data type string are matched.",
-              attr_type_str));
-        }
-      }
-
      VLOG(1) << "Custom Operator: InferShape - calc output ddim.";
-      auto output_shapes =
-          infer_shape_func(input_shapes, vec_input_shapes, custom_attrs);
+      auto output_shapes = infer_shape_func(input_shapes, vec_input_shapes);

      VLOG(1) << "Custom Operator: InferShape - set output ddim.";
      for (size_t i = 0; i < op_outputs.size(); ++i) {
@ -798,39 +757,10 @@ void RegisterOperatorWithMetaInfo(
      return new CustomOperator(type, inputs, outputs, attrs);
    };

-    // Grad InferShape
-    grad_info.infer_shape_ = [grad_op_inputs,
-                              grad_op_outputs](InferShapeContext* ctx) {
-      // 1. if forward input exists, gradient's shape is same with forward input
-      // default
-      //    [Suitable for most situations]
-      // 2. if forward input not exists, and only contains one grad input and
-      // output,
-      //    use grad input shape as grad output shape
-      //    [Suitable for the situation that forward input is not used as
-      //    backward input]
-      // TODO(chenweihang): support set grad op infershape func if needed
+    // Grad InferShape (gradient's shape is same with forward input default)
+    grad_info.infer_shape_ = [grad_op_outputs](InferShapeContext* ctx) {
      for (auto& out_name : grad_op_outputs) {
-        auto fwd_name = detail::NoGrad(out_name);
-        if (detail::IsDuplicableVar(fwd_name)) {
-          // Duplicable forward var must as backward input
-          ctx->ShareDim(fwd_name, out_name);
-        } else {
-          if (ctx->HasInput(fwd_name)) {
-            ctx->ShareDim(fwd_name, out_name);
-          } else {
-            PADDLE_ENFORCE_EQ(
-                grad_op_inputs.size() == 1UL && grad_op_outputs.size() == 1UL,
-                true,
-                platform::errors::Unavailable(
-                    "Custom grad operator infershape error. "
-                    "If a custom grad operator contains only one input and "
-                    "only one output, the input shape will be directly set to "
-                    "the output shape. Otherwise, Please set the forward input "
-                    "as the grad operator's input."));
-            ctx->ShareDim(grad_op_inputs[0], out_name);
-          }
-        }
+        ctx->ShareDim(detail::NoGrad(out_name), out_name);
      }
    };

--- a/Show More
+++ b/Show More