Compare commits

...

52 Commits

Author SHA1 Message Date
wuhuanzhou 587d99ae44
update compilation with C++14 (#31815)
5 years ago
tianshuo78520a b09c1ce09a
fix whl package push pypi (#31585)
5 years ago
Thunderbrook 393b3bd6b7
fix split core (#31892)
5 years ago
wuhuanzhou 3a95a0bc26
update cmake minimum version to 3.15 (#31807)
5 years ago
taixiurong 52b05baca3
fix some bugs in transformer training in xpu (#31918)
5 years ago
Wenyu 5394194e3a
support minus-int idx to LayerList (#31750)
5 years ago
furnace ef8323d49e
[ROCM] Add ROCm support for warpctc op (#31817)
5 years ago
Jiawei Wang 95f808c878
fix stack op grad nullptr (#31962)
5 years ago
liym27 57d4288ad4
[dynamic setitem] Fix bug of dynamic setitem: Decrease axes to do right broadcast (#31960)
5 years ago
石晓伟 0fa6c8a35c
fix a syntax error, test=develop (#31930)
5 years ago
Pei Yang 98e803e04f
map_matmul_to_mul_pass support 3dim (#31958)
5 years ago
wuhuanzhou a37a7f67e1
modify CI recommend information (#31395)
5 years ago
jakpiase 6dca7a1de7
Added int8 kernel for oneDNN LSTM op (#31894)
5 years ago
Pei Yang 14b7e3cf06
[Paddle-TRT] TRT inference support for BERT/Transformer in paddle 2.0 api (#31744)
5 years ago
Zhou Wei 245252b86e
fix bug when dtype of to_tensor is core.VarType (#31931)
5 years ago
Zhen Wang e1f931610e
Fix save/load error in imperative qat UT. (#31937)
5 years ago
Yiqun Liu e50bc2c2a6
Enhance cmake to support specifying CUDA_ARCH_NAME to Ampere. (#31923)
5 years ago
Zhou Wei 04a49b097e
[Custom OP]Remove old custom OP and reduce whl package volume (#31813)
5 years ago
wangguanzhong fe2848686b
add exclusive for test_conv2d_op, test=develop (#31936)
5 years ago
chajchaj 73a6fa3ed0
add deprecated for softmax_with_cross_entropy (#31722)
5 years ago
Shang Zhizhou 8084b7594b
fix batchnorm when input dims < 3 (#31933)
5 years ago
zlsh80826 64ee255ffd
[Paddle-TRT] yolobox (#31755)
5 years ago
Aurelius84 c4b60efabd
Fix segment Fault from set_value (#31891)
5 years ago
wuhuanzhou 17030ff28b
fix op benchmark ci error caused by missing test_pr branch, test=document_fix (#31920)
5 years ago
niuliling123 a71d72d921
relu forward and backward with vectortype (#31869)
5 years ago
tianshuo78520a 8829a309fe
Delete cudnn6 code (#31835)
5 years ago
wanghuancoder b48841ba2e
modify API nn.Bilinear's doc (#31889)
5 years ago
liym27 525c32e33c
Fix bug of set_value op: Decrease axes to do right broadcast (#31875)
5 years ago
ronnywang 123949eb48
[ROCM] added a cudnn switch of conv2d for rocm platform (#31836)
5 years ago
Shang Zhizhou 61805d8f0a
fix cmake model path (#31866)
5 years ago
Jiabin Yang 51eb29de18
[CustomOP] Add shape related constructor for Tensor (#31681)
5 years ago
zlsh80826 e3a38d790a
[Paddle-TRT] roi_align_plugin (#31732)
5 years ago
zlsh80826 bfb5cf5567
[Paddle-TRT] trt affine channel converter (#31628)
5 years ago
cc b47478efc2
[dygraph qat] Use layer to calculate output scale (#31861)
5 years ago
lilong12 c3974d0e2a
[3D-parallel] Reformat pipeline parallel (#31786)
5 years ago
zlsh80826 01aa252624
[Paddle-TRT] multiclass nms (#31742)
5 years ago
Wilber 70b67f1029
fix go api bug. (#31857)
5 years ago
tianshuo78520a e804f08559
delete include framework.pb.h (#31859)
5 years ago
Chengmo f58cb01864
【Paddle.Fleet】fix dataset zip py3 bug (#31441)
5 years ago
Kaipeng Deng bf09dcb346
add GPU tensor notice & update default_collate_fn/default_convert_fn. test=develop (#31763)
5 years ago
Chen Weihang 27f2d8df8e
Polish two error messages (#31852)
5 years ago
Zhou Wei 511e204e62
LRScheduler.get_lr should not update lr in LinearWarmup (#31843)
5 years ago
niuliling123 6472d62093
Revert "add relu forward kernel and backward kernel (#31613)" (#31853)
5 years ago
winter-wang e7f28d6c0d
fix runtime crash in rnn model inference, test=develop (#31833)
5 years ago
parap1uie-s 5d89ec36dc
Update pooling.py (#31829)
5 years ago
Huihuang Zheng 649868ffb2
[Dy2stat] Fix the bug that loop_body_func may return single element (#31806)
5 years ago
Wojciech Uss e5f7a834d4
fix cache key in concat oneDNN kernel (#31820)
5 years ago
Aurelius84 f2cfc0f46d
[CustomOp]Avoid raising warning while import paddle (#31804)
5 years ago
cc 84a551380e
[dygraph qat] Refine saving output scale to infer program (#31784)
5 years ago
Chen Weihang 68497e7b39
change trainable to stop_gradient in optimizer (#31823)
5 years ago
ronnywang 270699e647
[ROCM] fix test_matmul_v2_op (#31802)
5 years ago
Zhou Wei 1eb927f935
Restore the third-party library cache for windows (#31811)
5 years ago

@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License
cmake_minimum_required(VERSION 3.10)
cmake_minimum_required(VERSION 3.15)
cmake_policy(VERSION 3.10)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
@ -38,11 +39,6 @@ endif()
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compiling with GPU and ASCEND at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
"You can use cmake 3.16 (recommended), 3.10, 3.11, 3.15 or 3.17. Please refer to the install document: https://cmake.org/install/")
endif()
if(WITH_GPU AND NOT APPLE)
enable_language(CUDA)
@ -61,7 +57,6 @@ if(WITH_MUSL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
endif()
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zm1000 /fp:fast")
if(WIN32)
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)

@ -74,7 +74,7 @@ endfunction()
# select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable)
# List of arch names
set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "All" "Manual")
set(archs_name_default "Auto")
list(APPEND archs_names "Auto")
@ -108,6 +108,8 @@ function(select_nvcc_arch_flags out_variable)
set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
set(cuda_arch_bin "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
set(cuda_arch_bin "80")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
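With this addition, configuring with -DCUDA_ARCH_NAME=Ampere selects compute capability 8.0 (sm_80), i.e. A100-class GPUs, instead of requiring a Manual arch list.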
@ -206,14 +208,11 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}")
message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}")
# Set C++11 support
# Set C++14 support
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
if (NOT WIN32) # windows msvc2015 support c++11 natively.
# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake.
set(CMAKE_CUDA_STANDARD 11)
endif(NOT WIN32)
set(CMAKE_CUDA_STANDARD 14)
# (Note) For Windows, if /W[1-4] is deleted, /W1 will be added by default and conflict with -w,
# so replace /W[1-4] with /W0
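For context, here is a minimal standalone snippet (not part of this patch) showing C++14 features that a C++11 toolchain rejects, which is what the CMAKE_CUDA_STANDARD bump enables:

```cpp
#include <memory>
#include <vector>

int main() {
  // std::make_unique is C++14-only; C++11 provides only std::make_shared.
  auto v = std::make_unique<std::vector<int>>(3, 7);  // {7, 7, 7}
  // Generic lambdas (auto parameters) are also a C++14 feature.
  auto add = [](auto a, auto b) { return a + b; };
  return add((*v)[0], 0) == 7 ? 0 : 1;  // exits 0
}
```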

@ -94,7 +94,7 @@ macro(find_cudnn_version cudnn_header_file)
"${CUDNN_MAJOR_VERSION} * 1000 +
${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
message(STATUS "Current cuDNN header is ${cudnn_header_file} "
"Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ")
"Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. ")
endif()
endif()
endmacro()
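For example, cuDNN 7.6.5 yields 7 * 1000 + 6 * 100 + 5 = 7605, and the status message now reports the full v7.6.5 rather than truncating the patch level to v7.6.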

@ -14,11 +14,15 @@
INCLUDE(ExternalProject)
IF(WITH_ROCM)
add_definitions(-DWARPCTC_WITH_HIP)
ENDIF()
SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
set(WARPCTC_TAG cd828e5b6c3b953b82af73f7f44cddc393a20efa)
set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
@ -57,6 +61,7 @@ ExternalProject_Add(
-DCMAKE_CXX_FLAGS_DEBUG=$<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-DWITH_GPU=${WITH_GPU}
-DWITH_ROCM=${WITH_ROCM}
-DWITH_OMP=${USE_OMP}
-DWITH_TORCH=OFF
-DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON

@ -13,7 +13,7 @@ if(NOT XPU_SDK_ROOT)
elseif(WITH_SUNWAY)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
else()
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_02_27.tar.gz" CACHE STRING "" FORCE)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_03_30.tar.gz" CACHE STRING "" FORCE)
endif()
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")

@ -4,10 +4,10 @@ include(CheckCCompilerFlag)
include(CheckCXXSymbolExists)
include(CheckTypeSize)
function(CheckCompilerCXX11Flag)
function(CheckCompilerCXX14Flag)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4)
message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.")
elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2)
message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2")
endif()
@ -20,23 +20,15 @@ function(CheckCompilerCXX11Flag)
message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.")
endif()
else()
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4)
message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.")
endif()
endif()
endif()
endfunction()
CheckCompilerCXX11Flag()
if (WITH_GPU)
if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
endif()
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
endif()
CheckCompilerCXX14Flag()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
# safe_set_flag
#
# Set a compile flag only if the compiler supports it

@ -50,6 +50,7 @@ output_data := value.Interface().([][]float32)
Run
```bash
go mod init github.com/paddlepaddle
export LD_LIBRARY_PATH=`pwd`/paddle_c/paddle/lib:$LD_LIBRARY_PATH
go run ./demo/mobilenet.go
```
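The added export is needed because the demo links against the Paddle C shared library under paddle_c/paddle/lib, which is not on the default loader path; without it, go run fails to load the library at startup.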

@ -13,7 +13,7 @@
// limitations under the License.
package main
import "../paddle"
import "github.com/paddlepaddle/paddle"
import "strings"
import "io/ioutil"
import "strconv"

@ -15,7 +15,7 @@
package paddle
// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
// #include <stdbool.h>
// #include <paddle_c_api.h>
import "C"

@ -15,7 +15,7 @@
package paddle
// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
// #include <stdbool.h>
// #include <stdlib.h>
// #include <paddle_c_api.h>

@ -15,7 +15,7 @@
package paddle
// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
// #include <stdbool.h>
// #include "paddle_c_api.h"
import "C"
@ -88,7 +88,7 @@ func (predictor *Predictor) GetInputNames() []string {
}
func (predictor *Predictor) GetOutputNames() []string {
names := make([]string, predictor.GetInputNum())
names := make([]string, predictor.GetOutputNum())
for i := 0; i < len(names); i++ {
names[i] = predictor.GetOutputName(i)
}
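The slice was previously sized by GetInputNum(), so any model whose input and output counts differ would get a wrongly sized name list; sizing by GetOutputNum() matches the loop over output names.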

@ -15,7 +15,7 @@
package paddle
// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
// #include <stdbool.h>
// #include <stdlib.h>
// #include <string.h>
@ -209,7 +209,7 @@ func DecodeTensor(r *bytes.Reader, shape []int32, t reflect.Type, ptr reflect.Va
value := reflect.Indirect(ptr)
value.Set(reflect.MakeSlice(t, int(shape[0]), int(shape[0])))
if len(shape) == 1 && value.Len() > 0 {
switch value.Index(1).Kind() {
switch value.Index(0).Kind() {
case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32:
binary.Read(r, Endian(), value.Interface())
return
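Indexing element 1 panics for length-1 slices; element 0 is always valid here since value.Len() > 0 has already been checked.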

@ -52,6 +52,9 @@ class PD_DLL_DECL Tensor {
/// \brief Construct a Tensor on target Place for CustomOp.
/// Generally it's only used for users to create a Tensor.
explicit Tensor(const PlaceType& place);
/// \brief Construct a Tensor on target Place with shape for CustomOp.
/// Generally it's only used for users to create a Tensor.
Tensor(const PlaceType& place, const std::vector<int64_t>& shape);
/// \brief Reset the shape of the tensor.
/// Generally it's only used for the input tensor.
/// Reshape must be called before calling

@ -102,13 +102,32 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
void Tensor::reshape(const std::vector<int64_t> &shape) {
GET_CASTED_TENSOR
tensor->Resize(framework::make_ddim(shape));
auto new_dim = framework::make_ddim(shape);
if (tensor->numel() != framework::product(new_dim)) {
LOG(WARNING) << "Custom Op: Calling reshape to a shape that is bigger or "
"smaller than the original shape will not change your tensor's memory. "
<< "Please call paddle::Tensor::mutable_data<T>() afterwards to "
"reallocate your tensor's size."
<< std::endl;
}
tensor->Resize(new_dim);
}
Tensor::Tensor(const PlaceType &place)
: tensor_(std::make_shared<framework::LoDTensor>()),
place_(place),
stream_(StreamWrapper()) {}
Tensor::Tensor(const PlaceType &place, const std::vector<int64_t> &shape)
: tensor_(std::make_shared<framework::LoDTensor>()),
place_(place),
stream_(StreamWrapper()) {
GET_CASTED_TENSOR
tensor->Resize(framework::make_ddim(shape));
}
template <typename T>
T *Tensor::mutable_data(const PlaceType &place) {
place_ = place;

@ -360,46 +360,11 @@ set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_prot
cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
# Old custom op extension mechanism related, will be removed in 2.1.0
cc_library(paddle_framework_shared
SHARED SRCS executor.cc operator.cc
${CMAKE_CURRENT_SOURCE_DIR}/c/c_api.cc
${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc
DEPS ${FLUID_FRAMEWORK_MODULES})
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
set_target_properties(paddle_framework_shared PROPERTIES OUTPUT_NAME paddle_framework)
target_link_libraries(paddle_framework_shared ${os_dependency_modules})
if (LINUX)
set(FLUID_FRAMEWORK_SHARED_LIB
${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.so
CACHE INTERNAL "Fluid framework lib")
endif()
if (WIN32)
if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR})
else()
set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE})
endif()
set(FLUID_FRAMEWORK_IMPORT_LIB
${paddle_framework_lib_path}/paddle_framework.lib
CACHE INTERNAL "Fluid framework lib")
set(FLUID_FRAMEWORK_SHARED_LIB
${paddle_framework_lib_path}/paddle_framework.dll
CACHE INTERNAL "Fluid framework dll")
endif()
if(APPLE)
set(FLUID_FRAMEWORK_SHARED_LIB
${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dylib
CACHE INTERNAL "Fluid framework lib")
endif()
if(WITH_TESTING AND TEST selected_rows_test)
set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120)
endif()
# New custom op extension mechanism related
##### 2.0 New custom op extension mechanism related #####
# if `layer` is not listed in deps, linking fails with: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_
set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer)

@ -1,53 +0,0 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/c/c_api.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
extern "C" {
paddle::framework::OpInfoMap &PD_GetOpInfoMap() {
return paddle::framework::OpInfoMap::Instance();
}
void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool) {
paddle::platform::DeviceContextPool::SetPool(pool);
}
std::vector<std::string> PD_GetGradOpDescStrs(
const paddle::framework::OpDesc &op_desc,
const std::unordered_set<std::string> &no_grad_set,
std::unordered_map<std::string, std::string> *grad_to_var,
const std::vector<paddle::framework::BlockDesc *> &grad_block) {
auto &op_info = PD_GetOpInfoMap().Get(op_desc.Type());
std::vector<std::string> ret;
if (op_info.grad_op_maker_) {
auto grad_op_descs =
op_info.grad_op_maker_(op_desc, no_grad_set, grad_to_var, grad_block);
size_t op_num = grad_op_descs.size();
ret.resize(op_num);
for (size_t i = 0; i < op_num; ++i) {
PADDLE_ENFORCE_EQ(
grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true,
paddle::platform::errors::Unavailable(
"Cannot serialize operator desc message."));
}
}
return ret;
}
} // end extern "C"

@ -1,55 +0,0 @@
/* copyright (c) 2019 paddlepaddle authors. all rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class OpInfoMap;
} // namespace framework
namespace platform {
class DeviceContextPool;
} // namespace platform
} // namespace paddle
#ifdef __cplusplus
extern "C" {
#endif
// C-API to get global OpInfo map.
paddle::framework::OpInfoMap &PD_GetOpInfoMap();
// C-API to init global DeviceContextPool from outside.
void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool);
// C-API to serialize the grad op protocol message to a binary string.
std::vector<std::string> PD_GetGradOpDescStrs(
const paddle::framework::OpDesc &op_desc,
const std::unordered_set<std::string> &no_grad_set,
std::unordered_map<std::string, std::string> *grad_to_var,
const std::vector<paddle::framework::BlockDesc *> &grad_block);
#ifdef __cplusplus
}
#endif

@ -28,7 +28,6 @@ limitations under the License. */
#include "paddle/fluid/extension/include/ext_tensor.h"
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/custom_tensor_utils.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/op_meta_info_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"

@ -37,7 +37,7 @@ class CustomTensorUtils {
/// \brief Share data FROM another tensor.
/// Use this to pass tensor from op to op
/// \return void.
static void ShareDataFrom(const void* src, const Tensor& dst);
static void ShareDataFrom(const void* src, const paddle::Tensor& dst);
static framework::proto::VarType::Type ConvertEnumDTypeToInnerDType(
const paddle::DataType& dtype) {

@ -18,7 +18,6 @@
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/operator.h"

@ -34,15 +34,19 @@ namespace patterns {
static PDNode* create_emb_vars(PDPattern* pattern, const std::string& name,
const std::string& arg,
bool is_persist = false) {
std::unordered_set<std::string> embedding_ops{"lookup_table",
"lookup_table_v2"};
PDNode* node =
pattern->NewNode(name)->assert_is_op_input("lookup_table", arg);
pattern->NewNode(name)->assert_is_ops_input(embedding_ops, arg);
if (is_persist) return node->assert_is_persistable_var();
return node;
}
static PDNode* create_emb_out_vars(PDPattern* pattern, const std::string& name,
const std::string& arg) {
std::unordered_set<std::string> embedding_ops{"lookup_table",
"lookup_table_v2"};
PDNode* node = pattern->NewNode(name)
->assert_is_only_output_of_op("lookup_table")
->assert_is_only_output_of_ops(embedding_ops)
->assert_is_op_input("elementwise_add", arg)
->AsIntermediate();
return node;
@ -56,10 +60,12 @@ void Embedding2Eltwise1Pattern::operator()() {
create_emb_vars(pattern, lookup_table1_w_repr(), "W", true);
auto* lookup_table2_w =
create_emb_vars(pattern, lookup_table2_w_repr(), "W", true);
std::unordered_set<std::string> embedding_ops{"lookup_table",
"lookup_table_v2"};
auto* lookup_table1 =
pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table");
pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops);
auto* lookup_table2 =
pattern->NewNode(lookup_table2_repr())->assert_is_op("lookup_table");
pattern->NewNode(lookup_table2_repr())->assert_is_ops(embedding_ops);
auto* lookup_table1_out =
create_emb_out_vars(pattern, lookup_table1_out_repr(), "X");
auto* lookup_table2_out =
@ -80,8 +86,10 @@ void Embedding1Eltwise1Pattern::operator()() {
create_emb_vars(pattern, lookup_table1_x_repr(), "Ids");
auto* lookup_table1_w =
create_emb_vars(pattern, lookup_table1_w_repr(), "W", true);
std::unordered_set<std::string> embedding_ops{"lookup_table",
"lookup_table_v2"};
auto* lookup_table1 =
pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table");
pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops);
auto* lookup_table1_out =
create_emb_out_vars(pattern, lookup_table1_out_repr(), "Y");
auto* eltwise_add =
@ -347,4 +355,5 @@ REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("lookup_table", 0)
.LE("lookup_table_v2", 1)
.EQ("elementweise_add", 0));

@ -652,6 +652,36 @@ PDNode *PDNode::assert_is_ops_input(
return this;
}
PDNode *PDNode::assert_is_only_input_of_ops(
const std::unordered_set<std::string> &op_types) {
assert_is_var();
asserts_.emplace_back([=](Node *x) {
for (auto *op : x->outputs) {
if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) &&
op->inputs.size() == 1) {
return true;
}
}
return false;
});
return this;
}
PDNode *PDNode::assert_is_only_output_of_ops(
const std::unordered_set<std::string> &op_types) {
assert_is_var();
asserts_.emplace_back([=](Node *x) {
for (auto *op : x->inputs) {
if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) &&
op->outputs.size() == 1) {
return true;
}
}
return false;
});
return this;
}
bool VarLinksToOp(Node *node, const std::string &op_type) {
for (auto *out : node->outputs) {
if (out->IsOp() && out->Op()->Type() == op_type) {
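A hedged sketch of how a fuse pass can use the new ops-set assert (mirroring the embedding_eltwise_layernorm pattern above; `pattern` is assumed to be a PDPattern*):

```cpp
// Match a variable that is the sole output of either embedding op and
// feeds elementwise_add as its input "X".
std::unordered_set<std::string> embedding_ops{"lookup_table",
                                              "lookup_table_v2"};
PDNode* emb_out = pattern->NewNode("emb_out")
                      ->assert_is_only_output_of_ops(embedding_ops)
                      ->assert_is_op_input("elementwise_add", "X")
                      ->AsIntermediate();
```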

@ -28,7 +28,6 @@
#include <utility>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/inference/analysis/dot.h"
@ -146,6 +145,11 @@ struct PDNode {
const std::unordered_set<std::string>& op_types,
const std::string& argument, int nth);
PDNode* assert_is_only_input_of_ops(
const std::unordered_set<std::string>& op_types);
PDNode* assert_is_only_output_of_ops(
const std::unordered_set<std::string>& op_types);
PDNode* assert_has_n_inputs(size_t n);
PDNode* assert_has_n_outputs(size_t n);

@ -14,7 +14,6 @@
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h"
#include "paddle/fluid/framework/op_version_registry.h"

@ -17,7 +17,6 @@
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h"
#include "paddle/fluid/framework/ir/pass_test_util.h"
#include "paddle/fluid/framework/naive_executor.h"

Some files were not shown because too many files have changed in this diff.
