[ROCM] Add ROCm support for warpctc op (#31817 ) (#31971 )

* bugfix for warpctc * fix warpctc commit id * fix warpctc commit id * fix warpctc commit id * fix warpctc commit id * fix warpctc commit id * fix WARPCTC_WITH_HIP invalid * Add logs to find out why can not dlopen libwarpctc.so * fix warpctc commit id * fix unit test test_warpctc_op * Optime failed log for dlopen * Optime failed log for dlopen * Delete extra changes * fix warpctc commit id * fix warpctc commit id * Add is_compiled_with_rocm for test_warpctc_op * fix warpctc commit id * Cancel optimize dlopen failed reason, move to next pr, due to it makes windows ci failed * Cancel optimize dlopen failed reason, move to next pr, due to it makes windows ci failed * Cancel optimize dlopen failed reason, move to next pr, due to it makes windows ci failed * fix code style problems
[ROCM] added a cudnn switch of conv2d for rocm platform (#31836 ) (#31932 )
179 changed files with 2779 additions and 5993 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License

-cmake_minimum_required(VERSION 3.15)
-cmake_policy(VERSION 3.10)
+cmake_minimum_required(VERSION 3.10)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
@ -39,6 +38,11 @@ endif()
 if (WITH_GPU  AND WITH_ASCEND)
    message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
 endif()
+# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
+if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
+    message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
+       "You can use cmake 3.16 (recommended), 3.10, 3.11, 3.15 or 3.17. Please refer to the install document: https://cmake.org/install/")
+endif()

 if(WITH_GPU AND NOT APPLE)
    enable_language(CUDA)
@ -57,6 +61,7 @@ if(WITH_MUSL)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
 endif()

+#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zm1000 /fp:fast")

 if(WIN32)
    option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@ -74,7 +74,7 @@ endfunction()
 #   select_nvcc_arch_flags(out_variable)
 function(select_nvcc_arch_flags out_variable)
  # List of arch names
-  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "All" "Manual")
+  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
  set(archs_name_default "Auto")
  list(APPEND archs_names "Auto")

@ -108,8 +108,6 @@ function(select_nvcc_arch_flags out_variable)
    set(cuda_arch_bin "70")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
    set(cuda_arch_bin "75")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
-    set(cuda_arch_bin "80")
  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
    set(cuda_arch_bin ${paddle_known_gpu_archs})
  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@ -208,11 +206,14 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}")
 message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}")

-# Set C++14 support
+# Set C++11 support
 set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
-set(CMAKE_CUDA_STANDARD 14)
+# MSVC (2015) supports C++11 natively and does not recognize -std=c++11 -fPIC; cmake adds -Xcompiler itself.
+if(NOT WIN32)
+  set(CMAKE_CUDA_STANDARD 11)
+endif()

 # (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w
 # So replace /W[1-4] with /W0
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@ -94,7 +94,7 @@ macro(find_cudnn_version cudnn_header_file)
                "${CUDNN_MAJOR_VERSION} * 1000 +
                 ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
            message(STATUS "Current cuDNN header is ${cudnn_header_file} "
-              "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. ")
+              "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ")
        endif()
    endif()
 endmacro()
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@ -13,7 +13,7 @@ if(NOT XPU_SDK_ROOT)
  elseif(WITH_SUNWAY)
      SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
  else()
-      SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_03_30.tar.gz" CACHE STRING "" FORCE)
+      SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_02_27.tar.gz" CACHE STRING "" FORCE)
  endif()

  SET(XPU_SOURCE_DIR              "${THIRD_PARTY_PATH}/xpu")
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@ -4,10 +4,10 @@ include(CheckCCompilerFlag)
 include(CheckCXXSymbolExists)
 include(CheckTypeSize)

-function(CheckCompilerCXX14Flag)
+function(CheckCompilerCXX11Flag)
    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4)
-            message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.")
+        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
+            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
        elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2)
            message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2")
        endif()
@ -20,15 +20,23 @@ function(CheckCompilerCXX14Flag)
                message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.")
            endif()
        else()
-            if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4)
-                message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.")
+            if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
+                message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
            endif()
        endif()
    endif()
 endfunction()

-CheckCompilerCXX14Flag()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+CheckCompilerCXX11Flag()
+# Choose the host C++ standard: C++14 when building for GPU with a CUDA
+# compiler of version 11.0 or newer, C++11 in every other configuration.
+# CMAKE_CUDA_COMPILER_VERSION is a dotted version string, so it is compared
+# with VERSION_GREATER_EQUAL and quoted in case it is empty.
+if (WITH_GPU AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_GREATER_EQUAL 11.0)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+endif()
 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
--- a/go/README_cn.md
+++ b/go/README_cn.md
@ -50,7 +50,6 @@ output_data := value.Interface().([][]float32)

 运行
 ```bash
-go mod init github.com/paddlepaddle
 export LD_LIBRARY_PATH=`pwd`/paddle_c/paddle/lib:$LD_LIBRARY_PATH
 go run ./demo/mobilenet.go
 ```
--- a/go/demo/mobilenet.go
+++ b/go/demo/mobilenet.go
@ -13,7 +13,7 @@
 // limitations under the License.
 package main

-import "github.com/paddlepaddle/paddle"
+import "../paddle"
 import "strings"
 import "io/ioutil"
 import "strconv"
--- a/go/paddle/common.go
+++ b/go/paddle/common.go
@ -15,7 +15,7 @@
 package paddle

 // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
+// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
 // #include <stdbool.h>
 // #include <paddle_c_api.h>
 import "C"
--- a/go/paddle/config.go
+++ b/go/paddle/config.go
@ -15,7 +15,7 @@
 package paddle

 // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
+// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
 // #include <stdbool.h>
 // #include <stdlib.h>
 // #include <paddle_c_api.h>
--- a/go/paddle/predictor.go
+++ b/go/paddle/predictor.go
@ -15,7 +15,7 @@
 package paddle

 // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
+// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
 // #include <stdbool.h>
 // #include "paddle_c_api.h"
 import "C"
@ -88,7 +88,7 @@ func (predictor *Predictor) GetInputNames() []string {
 }

 func (predictor *Predictor) GetOutputNames() []string {
-	names := make([]string, predictor.GetOutputNum())
+	names := make([]string, predictor.GetOutputNum()) // one slot per model output, not per input
 	for i := 0; i < len(names); i++ {
 		names[i] = predictor.GetOutputName(i)
 	}
--- a/go/paddle/tensor.go
+++ b/go/paddle/tensor.go
@ -15,7 +15,7 @@
 package paddle

 // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
+// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
 // #include <stdbool.h>
 // #include <stdlib.h>
 // #include <string.h>
@ -209,7 +209,7 @@ func DecodeTensor(r *bytes.Reader, shape []int32, t reflect.Type, ptr reflect.Va
 		value := reflect.Indirect(ptr)
 		value.Set(reflect.MakeSlice(t, int(shape[0]), int(shape[0])))
 		if len(shape) == 1 && value.Len() > 0 {
-			switch value.Index(0).Kind() {
+			switch value.Index(0).Kind() { // element 0 is guaranteed by Len() > 0; index 1 panics for a length-1 slice
 			case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32:
 				binary.Read(r, Endian(), value.Interface())
 				return
--- a/paddle/fluid/extension/include/ext_tensor.h
+++ b/paddle/fluid/extension/include/ext_tensor.h
@ -52,9 +52,6 @@ class PD_DLL_DECL Tensor {
  /// \brief Construct a Tensor on target Place for CustomOp.
  /// Generally it's only used for user to create Tensor.
  explicit Tensor(const PlaceType& place);
-  /// \brief Construct a Tensor on target Place with shape for CustomOp.
-  /// Generally it's only used for user to create Tensor.
-  Tensor(const PlaceType& place, const std::vector<int64_t>& shape);
  /// \brief Reset the shape of the tensor.
  /// Generally it's only used for the input tensor.
  /// Reshape must be called before calling
--- a/paddle/fluid/extension/src/ext_tensor.cc
+++ b/paddle/fluid/extension/src/ext_tensor.cc
@ -102,32 +102,13 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,

 void Tensor::reshape(const std::vector<int64_t> &shape) {
  GET_CASTED_TENSOR
-  auto new_dim = framework::make_ddim(shape);
-  if (tensor->numel() != framework::product(new_dim)) {
-    LOG(WARNING) << "Custom Op: Calling reshape to a new shape which is bigger "
-                    "or smaller"
-                 << "than original shape will not change your tensor's memory "
-                    "Please call"
-                 << "paddle::Tensor::mutable_data<T>() after to reallocate "
-                    "your tensor's size."
-                 << std::endl;
-  }
-  tensor->Resize(new_dim);
+  tensor->Resize(framework::make_ddim(shape));
 }

 Tensor::Tensor(const PlaceType &place)
    : tensor_(std::make_shared<framework::LoDTensor>()),
      place_(place),
      stream_(StreamWrapper()) {}
-
-Tensor::Tensor(const PlaceType &place, const std::vector<int64_t> &shape)
-    : tensor_(std::make_shared<framework::LoDTensor>()),
-      place_(place),
-      stream_(StreamWrapper()) {
-  GET_CASTED_TENSOR
-  tensor->Resize(framework::make_ddim(shape));
-}
-
 template <typename T>
 T *Tensor::mutable_data(const PlaceType &place) {
  place_ = place;
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -360,11 +360,46 @@ set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_prot

 cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})

+# Old custom op extension mechanism related, will be removed in 2.1.0
+cc_library(paddle_framework_shared
+    SHARED SRCS executor.cc operator.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/c/c_api.cc
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc  # repository root, independent of the outermost build root
+    DEPS ${FLUID_FRAMEWORK_MODULES})
+get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+set_target_properties(paddle_framework_shared PROPERTIES OUTPUT_NAME paddle_framework)
+target_link_libraries(paddle_framework_shared ${os_dependency_modules})
+
+if (LINUX)
+  set(FLUID_FRAMEWORK_SHARED_LIB
+      ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.so
+      CACHE INTERNAL "Fluid framework lib")
+endif()
+
+if (WIN32)
+  if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
+    set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR})
+  else()
+    set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE})
+  endif()
+  set(FLUID_FRAMEWORK_IMPORT_LIB
+      ${paddle_framework_lib_path}/paddle_framework.lib
+      CACHE INTERNAL "Fluid framework lib")
+  set(FLUID_FRAMEWORK_SHARED_LIB
+      ${paddle_framework_lib_path}/paddle_framework.dll
+      CACHE INTERNAL "Fluid framework dll")
+endif()
+
+if(APPLE)
+  set(FLUID_FRAMEWORK_SHARED_LIB
+      ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dylib
+      CACHE INTERNAL "Fluid framework lib")
+endif()
 if(WITH_TESTING AND TEST selected_rows_test)
  set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120)
 endif()

-##### 2.0 New custom op extension mechanism related #####
+# New custom op extension mechanism related

 # if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_
 set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer)
--- a/paddle/fluid/framework/c/c_api.cc
+++ b/paddle/fluid/framework/c/c_api.cc
@ -0,0 +1,53 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/c/c_api.h"
+
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
+extern "C" {
+
+paddle::framework::OpInfoMap &PD_GetOpInfoMap() {
+  return paddle::framework::OpInfoMap::Instance();
+}
+
+void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool) {
+  paddle::platform::DeviceContextPool::SetPool(pool);
+}
+
+std::vector<std::string> PD_GetGradOpDescStrs(
+    const paddle::framework::OpDesc &op_desc,
+    const std::unordered_set<std::string> &no_grad_set,
+    std::unordered_map<std::string, std::string> *grad_to_var,
+    const std::vector<paddle::framework::BlockDesc *> &grad_block) {
+  auto &op_info = PD_GetOpInfoMap().Get(op_desc.Type());
+  std::vector<std::string> ret;
+  if (op_info.grad_op_maker_) {
+    auto grad_op_descs =
+        op_info.grad_op_maker_(op_desc, no_grad_set, grad_to_var, grad_block);
+    size_t op_num = grad_op_descs.size();
+    ret.resize(op_num);
+    for (size_t i = 0; i < op_num; ++i) {
+      PADDLE_ENFORCE_EQ(
+          grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true,
+          paddle::platform::errors::Unavailable(
+              "Cannot serialize operator desc message."));
+    }
+  }
+  return ret;
+}
+
+}  // end extern "C"
--- a/paddle/fluid/framework/c/c_api.h
+++ b/paddle/fluid/framework/c/c_api.h
@ -0,0 +1,55 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+class OpInfoMap;
+}  // namespace framework
+namespace platform {
+class DeviceContextPool;
+}  // namespace platform
+}  // namespace paddle
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// C-API to get global OpInfo map.
+paddle::framework::OpInfoMap &PD_GetOpInfoMap();
+
+// C-API to init global DeviceContextPool from outside.
+void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool);
+
+// C-API to serialize the grad op protocol message to a binary string.
+std::vector<std::string> PD_GetGradOpDescStrs(
+    const paddle::framework::OpDesc &op_desc,
+    const std::unordered_set<std::string> &no_grad_set,
+    std::unordered_map<std::string, std::string> *grad_to_var,
+    const std::vector<paddle::framework::BlockDesc *> &grad_block);
+
+#ifdef __cplusplus
+}
+#endif
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@ -28,6 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/extension/include/ext_tensor.h"
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/custom_tensor_utils.h"
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/op_meta_info_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
--- a/paddle/fluid/framework/custom_tensor_utils.h
+++ b/paddle/fluid/framework/custom_tensor_utils.h
@ -37,7 +37,7 @@ class CustomTensorUtils {
  /// \brief Share data FROM another tensor.
  /// Use this to pass tensor from op to op
  /// \return void.
-  static void ShareDataFrom(const void* src, const paddle::Tensor& dst);
+  static void ShareDataFrom(const void* src, const Tensor& dst);

  static framework::proto::VarType::Type ConvertEnumDTypeToInnerDType(
      const paddle::DataType& dtype) {
--- a/paddle/fluid/framework/executor_gc_helper.cc
+++ b/paddle/fluid/framework/executor_gc_helper.cc
@ -18,6 +18,7 @@

 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/operator.h"
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
@ -34,19 +34,15 @@ namespace patterns {
 static PDNode* create_emb_vars(PDPattern* pattern, const std::string& name,
                               const std::string& arg,
                               bool is_persist = false) {
-  std::unordered_set<std::string> embedding_ops{"lookup_table",
-                                                "lookup_table_v2"};
  PDNode* node =
-      pattern->NewNode(name)->assert_is_ops_input(embedding_ops, arg);
+      pattern->NewNode(name)->assert_is_op_input("lookup_table", arg);
  if (is_persist) return node->assert_is_persistable_var();
  return node;
 }
 static PDNode* create_emb_out_vars(PDPattern* pattern, const std::string& name,
                                   const std::string& arg) {
-  std::unordered_set<std::string> embedding_ops{"lookup_table",
-                                                "lookup_table_v2"};
  PDNode* node = pattern->NewNode(name)
-                     ->assert_is_only_output_of_ops(embedding_ops)
+                     ->assert_is_only_output_of_op("lookup_table")
                     ->assert_is_op_input("elementwise_add", arg)
                     ->AsIntermediate();
  return node;
@ -60,12 +56,10 @@ void Embedding2Eltwise1Pattern::operator()() {
      create_emb_vars(pattern, lookup_table1_w_repr(), "W", true);
  auto* lookup_table2_w =
      create_emb_vars(pattern, lookup_table2_w_repr(), "W", true);
-  std::unordered_set<std::string> embedding_ops{"lookup_table",
-                                                "lookup_table_v2"};
  auto* lookup_table1 =
-      pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops);
+      pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table");
  auto* lookup_table2 =
-      pattern->NewNode(lookup_table2_repr())->assert_is_ops(embedding_ops);
+      pattern->NewNode(lookup_table2_repr())->assert_is_op("lookup_table");
  auto* lookup_table1_out =
      create_emb_out_vars(pattern, lookup_table1_out_repr(), "X");
  auto* lookup_table2_out =
@ -86,10 +80,8 @@ void Embedding1Eltwise1Pattern::operator()() {
      create_emb_vars(pattern, lookup_table1_x_repr(), "Ids");
  auto* lookup_table1_w =
      create_emb_vars(pattern, lookup_table1_w_repr(), "W", true);
-  std::unordered_set<std::string> embedding_ops{"lookup_table",
-                                                "lookup_table_v2"};
  auto* lookup_table1 =
-      pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops);
+      pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table");
  auto* lookup_table1_out =
      create_emb_out_vars(pattern, lookup_table1_out_repr(), "Y");
  auto* eltwise_add =
@ -355,5 +347,4 @@ REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass)
    .AddCombination(
        paddle::framework::compatible::OpVersionComparatorCombination()
            .EQ("lookup_table", 0)
-            .LE("lookup_table_v2", 1)
            .EQ("elementweise_add", 0));
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@ -652,36 +652,6 @@ PDNode *PDNode::assert_is_ops_input(
  return this;
 }

-PDNode *PDNode::assert_is_only_input_of_ops(
-    const std::unordered_set<std::string> &op_types) {
-  assert_is_var();
-  asserts_.emplace_back([=](Node *x) {
-    for (auto *op : x->outputs) {
-      if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) &&
-          op->inputs.size() == 1) {
-        return true;
-      }
-    }
-    return false;
-  });
-  return this;
-}
-
-PDNode *PDNode::assert_is_only_output_of_ops(
-    const std::unordered_set<std::string> &op_types) {
-  assert_is_var();
-  asserts_.emplace_back([=](Node *x) {
-    for (auto *op : x->inputs) {
-      if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) &&
-          op->outputs.size() == 1) {
-        return true;
-      }
-    }
-    return false;
-  });
-  return this;
-}
-
 bool VarLinksToOp(Node *node, const std::string &op_type) {
  for (auto *out : node->outputs) {
    if (out->IsOp() && out->Op()->Type() == op_type) {
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@ -28,6 +28,7 @@
 #include <utility>
 #include <vector>

+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/inference/analysis/dot.h"
@ -145,11 +146,6 @@ struct PDNode {
      const std::unordered_set<std::string>& op_types,
      const std::string& argument, int nth);

-  PDNode* assert_is_only_input_of_ops(
-      const std::unordered_set<std::string>& op_types);
-  PDNode* assert_is_only_output_of_ops(
-      const std::unordered_set<std::string>& op_types);
-
  PDNode* assert_has_n_inputs(size_t n);
  PDNode* assert_has_n_outputs(size_t n);

--- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc
@ -14,6 +14,7 @@

 #include <vector>

+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h"
 #include "paddle/fluid/framework/op_version_registry.h"
--- a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc
@ -17,6 +17,7 @@
 #include <vector>

 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h"
 #include "paddle/fluid/framework/ir/pass_test_util.h"
 #include "paddle/fluid/framework/naive_executor.h"
--- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc
+++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc
@ -57,7 +57,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const {
    std::vector<int64_t> y_shape = matmul_in_y->Var()->GetShape();
    size_t x_rank = x_shape.size();
    size_t y_rank = y_shape.size();
-    flag = flag && (x_rank == 2 || x_rank == 3) && y_rank == 2;
+    flag = flag && x_rank == 2 && y_rank == 2;

    std::vector<Node*>& next_ops = matmul_out->outputs;
    flag = flag && next_ops.size() == 1 &&
@ -69,7 +69,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const {
      desc.SetInput("X", {matmul_in_x->Name()});
      desc.SetInput("Y", {matmul_in_y->Name()});
      desc.SetOutput("Out", {matmul_out->Name()});
-      desc.SetAttr("x_num_col_dims", static_cast<int>(x_rank - 1));
+      desc.SetAttr("x_num_col_dims", 1);
      desc.SetAttr("y_num_col_dims", 1);
      if (matmul_op->Op()->HasAttr("enable_int8")) {
        desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8"));
--- a/Show More
+++ b/Show More