Compare commits

...

125 Commits

Author SHA1 Message Date
Leo Chen bc7a3afa68
【NPU】fix bug of using temp vector (#31963)
4 years ago
Leo Chen a6343afc70
[NPU] support npu for memcpy op (#31808)
4 years ago
An Improved PeleeNet Algorithm with Feature Pyramid Networks for Image Detection 3ab39705ea
adapter npu (#31926)
4 years ago
Leo Chen ac89174e5a
[NPU] support GarbageCollector for npu (#31874)
4 years ago
Leo Chen 3c66b8721a
[NPU] add npu kernel for truncated_gaussian_random op (#31654)
4 years ago
liym27 4a823c5f63
[NPU] support fp16 of input for api pow (#31871)
4 years ago
Leo Chen f354e1d6d5
[NPU] fix some op bugs (#31855)
4 years ago
zhang wenhui 9754d0a7cb
【NPU】Add int dtype kernel for reshape2 op (#31864)
4 years ago
Leo Chen 48aa92234d
[NPU] support npu for conditional_block op (#31854)
4 years ago
Leo Chen a93488839d
fix compile problem (#31850)
4 years ago
pangyoki 0279486b02
【NPU】Support npu kernel for update_loss_scaling op (#31830)
4 years ago
pangyoki b2407af6e3
[NPU] support mixed precision input for npu layer norm (#31847)
4 years ago
Leo Chen d1a4c53eee
[NPU] support default stream (#31510)
4 years ago
Leo Chen fead563156
[NPU] fix bug of lookup_table_v2_grad (#31834)
4 years ago
liym27 149f76e636
[NPU] Support npu kernel for op elementwise_floordiv (#31822)
4 years ago
Leo Chen b3446670c1
[NPU] add npu kernel for concat op (#31695)
4 years ago
Leo Chen 03803f20fd
[NPU] support list of tensor input (#31801)
4 years ago
Leo Chen 6350528220
[NPU] support fp16 for npu accuracy op (#31797)
4 years ago
lw921014 c594f57685
add c_reduce_sum op (#31793)
4 years ago
lilong12 228bce12c8
Add 3d parallelism (#31796)
4 years ago
zhang wenhui 594bbcb189
【NPU】Fix reshape test & add grad test (#31776)
4 years ago
xiayanming fba994c28b
[NPU] fix the grad kernel diff bug of gather op (#31757)
4 years ago
oyjxer 02912ce7f2
【NPU】Fix npu kernel elementwise_div_grad (#31753)
4 years ago
Leo Chen e6af7c0dd8
[NPU] fix some bugs of npu op (#31739)
4 years ago
OleNet 17862b725f
[NPU] Support mean npu kernel (#31729)
4 years ago
Leo Chen 342252c902
[NPU] change transpose to transpose2 (#31734)
4 years ago
Void Main 7b450e7889
Add auto-increasing tag id for Hcom OPs (#31702)
4 years ago
Leo Chen 50bc11621f
[NPU] fix reshape npu op kernel (#31726)
4 years ago
liym27 c8729f2aec
[NPU] Remove redundant ctest of top_k_op_npu_test (#31718)
4 years ago
An Improved PeleeNet Algorithm with Feature Pyramid Networks for Image Detection f8e1f452c4
ascend_communicate (#31708)
4 years ago
oyxuan-11 faf40da585
[NPU] Support NPU kernel of stack op (#31711)
4 years ago
liym27 d55120d77f
[NPU] Support testing grad of NPU ops in OpTest (#31697)
4 years ago
liym27 e424712073
[NPU] Fix bug: Fix calculation errors of pow grad npu kernel (#31699)
4 years ago
OleNet 7ec8459c6c
[NPU] Support softmax npu kernel (#31564)
4 years ago
Meiyim 7875bcb8f7
[NPU] npu support `transpose` (#31486)
4 years ago
oyxuan-11 125201ee56
[NPU] Support NPU kernel sum op (#31671)
4 years ago
OleNet ef15544ee0
[NPU] add NPU add topk (#31596)
4 years ago
OleNet 743cc9b29b
[NPU] add Increment op (#31563)
4 years ago
Leo Chen 1de6daff82
[NPU] fix shape of dx in mul_grad (#31675)
4 years ago
Meiyim 3dd992e24f
[NPU] Support npu op `expand` (#31405)
4 years ago
pangyoki 444c285202
【NPU】Add TensorCopy to NPU kernel for reduce_sum op (#31667)
4 years ago
Leo Chen 8f08f160c6
Revert "[NPU] add npu kernel for mean Op (#31562)" (#31665)
4 years ago
OleNet 468ac6993b
[NPU] add npu kernel for mean Op (#31562)
4 years ago
Leo Chen 5118968d80
[NPU] add npu kernel for softmax_with_cross_entropy (#31656)
4 years ago
zhang wenhui 925432d85e
【NPU】Support npu kernel for mul op (#31584)
4 years ago
Leo Chen 1e956001ec
[NPU] add npu kernel for adam (#31644)
4 years ago
pangyoki 795b0f92d3
【NPU】Support NPU kernel for reduce_sum op v2 (#31620)
4 years ago
Leo Chen b541ca8795
[NPU] add npu kernel for sgd (#31639)
4 years ago
oyxuan-11 57220f594d
[NPU] Support NPU kernel cast op (#31635)
4 years ago
Leo Chen 3ca4bc1004
[NPU] fix allocator min chunk size (#31632)
4 years ago
oyxuan-11 11f788771c
[NPU] Support npu kernel scatter op (#31624)
4 years ago
Meiyim e3e15792a4
[NPU] support npu kernel for `less_than` (#31327)
4 years ago
Meiyim a3cc4a4a69
[NPU] Support npu op table_lookup_v2 and table_lookup_v2_grad (#31399)
4 years ago
oyjxer f250416029
[NPU] Support npu op elementwise_pow (#31576)
4 years ago
oyjxer 7241bc2210
[NPU] Support npu op elementwise_min (#31575)
4 years ago
oyjxer 9606a86b18
[NPU] Support npu op logicalnot_op (#31534)
4 years ago
oyjxer 47860ce20d
[NPU] Support npu op log, log_grad, sqrt, sqrt_grad, square, tanh and tanh_grad (#31600)
4 years ago
oyjxer de65486c19
【NPU】Support npu op elementwise_div and elementwise_div_grad (#31573)
4 years ago
OleNet ec2160a622
[NPU] add range op (#31560)
4 years ago
Leo Chen 0234693040
fix gather_grad bug (#31607)
4 years ago
Leo Chen 5e851bff42
[NPU] fix assgin cmake (#31595)
4 years ago
oyjxer 382fc31f89
【NPU】Support npu op gelu and gelu_grad (#31530)
4 years ago
oyjxer 5d29a27c2e
[NPU] fix npu op elementwise_mul_grad (#31592)
4 years ago
OleNet 09bf2cfc0e
[NPU] add Assign OP (#31561)
4 years ago
xiayanming f1fdddfdc8
[NPU] Support npu kernel for c sync stream op (#31386)
4 years ago
yinhaofeng e1c33a6d69
[NPU] accuracy op (#31492)
4 years ago
xiayanming 3bf8a34c69
[NPU] Support npu kernel for amp_check_finite_and_unscale_npu op (#31457)
4 years ago
xiayanming d746197398
[NPU] Support npu kernel for gather op fix bug (#31541)
4 years ago
zhang wenhui 5d22e15b6e
【NPU】Suppert npu kernel for reshape2 op (#31524)
4 years ago
zhang wenhui 581e5460a0
【NPU】add relu op for npu (#31515)
4 years ago
oyjxer cfeeb4bc95
[NPU] Support npu op elementwise_max (#31574)
4 years ago
oyjxer e15ccafb84
[NPU] Support npu op elementwise_mul and elementwise_mul_grad (#31571)
4 years ago
zhang wenhui 29d50d2049
【NPU】Support npu kernel for matmul op (#31544)
4 years ago
xiayanming f400ce9f51
[NPU] Support npu kernel for reduceany op (#31422)
4 years ago
zhang wenhui 7524ac9345
【NPU】support npu kernel for fill_constant op (#31521)
4 years ago
Leo Chen 3f206e97c4
Support TensorFormVector, TensorToVector of bool type (#31518)
4 years ago
zhang wenhui 9df84bd693
【NPU】add scale op for npu (#31499)
4 years ago
xiayanming e19195f795
Support npu kernel for gather op (#31458)
4 years ago
lw921014 15823bb0df
[NPU] add npu kernel for communication op (#31437)
4 years ago
Reventon_L 388c69f27d
[NPU] squeeze and unsqueeze op for ascend (#31452)
4 years ago
Leo Chen c956c035dc
fix cmake of cryptopp to avoid downloading every time (#31451)
4 years ago
Leo Chen 83f81eb573
Fix pow, refine code (#31440)
4 years ago
Leo Chen 5fe3d596e4
Fix pow, use fillD instead of broadcast (#31433)
4 years ago
zhang wenhui ecc6e213d7
fix endif (#31431)
4 years ago
zhang wenhui b3c88e961c
[NPU] Support npu kernel for shape op (#31427)
4 years ago
Leo Chen ac3d821bc0
[NPU] add npu kernel for equal op (#31393)
4 years ago
Leo Chen 0310945f5c
[NPU] Support npu op layer_norm and layer_norm_grad (#31310)
4 years ago
Void Main 45765d6eb6
Refactor HCCLCommContext to be compatible with Paddle (#31359)
4 years ago
Leo Chen 8497e2aad3
[NPU] add npu kernel for elementwise_add_grad (#31347)
4 years ago
lw921014 9fcdaeba5e
add allreduce and broadcast without test (#31024)
4 years ago
Leo Chen 5618f14047
fix reading flags from env (#31329)
4 years ago
liym27 a1ddff81e3
[NPU] Support npu op: (1) slice (2) slice_grad (#31275)
4 years ago
Leo Chen d23bf89cf6
support list of list attribute for NPU (#31299)
4 years ago
liym27 77a0c41cb2
Fix pow npu fp16 test (#31256)
4 years ago
liym27 187248f568
[NPU] Support npu op pow and pow grad (#31247)
4 years ago
xiayanming 821c2f4ef8
add ascend unittest (#31249)
4 years ago
Leo Chen d45f5d787e
Fix typo of selected_npus (#31230)
4 years ago
xiayanming 387c1db4f1
Ascendrc (#31065)
4 years ago
Leo Chen ff4654e216
refactor npu device manager (#31154)
4 years ago
liym27 1435b4c096
[NPU] Support executor with NPU (#31057)
4 years ago
Leo Chen 678a3e8fed
support adding correct npu op in pybind.h (#31143)
4 years ago
Leo Chen 85cbd55648
Fix compilation problem (#31100)
4 years ago
Leo Chen 5cb20f30fc
add npu kernel for elementwise_sub and elementwise_sub_grad (#30973)
4 years ago
gongweibao c687edecd8
Fix reshape on GE graph. (#31084)
4 years ago
xiayanming a6edbc478b
support parsing ascend rank table file (#31000)
4 years ago
Leo Chen 1201cd2ef2
[feature] support npu allocator, part 2 (#30972)
4 years ago
Leo Chen 7e049108c5
[feature] support npu operator (#30951)
4 years ago
Leo Chen 81138239db
[feature] support npu allocator (#30840)
4 years ago
gongweibao ebef6601d5
Destroy session first. (#30954)
4 years ago
Leo Chen 500f28ec37
pass cxx_flags to gloo cmake (#30857)
4 years ago
gongweibao de42d19336
Add paddle ascend distribution training supported (#30796)
4 years ago
OleNet ebb5d181e8
Ascendrc add converted op : [range/equal/range/uniform_random/expand/squeeze], fix cast op bug (#30797)
4 years ago
dingsiyu 4a26729540
Merge ascend_optimizer and ascend_parser. (#30776)
4 years ago
gongweibao 636fefd9f8
code style (#30781)
4 years ago
Leo Chen 88dfd067bf
Dev/fix ascend string (#30749)
4 years ago
Leo Chen 6eabbc8076
fix compilation on ascend-20.1 (#30722)
4 years ago
Void Main 904cc44349
[Feature] Build parser to support distributed training (#30658)
4 years ago
gongweibao 5b77b259d8
cleanup (#30646)
4 years ago
gongweibao 7158061a29
Add startup bash files of test_ascend_group. (#30645)
4 years ago
gongweibao e4287ca60b
Add Hccl program group (#30642)
4 years ago
gongweibao f5aca8fbb4
Pass device_ids info from launch to trainer. (#30632)
4 years ago
Void Main d2404da768
Build praser for Hcom* operators (#30627)
4 years ago
gongweibao f9c97dd728
Add distribution supported (#30578)
4 years ago
gongweibao 1882f2ce2d
Fix compilcation on CANN20.1 and older (#30494)
4 years ago
hutuxian 6dd52c5b25
Ascend rc (#30483)
4 years ago

@ -31,9 +31,17 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some acl related functionality on x86
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
endif()
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
@ -57,6 +65,10 @@ if(WITH_MUSL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
endif()
if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
if(WIN32)
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)

@ -78,6 +78,14 @@ if(WITH_BOX_PS)
add_definitions(-DPADDLE_WITH_BOX_PS)
endif()
if(WITH_ASCEND)
add_definitions(-DPADDLE_WITH_ASCEND)
endif()
if(WITH_ASCEND_CL)
add_definitions(-DPADDLE_WITH_ASCEND_CL)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)

@ -0,0 +1,84 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: The logic below is adapted from
# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt
# Locate the Ascend toolkit install prefix. The ASCEND_CUSTOM_PATH environment
# variable overrides the default location (/usr/local/Ascend).
if(DEFINED ENV{ASCEND_CUSTOM_PATH})
set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH})
else()
set(ASCEND_DIR /usr/local/Ascend)
endif()
# WITH_ASCEND: import the GE (graph engine) shared libraries from the
# ascend-toolkit tree and expose them as the `extern_ascend` target.
if(WITH_ASCEND)
# Driver-side library directories (not linked below; presumably kept for
# runtime LD_LIBRARY_PATH composition — TODO confirm).
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
# Atlas layout: the same components under ascend-toolkit/latest; the actual
# imported libraries below come from this tree.
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
# Newer toolkits ship graph/ascend_string.h; gate the string-API code path on
# its presence so older CANN versions still build.
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
# Wrap the prebuilt .so files as IMPORTED targets so the rest of the build
# can link them by name.
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
# Aggregate dependency target consumed by third_party.cmake.
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
endif()
# WITH_ASCEND_CL: import the ACL (Ascend Computing Language) runtime, HCCL
# collective library and the op compiler, exposed as `extern_ascend_cl`.
if(WITH_ASCEND_CL)
set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}")
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR})
ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
# NOTE(review): ascend_hccl is imported above but not listed in the DEPENDS
# below — confirm whether that is intentional.
add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
endif()

@ -53,6 +53,7 @@ ExternalProject_Add(
"${CRYPTOPP_DOWNLOAD_CMD}"
PREFIX ${CRYPTOPP_PREFIX_DIR}
SOURCE_DIR ${CRYPTOPP_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND
COMMAND ${CMAKE_COMMAND} -E remove_directory "<SOURCE_DIR>/cmake/"
COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "<SOURCE_DIR>/cmake"

@ -42,7 +42,7 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. && make
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make
&& mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"

@ -198,8 +198,19 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
"-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}")
ENDIF()
SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
else()
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
# SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
# SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
endif()
cache_third_party(${TARGET_NAME}
REPOSITORY ${PROTOBUF_REPOSITORY}
@ -234,7 +245,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
SET(PROTOBUF_VERSION 3.1.0)
SET(PROTOBUF_VERSION 3.8.0)
IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE)

@ -16,7 +16,7 @@ INCLUDE(ExternalProject)
SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool)
SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool)
SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git)
SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040)
cache_third_party(extern_threadpool

@ -17,8 +17,9 @@ INCLUDE(ExternalProject)
SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
@ -52,7 +53,7 @@ ExternalProject_Add(
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}

@ -151,6 +151,8 @@ set(COMMON_FLAGS
-Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
-Wimplicit-fallthrough=0 # Warning in tinyformat.h
-Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
-Wno-error=nonnull-compare # Warning in boost gcc 7.2
-Wno-error=address # Warning in boost gcc 7.2
${fsanitize}
)

@ -440,9 +440,20 @@ function(cc_test TARGET_NAME)
cc_test_build(${TARGET_NAME}
SRCS ${cc_test_SRCS}
DEPS ${cc_test_DEPS})
cc_test_run(${TARGET_NAME}
COMMAND ${TARGET_NAME}
ARGS ${cc_test_ARGS})
# we don't test hcom ops, because they need a complex configuration
# with more than one machine
if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test"))
cc_test_run(${TARGET_NAME}
COMMAND ${TARGET_NAME}
ARGS ${cc_test_ARGS})
endif()
endif()
endfunction(cc_test)
@ -859,7 +870,7 @@ function(py_test TARGET_NAME)
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
if (WIN32)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
endif()

@ -11,12 +11,16 @@ function(op_library TARGET)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(xpu_cc_srcs)
set(npu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(cudnn_cu_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function layer common_infer_shape_functions)
if (WITH_ASCEND_CL)
set(op_common_deps ${op_common_deps} npu_op_runner)
endif()
# Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
set(options UNITY)
set(oneValueArgs "")
@ -84,6 +88,12 @@ function(op_library TARGET)
list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
endif()
endif()
if(WITH_ASCEND_CL)
string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (WITH_ROCM_PLATFORM AND ${src} MATCHES ".*\\.hip.cu$")
@ -106,6 +116,8 @@ function(op_library TARGET)
list(APPEND cu_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
list(APPEND npu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
@ -170,7 +182,7 @@ function(op_library TARGET)
# Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
if(WITH_UNITY_BUILD AND op_library_UNITY)
# Combine the cc source files.
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs})
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs})
if(TARGET ${UNITY_TARGET})
# If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
@ -181,7 +193,7 @@ function(op_library TARGET)
# Add alias library to handle dependencies.
add_library(${TARGET} ALIAS ${UNITY_TARGET})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
endif()
@ -201,6 +213,7 @@ function(op_library TARGET)
# The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to just adding one operator to pybind in a *_op.cc file.
# And for detail pybind information, please see generated paddle/pybind/pybind.h.
set(ORIGINAL_TARGET ${TARGET})
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
# [ \t\r\n]* is used for blank characters
@ -230,10 +243,11 @@ function(op_library TARGET)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH npu_cc_srcs npu_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
@ -273,6 +287,26 @@ function(op_library TARGET)
if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
endif()
if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT)
# It is different from the logic above, so be careful
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\(.*" multi_npu_register "${TARGET_NPU_CONTENT}")
# [ \t\r\n]* is used for blank characters
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_npu_register "${multi_npu_register}")
if (one_npu_register STREQUAL "")
string(REPLACE "_op" "" NPU_TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OP_NPU_KERNEL(" "" NPU_TARGET "${one_npu_register}")
string(REPLACE "," "" NPU_TARGET "${NPU_TARGET}")
# [ \t\r\n]+ is used for blank characters.
# Here we use '+' instead of '*' since it is a REPLACE operation.
string(REGEX REPLACE "[ \t\r\n]+" "" NPU_TARGET "${NPU_TARGET}")
endif()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
@ -323,6 +357,7 @@ function(register_operators)
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE "_xpu" "" OPS "${OPS}")
string(REPLACE "_npu" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)

@ -274,6 +274,16 @@ if(WITH_BOX_PS)
list(APPEND third_party_deps extern_box_ps)
endif(WITH_BOX_PS)
if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend)
if(WITH_ASCEND)
list(APPEND third_party_deps extern_ascend)
endif()
if(WITH_ASCEND_CL)
list(APPEND third_party_deps extern_ascend_cl)
endif()
endif ()
if (WITH_PSCORE)
include(external/snappy)
list(APPEND third_party_deps extern_snappy)

@ -28,6 +28,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
@ -451,7 +452,7 @@ class HeterBoxWorker : public HogwildWorker {
virtual void CacheProgram(const ProgramDesc& main_program) {
new (&program_) ProgramDesc(main_program);
}
virtual void ProduceTasks() override;
void ProduceTasks() override;
virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
virtual void TrainFilesWithProfiler() {}
@ -550,7 +551,7 @@ class PSGPUWorker : public HogwildWorker {
virtual void CacheProgram(const ProgramDesc& main_program) {
new (&program_) ProgramDesc(main_program);
}
virtual void ProduceTasks() override;
void ProduceTasks() override;
virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
virtual void TrainFilesWithProfiler() {}
@ -633,7 +634,7 @@ class PSGPUWorker : public HogwildWorker {
};
#endif
#if defined(PADDLE_WITH_NCCL)
#if (defined PADDLE_WITH_NCCL) || (defined WITH_ASCEND_CL)
class SectionWorker : public DeviceWorker {
public:
SectionWorker() {}
@ -654,6 +655,9 @@ class SectionWorker : public DeviceWorker {
void SetDeviceIndex(int tid) override {}
void SetThreadIndex(int thread_id) { thread_id_ = thread_id; }
void SetMicrobatchNum(int num) { num_microbatches_ = num; }
void SetPipelineStageNum(int num) { num_pipeline_stages_ = num; }
void SetPipelineStage(int stage) { pipeline_stage_ = stage; }
void SetScheduleMode(int mode) { schedule_mode_ = mode; }
void SetMicrobatchScopes(const std::vector<Scope*>& scope) {
microbatch_scopes_ = scope;
}
@ -661,11 +665,23 @@ class SectionWorker : public DeviceWorker {
void SetSkipVars(const std::vector<std::string>& skip_vars) {
skip_vars_ = skip_vars;
}
void RunBackward(
int micro_id, std::unique_ptr<GarbageCollector>&,
std::unordered_map<const OperatorBase*, std::vector<std::string>>&);
void RunForward(
int micro_id, std::unique_ptr<GarbageCollector>&,
std::unordered_map<const OperatorBase*, std::vector<std::string>>&);
void RunUpdate(
std::unique_ptr<GarbageCollector>&,
std::unordered_map<const OperatorBase*, std::vector<std::string>>&);
protected:
int section_id_;
int thread_id_;
int num_microbatches_;
int num_pipeline_stages_;
int pipeline_stage_;
int schedule_mode_; // 0 for GPipe and 1 for deepspeed
std::vector<Scope*> microbatch_scopes_;
std::vector<std::string> skip_vars_;
const Scope* minibatch_scope_;

@ -76,7 +76,7 @@ REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker);
REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
#endif
#if defined(PADDLE_WITH_NCCL)
#if (defined PADDLE_WITH_NCCL) || (defined WITH_ASCEND_CL)
REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
#endif
} // namespace framework

@ -32,6 +32,14 @@ message ShardingConfig {
optional float fuse_broadcast_MB = 1 [ default = 32.0 ];
optional bool hybrid_dp = 2 [ default = false ];
optional int32 sharding_group_size = 3 [ default = 8 ];
optional bool as_outer_parallelism = 4 [ default = false ];
optional int32 parallelism = 5 [ default = 1 ];
optional bool use_pipeline = 6 [ default = false ];
optional int32 acc_steps = 7 [ default = 1 ];
optional int32 schedule_mode = 8 [ default = 0 ];
optional int32 pp_bz = 9 [ default = 1 ];
optional bool pp_allreduce_in_optimize = 10 [ default = false ];
optional bool optimize_offload = 11 [ default = false ];
}
message AMPConfig {
@ -44,6 +52,8 @@ message AMPConfig {
repeated string custom_white_list = 7;
repeated string custom_black_list = 8;
repeated string custom_black_varnames = 9;
optional bool use_pure_fp16 = 10 [ default = false ];
optional bool use_fp16_guard = 11 [ default = true ];
}
message LocalSGDConfig {
@ -117,6 +127,8 @@ message AsyncConfig {
message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; }
message ModelParallelConfig { optional int32 parallelism = 1 [ default = 1 ]; }
message DistributedStrategy {
// bool options
optional Mode mode = 1 [ default = COLLECTIVE ];
@ -140,12 +152,13 @@ message DistributedStrategy {
optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ];
optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ];
optional bool cudnn_exhaustive_search = 21 [ default = true ];
optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
optional int32 conv_workspace_size_limit = 22 [ default = 512 ];
optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
optional bool adaptive_localsgd = 24 [ default = false ];
optional bool fp16_allreduce = 25 [ default = false ];
optional bool sharding = 26 [ default = false ];
optional float last_comm_group_size_MB = 27 [ default = 1 ];
optional bool model_parallel = 28 [ default = false ];
optional RecomputeConfig recompute_configs = 101;
optional AMPConfig amp_configs = 102;
@ -158,6 +171,7 @@ message DistributedStrategy {
optional LambConfig lamb_configs = 109;
optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110;
optional ShardingConfig sharding_configs = 111;
optional ModelParallelConfig model_parallel_configs = 112;
optional BuildStrategy build_strategy = 201;
optional ExecutionStrategy execution_strategy = 202;
}

@ -83,6 +83,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::NPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::NPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
::DLContext ctx;

@ -466,6 +466,25 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for NPU.";
gc.reset(new NPUUnsafeFastGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Please set FLAGS_fast_eager_deletion_mode=true to use "
"GarbageCollector on NPU."));
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
VLOG(4) << "Use default stream gc for NPU.";
gc.reset(new NPUDefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
}
#else
PADDLE_THROW(
platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle"));
#endif
}
}

@ -31,3 +31,7 @@ endif(WITH_GLOO)
# Wrapper over the heterogeneous-device service; needs the framework proto and
# per-device context plus the heter service proto.
cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto)
cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
# Ascend (Huawei NPU) graph-engine wrapper is only built when WITH_ASCEND is on;
# it links against the GE API libraries (ascend_ge / ascend_graph).
if(WITH_ASCEND)
cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph)
endif(WITH_ASCEND)

@ -0,0 +1,22 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_ASCEND
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
namespace paddle {
namespace framework {
// Out-of-line definition of the process-wide AscendInstance singleton storage
// declared in ascend_wrapper.h; created lazily by AscendInstance::GetInstance().
std::shared_ptr<AscendInstance> AscendInstance::ascend_instance_ = nullptr;
}  // end namespace framework
}  // end namespace paddle
#endif

@ -0,0 +1,208 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_ASCEND
#include <glog/logging.h>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"
#include "ge/ge_api.h"
#include "ge/ge_api_types.h"
#include "graph/attr_value.h"
#include "graph/tensor.h"
#include "graph/types.h"
namespace paddle {
namespace framework {
// Alias for a Graph Engine computation graph description.
typedef ge::Graph AscendGraphDesc;
// Newer GE toolkits expose their own string type; older ones take std::string.
// The alias lets the option maps below compile against either API version.
#ifdef PADDLE_WITH_ASCEND_STRING
using AscendString = ge::AscendString;
#else
using AscendString = std::string;
#endif
// Process-wide wrapper around a Huawei Ascend Graph Engine (GE) session.
// Owns the ge::Session, registers subgraphs, and converts tensors between
// paddle's host Tensor and ge::Tensor for running those subgraphs.
class AscendInstance {
 public:
  virtual ~AscendInstance() {}
  AscendInstance() {}

  // Default global options passed to ge::GEInitialize.
  // "ge.graphRunMode" = "1" requests graph run (training) mode on device 0.
  std::map<AscendString, AscendString> _GetDefaultInitOptions() {
    std::map<AscendString, AscendString> init_options;
    init_options["ge.exec.deviceId"] = "0";
    init_options["ge.graphRunMode"] = "1";
    return init_options;
  }

  // Default options for constructing the ge::Session; currently empty.
  std::map<AscendString, AscendString> _GetDefaultInitSessionOptions() {
    std::map<AscendString, AscendString> init_options;
    // init_options["a"] = "b";
    // init_options["ge.trainFlag"] = "1";
    return init_options;
  }

  // Initialize the GE framework with the default options (used by unit tests).
  ge::Status InitGEForUT() {
    return ge::GEInitialize(_GetDefaultInitOptions());
  }

  // Create the GE session held by this instance.
  void InitGlobalResouces() {
    LOG(INFO) << "Begin ascend InitGlobalResouces";
    session_.reset(new ge::Session(_GetDefaultInitSessionOptions()));
    if (session_ == nullptr) {
      LOG(FATAL) << "new session error:" << session_;
    }
    LOG(INFO) << "End ascend InitGlobalResouces";
  }

  // Release the GE session (drops the last shared_ptr reference).
  void DestroyGlobalResouces() {
    LOG(INFO) << "Begin ascend DestroyGlobalResouces";
    session_ = nullptr;
    // BUG FIX: this line previously logged "Begin ..." a second time,
    // making start/end of teardown indistinguishable in the log.
    LOG(INFO) << "End ascend DestroyGlobalResouces";
  }

  // Lazily-created process-wide singleton.
  // NOTE(review): initialization is not thread-safe (plain check-then-reset);
  // confirm callers only invoke this from a single thread.
  static std::shared_ptr<AscendInstance> GetInstance() {
    if (nullptr == ascend_instance_) {
      ascend_instance_.reset(new paddle::framework::AscendInstance());
      VLOG(1) << "Initialize AscendInstance Done";
    }
    return ascend_instance_;
  }

  // Register `graph` with the GE session under the id `graph_idx`.
  // Aborts with PreconditionNotMet if the engine rejects the graph.
  void AddAscendSubgraph(int graph_idx, const AscendGraphDesc &graph) {
    ge::Status status = session_->AddGraph(graph_idx, graph);
    PADDLE_ENFORCE_EQ(status, ge::SUCCESS,
                      paddle::platform::errors::PreconditionNotMet(
                          "Calling addGraph of graph engine failed, please "
                          "check Ascend Log."));
    VLOG(1) << "AddAscendSubgraph " << graph_idx << " Done";
  }

  // Map a paddle var type to the corresponding GE tensor dtype.
  // Throws Unimplemented for types outside {FP16, FP32, FP64, INT32, INT64}.
  ge::DataType VarTypeToGeType(proto::VarType::Type type) {
    if (type == proto::VarType::FP16) {
      return ge::DataType::DT_FLOAT16;
    } else if (type == proto::VarType::FP32) {
      return ge::DataType::DT_FLOAT;
    } else if (type == proto::VarType::FP64) {
      return ge::DataType::DT_DOUBLE;
    } else if (type == proto::VarType::INT32) {
      return ge::DataType::DT_INT32;
    } else if (type == proto::VarType::INT64) {
      return ge::DataType::DT_INT64;
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Not support %s as tensor type.", DataTypeToString(type)));
    }
  }

  // Size in bytes of one element of the given var type; same supported set
  // as VarTypeToGeType.
  int GeTypeSize(proto::VarType::Type type) {
    if (type == proto::VarType::FP16) {
      return 2;
    } else if (type == proto::VarType::FP32) {
      return 4;
    } else if (type == proto::VarType::FP64) {
      return 8;
    } else if (type == proto::VarType::INT32) {
      return 4;
    } else if (type == proto::VarType::INT64) {
      return 8;
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Not support %s as tensor type.", DataTypeToString(type)));
    }
  }

  // Deep-copy a paddle Tensor into a ge::Tensor with FORMAT_ND layout.
  // The element buffer is copied byte-wise; dtype is mapped via VarTypeToGeType.
  ge::Tensor ConvertToGeTensor(const Tensor *tensor) {
    auto numel = tensor->numel();
    std::vector<int64_t> vec_dim;
    auto dimen = arity(tensor->dims());
    for (auto i = 0; i < dimen; ++i) {
      vec_dim.push_back(tensor->dims()[i]);
    }
    // For Debug
    // VLOG(1) << "input numel: " << numel << ", dimen is " << vec_dim.size() <<
    // ", and shape is";
    // for (const auto e : vec_dim) {
    //   VLOG(0) << e;
    // }
    ge::Shape shape(vec_dim);
    ge::TensorDesc tensor_desc(shape, ge::Format::FORMAT_ND,
                               VarTypeToGeType(tensor->type()));
    tensor_desc.SetRealDimCnt(vec_dim.size());
    const uint8_t *data =
        reinterpret_cast<const uint8_t *>(tensor->data<void>());
    std::vector<uint8_t> dst(numel * GeTypeSize(tensor->type()));
    memcpy(dst.data(), data, GeTypeSize(tensor->type()) * numel);
    ge::Tensor ge_tensor(tensor_desc, dst);
    return ge_tensor;
  }

  // Run the previously-added graph `graph_idx` on `inputs`, writing each GE
  // output back into `(*outputs)[i]` as a flat uint8 CPU buffer (all GE
  // results are treated as raw bytes; callers reinterpret the dtype).
  void RunAscendSubgraph(int graph_idx,
                         const std::vector<const Tensor *> &inputs,
                         std::vector<Tensor *> *outputs) {
    VLOG(1) << "Ascend Graph[" << graph_idx << "] is about to run.";
    // Convert paddle Tensor to GE Tensor
    std::vector<ge::Tensor> ge_inputs;
    for (const auto &e : inputs) {
      ge_inputs.push_back(ConvertToGeTensor(e));
    }
    // Run Graph
    std::vector<ge::Tensor> ge_outputs;
    ge::Status status = session_->RunGraph(graph_idx, ge_inputs, ge_outputs);
    PADDLE_ENFORCE_EQ(status, ge::SUCCESS,
                      paddle::platform::errors::PreconditionNotMet(
                          "Calling RunGraph of graph engine failed, please "
                          "check Ascend Log."));
    VLOG(1) << "Run Ascend Graph[" << graph_idx << "] Done";
    // change tensor back, note all tensor's type computed in GE is uint8
    for (size_t i = 0; i < ge_outputs.size(); ++i) {
      const uint8_t *ret_data = ge_outputs[i].GetData();
      size_t size = ge_outputs[i].GetSize();
      VLOG(1) << "GE Tensor size of the " << i << "th output var is " << size;
      auto *dst = (*outputs)[i]->mutable_data<uint8_t>(
          {static_cast<int64_t>(size)}, platform::CPUPlace());
      memcpy(dst, ret_data, size);
      // Following for debug:
      // VLOG(0) << "output for " << i << " var: ";
      // float *tmp = reinterpret_cast<float*>(dst);
      // for (size_t j = 0; j < size / 4; ++j) {
      //   printf("%f ", tmp[j]);
      // }
      // printf("\n");
    }
  }

 protected:
  std::shared_ptr<ge::Session> session_;

 private:
  static std::shared_ptr<AscendInstance> ascend_instance_;
};
} // namespace framework
} // namespace paddle
#endif

@ -89,7 +89,8 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
: GarbageCollector(place, max_memory_size) {
platform::CUDADeviceGuard guard(place.device);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
callback_manager_.reset(new platform::StreamCallbackManager(stream_));
callback_manager_.reset(
new platform::StreamCallbackManager<cudaStream_t>(stream_));
}
StreamGarbageCollector::~StreamGarbageCollector() {
@ -118,6 +119,32 @@ void CUDAPinnedGarbageCollector::ClearCallback(
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// GC for NPU that serializes deletions via the per-device default stream's
// callback queue, so buffers are freed only after in-flight work completes.
NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector(
const platform::NPUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
// Block until every callback queued on the NPU device context has run.
void NPUDefaultStreamGarbageCollector::Wait() const {
static_cast<platform::NPUDeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
}
// Defer `callback` (which performs the actual free) onto the device
// context's stream-callback queue instead of running it immediately.
void NPUDefaultStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
static_cast<platform::NPUDeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
}
// "Unsafe fast" GC: frees memory immediately with no stream synchronization.
// Safe only when the allocator/runtime guarantees the buffer is no longer
// in use by a pending kernel (hence the "unsafe" in the name).
NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector(
const platform::NPUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
// Run the free callback synchronously, right now.
void NPUUnsafeFastGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
#endif
int64_t GetEagerDeletionThreshold() {
return FLAGS_eager_delete_tensor_gb < 0
? -1

@ -117,7 +117,8 @@ class StreamGarbageCollector : public GarbageCollector {
private:
cudaStream_t stream_;
std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
std::unique_ptr<platform::StreamCallbackManager<cudaStream_t>>
callback_manager_;
};
class CUDAPinnedGarbageCollector : public GarbageCollector {
@ -130,6 +131,28 @@ class CUDAPinnedGarbageCollector : public GarbageCollector {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// Garbage collector that defers frees onto the NPU device context's
// default-stream callback queue; Wait() drains pending callbacks.
class NPUDefaultStreamGarbageCollector : public GarbageCollector {
public:
NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place,
size_t max_memory_size);
void Wait() const override;
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
// Garbage collector that frees NPU memory immediately, with no stream
// synchronization ("unsafe fast" mode, enabled via
// FLAGS_fast_eager_deletion_mode).
class NPUUnsafeFastGarbageCollector : public GarbageCollector {
public:
NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place,
size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
#endif
template <typename Container>
void GarbageCollector::Add(Container &&objs) {
Add(std::forward<Container>(objs), []() {});

@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
return LibraryType::kPlain;
} else if (s == std::string("XPU")) {
return LibraryType::kPlain;
} else if (s == std::string("NPU")) {
return LibraryType::kPlain;
} else if (s == std::string("CUDA")) {
return LibraryType::kPlain;
} else {

@ -304,6 +304,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
// Register kernel implementations of `op_type` for the Ascend NPU place,
// mirroring REGISTER_OP_XPU_KERNEL above.
#define REGISTER_OP_NPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, NPU, ::paddle::platform::NPUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
@ -340,6 +343,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
// Functor-style NPU kernel registration with the default customized type
// value, mirroring the CUDA/XPU *_FUNCTOR macros above.
#define REGISTER_OP_NPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, NPU, ::paddle::platform::NPUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
/**
* Macro to mark what Operator and Kernel
* we will use and tell the compiler to

@ -212,6 +212,16 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_npu_place(place)) {
#ifndef PADDLE_WITH_ASCEND_CL
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with NPU support.",
place));
#else
auto dev_id = BOOST_GET_CONST(platform::NPUPlace, place).device;
platform::SetNPUDeviceId(dev_id);
#endif
}
@ -1265,6 +1275,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound(

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save