Compare commits

...

84 Commits

Author SHA1 Message Date
Leo Chen 463617d757
Revert "[NPU] add npu kernel for mean Op (#31562)"
4 years ago
OleNet 468ac6993b
[NPU] add npu kernel for mean Op (#31562)
4 years ago
Leo Chen 5118968d80
[NPU] add npu kernel for softmax_with_cross_entropy (#31656)
4 years ago
zhang wenhui 925432d85e
【NPU】Support npu kernel for mul op (#31584)
4 years ago
Leo Chen 1e956001ec
[NPU] add npu kernel for adam (#31644)
4 years ago
pangyoki 795b0f92d3
【NPU】Support NPU kernel for reduce_sum op v2 (#31620)
4 years ago
Leo Chen b541ca8795
[NPU] add npu kernel for sgd (#31639)
4 years ago
oyxuan-11 57220f594d
[NPU] Support NPU kernel cast op (#31635)
4 years ago
Leo Chen 3ca4bc1004
[NPU] fix allocator min chunk size (#31632)
4 years ago
oyxuan-11 11f788771c
[NPU] Support npu kernel scatter op (#31624)
4 years ago
Meiyim e3e15792a4
[NPU] support npu kernel for `less_than` (#31327)
4 years ago
Meiyim a3cc4a4a69
[NPU] Support npu op table_lookup_v2 and table_lookup_v2_grad (#31399)
4 years ago
oyjxer f250416029
[NPU] Support npu op elementwise_pow (#31576)
4 years ago
oyjxer 7241bc2210
[NPU] Support npu op elementwise_min (#31575)
4 years ago
oyjxer 9606a86b18
[NPU] Support npu op logicalnot_op (#31534)
4 years ago
oyjxer 47860ce20d
[NPU] Support npu op log, log_grad, sqrt, sqrt_grad, square, tanh and tanh_grad (#31600)
4 years ago
oyjxer de65486c19
【NPU】Support npu op elementwise_div and elementwise_div_grad (#31573)
4 years ago
OleNet ec2160a622
[NPU] add range op (#31560)
4 years ago
Leo Chen 0234693040
fix gather_grad bug (#31607)
4 years ago
Leo Chen 5e851bff42
[NPU] fix assign cmake (#31595)
4 years ago
oyjxer 382fc31f89
【NPU】Support npu op gelu and gelu_grad (#31530)
4 years ago
oyjxer 5d29a27c2e
[NPU] fix npu op elementwise_mul_grad (#31592)
4 years ago
OleNet 09bf2cfc0e
[NPU] add Assign OP (#31561)
4 years ago
xiayanming f1fdddfdc8
[NPU] Support npu kernel for c sync stream op (#31386)
4 years ago
yinhaofeng e1c33a6d69
[NPU] accuracy op (#31492)
4 years ago
xiayanming 3bf8a34c69
[NPU] Support npu kernel for amp_check_finite_and_unscale_npu op (#31457)
4 years ago
xiayanming d746197398
[NPU] Support npu kernel for gather op fix bug (#31541)
4 years ago
zhang wenhui 5d22e15b6e
【NPU】Support npu kernel for reshape2 op (#31524)
4 years ago
zhang wenhui 581e5460a0
【NPU】add relu op for npu (#31515)
4 years ago
oyjxer cfeeb4bc95
[NPU] Support npu op elementwise_max (#31574)
4 years ago
oyjxer e15ccafb84
[NPU] Support npu op elementwise_mul and elementwise_mul_grad (#31571)
4 years ago
zhang wenhui 29d50d2049
【NPU】Support npu kernel for matmul op (#31544)
4 years ago
xiayanming f400ce9f51
[NPU] Support npu kernel for reduceany op (#31422)
4 years ago
zhang wenhui 7524ac9345
【NPU】support npu kernel for fill_constant op (#31521)
4 years ago
Leo Chen 3f206e97c4
Support TensorFromVector, TensorToVector of bool type (#31518)
4 years ago
zhang wenhui 9df84bd693
【NPU】add scale op for npu (#31499)
4 years ago
xiayanming e19195f795
Support npu kernel for gather op (#31458)
4 years ago
lw921014 15823bb0df
[NPU] add npu kernel for communication op (#31437)
4 years ago
Reventon_L 388c69f27d
[NPU] squeeze and unsqueeze op for ascend (#31452)
4 years ago
Leo Chen c956c035dc
fix cmake of cryptopp to avoid downloading every time (#31451)
4 years ago
Leo Chen 83f81eb573
Fix pow, refine code (#31440)
4 years ago
Leo Chen 5fe3d596e4
Fix pow, use fillD instead of broadcast (#31433)
4 years ago
zhang wenhui ecc6e213d7
fix endif (#31431)
4 years ago
zhang wenhui b3c88e961c
[NPU] Support npu kernel for shape op (#31427)
4 years ago
Leo Chen ac3d821bc0
[NPU] add npu kernel for equal op (#31393)
4 years ago
Leo Chen 0310945f5c
[NPU] Support npu op layer_norm and layer_norm_grad (#31310)
4 years ago
Void Main 45765d6eb6
Refactor HCCLCommContext to be compatible with Paddle (#31359)
4 years ago
Leo Chen 8497e2aad3
[NPU] add npu kernel for elementwise_add_grad (#31347)
4 years ago
lw921014 9fcdaeba5e
add allreduce and broadcast without test (#31024)
4 years ago
Leo Chen 5618f14047
fix reading flags from env (#31329)
4 years ago
liym27 a1ddff81e3
[NPU] Support npu op: (1) slice (2) slice_grad (#31275)
4 years ago
Leo Chen d23bf89cf6
support list of list attribute for NPU (#31299)
4 years ago
liym27 77a0c41cb2
Fix pow npu fp16 test (#31256)
4 years ago
liym27 187248f568
[NPU] Support npu op pow and pow grad (#31247)
4 years ago
xiayanming 821c2f4ef8
add ascend unittest (#31249)
4 years ago
Leo Chen d45f5d787e
Fix typo of selected_npus (#31230)
4 years ago
xiayanming 387c1db4f1
Ascendrc (#31065)
4 years ago
Leo Chen ff4654e216
refactor npu device manager (#31154)
4 years ago
liym27 1435b4c096
[NPU] Support executor with NPU (#31057)
4 years ago
Leo Chen 678a3e8fed
support adding correct npu op in pybind.h (#31143)
4 years ago
Leo Chen 85cbd55648
Fix compilation problem (#31100)
4 years ago
Leo Chen 5cb20f30fc
add npu kernel for elementwise_sub and elementwise_sub_grad (#30973)
4 years ago
gongweibao c687edecd8
Fix reshape on GE graph. (#31084)
4 years ago
xiayanming a6edbc478b
support parsing ascend rank table file (#31000)
4 years ago
Leo Chen 1201cd2ef2
[feature] support npu allocator, part 2 (#30972)
4 years ago
Leo Chen 7e049108c5
[feature] support npu operator (#30951)
4 years ago
Leo Chen 81138239db
[feature] support npu allocator (#30840)
4 years ago
gongweibao ebef6601d5
Destroy session first. (#30954)
4 years ago
Leo Chen 500f28ec37
pass cxx_flags to gloo cmake (#30857)
4 years ago
gongweibao de42d19336
Add paddle ascend distribution training supported (#30796)
4 years ago
OleNet ebb5d181e8
Ascendrc add converted op : [range/equal/range/uniform_random/expand/squeeze], fix cast op bug (#30797)
4 years ago
dingsiyu 4a26729540
Merge ascend_optimizer and ascend_parser. (#30776)
4 years ago
gongweibao 636fefd9f8
code style (#30781)
4 years ago
Leo Chen 88dfd067bf
Dev/fix ascend string (#30749)
4 years ago
Leo Chen 6eabbc8076
fix compilation on ascend-20.1 (#30722)
4 years ago
Void Main 904cc44349
[Feature] Build parser to support distributed training (#30658)
4 years ago
gongweibao 5b77b259d8
cleanup (#30646)
4 years ago
gongweibao 7158061a29
Add startup bash files of test_ascend_group. (#30645)
4 years ago
gongweibao e4287ca60b
Add Hccl program group (#30642)
4 years ago
gongweibao f5aca8fbb4
Pass device_ids info from launch to trainer. (#30632)
4 years ago
Void Main d2404da768
Build parser for Hcom* operators (#30627)
4 years ago
gongweibao f9c97dd728
Add distribution supported (#30578)
4 years ago
gongweibao 1882f2ce2d
Fix compilation on CANN20.1 and older (#30494)
4 years ago
hutuxian 6dd52c5b25
Ascend rc (#30483)
4 years ago

@@ -31,9 +31,17 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
# NOTE(zhiqiu): WITH_ASCEND_CL can be compiled on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some acl related functionality on x86
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compiling GPU and XPU at the same time")
endif()
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compiling GPU and ASCEND at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
@@ -57,6 +65,10 @@ if(WITH_MUSL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
endif()
if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
if(WIN32)
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)

@@ -78,6 +78,14 @@ if(WITH_BOX_PS)
add_definitions(-DPADDLE_WITH_BOX_PS)
endif()
if(WITH_ASCEND)
add_definitions(-DPADDLE_WITH_ASCEND)
endif()
if(WITH_ASCEND_CL)
add_definitions(-DPADDLE_WITH_ASCEND_CL)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)
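
The two definitions added above are the switch points for every device-specific C++ path in this diff: GE graph-mode code is fenced with PADDLE_WITH_ASCEND, and ACL op-mode code with PADDLE_WITH_ASCEND_CL. A minimal sketch of the consuming side, assuming an ACL toolkit that provides acl/acl.h; the function and the aclInit call site are illustrative, not part of this change:

// Sketch only: NPU-specific code guards itself with the compile
// definition that CMake adds when WITH_ASCEND_CL=ON.
#ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h"  // ACL runtime header (assumed install layout)
#endif

void MaybeInitAscendRuntime() {
#ifdef PADDLE_WITH_ASCEND_CL
  // aclInit(nullptr) brings up the ACL runtime with its default config.
  aclInit(nullptr);
#endif
}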

@@ -0,0 +1,84 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: Logic is from
# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt
if(DEFINED ENV{ASCEND_CUSTOM_PATH})
set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH})
else()
set(ASCEND_DIR /usr/local/Ascend)
endif()
if(WITH_ASCEND)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
endif()
if(WITH_ASCEND_CL)
set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}")
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR})
ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
endif()

@@ -53,6 +53,7 @@ ExternalProject_Add(
"${CRYPTOPP_DOWNLOAD_CMD}"
PREFIX ${CRYPTOPP_PREFIX_DIR}
SOURCE_DIR ${CRYPTOPP_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND
COMMAND ${CMAKE_COMMAND} -E remove_directory "<SOURCE_DIR>/cmake/"
COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "<SOURCE_DIR>/cmake"

@@ -42,7 +42,7 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. && make
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make
&& mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"

@@ -198,8 +198,19 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
"-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}")
ENDIF()
SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
else()
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
# SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
# SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
endif()
cache_third_party(${TARGET_NAME}
REPOSITORY ${PROTOBUF_REPOSITORY}
@@ -234,7 +245,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
SET(PROTOBUF_VERSION 3.1.0)
SET(PROTOBUF_VERSION 3.8.0)
IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE)

@@ -16,7 +16,7 @@ INCLUDE(ExternalProject)
SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool)
SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool)
SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git)
SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040)
cache_third_party(extern_threadpool

@@ -17,8 +17,9 @@ INCLUDE(ExternalProject)
SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
@@ -52,7 +53,7 @@ ExternalProject_Add(
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}

@@ -151,6 +151,8 @@ set(COMMON_FLAGS
-Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
-Wimplicit-fallthrough=0 # Warning in tinyformat.h
-Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
-Wno-error=nonnull-compare # Warning in boost gcc 7.2
-Wno-error=address # Warning in boost gcc 7.2
${fsanitize}
)

@@ -440,9 +440,19 @@ function(cc_test TARGET_NAME)
cc_test_build(${TARGET_NAME}
SRCS ${cc_test_SRCS}
DEPS ${cc_test_DEPS})
cc_test_run(${TARGET_NAME}
COMMAND ${TARGET_NAME}
ARGS ${cc_test_ARGS})
# we don't test hcom ops, because they need complex configuration
# with more than one machine
if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test"))
cc_test_run(${TARGET_NAME}
COMMAND ${TARGET_NAME}
ARGS ${cc_test_ARGS})
endif()
endif()
endfunction(cc_test)
@@ -859,7 +869,7 @@ function(py_test TARGET_NAME)
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
if (WIN32)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
endif()

@@ -11,12 +11,16 @@ function(op_library TARGET)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(xpu_cc_srcs)
set(npu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(cudnn_cu_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function layer common_infer_shape_functions)
if (WITH_ASCEND_CL)
set(op_common_deps ${op_common_deps} npu_op_runner)
endif()
# Option `UNITY` is used to specify that operator `TARGET` will compile with Unity Build.
set(options UNITY)
set(oneValueArgs "")
@@ -84,6 +88,12 @@ function(op_library TARGET)
list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
endif()
endif()
if(WITH_ASCEND_CL)
string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (WITH_ROCM_PLATFORM AND ${src} MATCHES ".*\\.hip.cu$")
@@ -106,6 +116,8 @@ function(op_library TARGET)
list(APPEND cu_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
list(APPEND npu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
@@ -170,7 +182,7 @@ function(op_library TARGET)
# Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
if(WITH_UNITY_BUILD AND op_library_UNITY)
# Combine the cc source files.
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs})
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs})
if(TARGET ${UNITY_TARGET})
# If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
@@ -181,7 +193,7 @@ function(op_library TARGET)
# Add alias library to handle dependencies.
add_library(${TARGET} ALIAS ${UNITY_TARGET})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
endif()
@@ -201,6 +213,7 @@ function(op_library TARGET)
# For the registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to just add one operator to pybind in a *_op.cc file.
# And for detailed pybind information, please see the generated paddle/pybind/pybind.h.
set(ORIGINAL_TARGET ${TARGET})
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
# [ \t\r\n]* is used for blank characters
@@ -230,10 +243,11 @@ function(op_library TARGET)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH npu_cc_srcs npu_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
@@ -273,6 +287,26 @@ function(op_library TARGET)
if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
endif()
if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT)
# It is different from the logic above, be careful
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\(.*" multi_npu_register "${TARGET_NPU_CONTENT}")
# [ \t\r\n]* is used for blank characters
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_npu_register "${multi_npu_register}")
if (one_npu_register STREQUAL "")
string(REPLACE "_op" "" NPU_TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OP_NPU_KERNEL(" "" NPU_TARGET "${one_npu_register}")
string(REPLACE "," "" NPU_TARGET "${NPU_TARGET}")
# [ \t\r\n]+ is used for blank characters.
# Here we use '+' instead of '*' since it is a REPLACE operation.
string(REGEX REPLACE "[ \t\r\n]+" "" NPU_TARGET "${NPU_TARGET}")
endif()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
@@ -323,6 +357,7 @@ function(register_operators)
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE "_xpu" "" OPS "${OPS}")
string(REPLACE "_npu" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)

@@ -274,6 +274,16 @@ if(WITH_BOX_PS)
list(APPEND third_party_deps extern_box_ps)
endif(WITH_BOX_PS)
if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend)
if(WITH_ASCEND)
list(APPEND third_party_deps extern_ascend)
endif()
if(WITH_ASCEND_CL)
list(APPEND third_party_deps extern_ascend_cl)
endif()
endif ()
if (WITH_PSCORE)
include(external/snappy)
list(APPEND third_party_deps extern_snappy)

@@ -83,6 +83,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::NPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::NPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
::DLContext ctx;

@@ -466,6 +466,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
// TODO(ascendrc): Support garbage collector on NPUPlace
VLOG(4) << "Skip NPU gc because it is not implemented now.";
#else
PADDLE_THROW(platform::errors::Unimplemented(
"No NPU gc found in CPU/GPU/XPU paddle"));
#endif
}
}

@@ -31,3 +31,7 @@ endif(WITH_GLOO)
cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto)
cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
if(WITH_ASCEND)
cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph)
endif(WITH_ASCEND)

@@ -0,0 +1,22 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_ASCEND
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
namespace paddle {
namespace framework {
std::shared_ptr<AscendInstance> AscendInstance::ascend_instance_ = nullptr;
} // end namespace framework
} // end namespace paddle
#endif

@@ -0,0 +1,208 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_ASCEND
#include <glog/logging.h>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"
#include "ge/ge_api.h"
#include "ge/ge_api_types.h"
#include "graph/attr_value.h"
#include "graph/tensor.h"
#include "graph/types.h"
namespace paddle {
namespace framework {
typedef ge::Graph AscendGraphDesc;
#ifdef PADDLE_WITH_ASCEND_STRING
using AscendString = ge::AscendString;
#else
using AscendString = std::string;
#endif
class AscendInstance {
public:
virtual ~AscendInstance() {}
AscendInstance() {}
std::map<AscendString, AscendString> _GetDefaultInitOptions() {
std::map<AscendString, AscendString> init_options;
init_options["ge.exec.deviceId"] = "0";
init_options["ge.graphRunMode"] = "1";
return init_options;
}
std::map<AscendString, AscendString> _GetDefaultInitSessionOptions() {
std::map<AscendString, AscendString> init_options;
// init_options["a"] = "b";
// init_options["ge.trainFlag"] = "1";
return init_options;
}
ge::Status InitGEForUT() {
return ge::GEInitialize(_GetDefaultInitOptions());
}
void InitGlobalResouces() {
LOG(INFO) << "Begin ascend InitGlobalResouces";
session_.reset(new ge::Session(_GetDefaultInitSessionOptions()));
if (session_ == nullptr) {
LOG(FATAL) << "new session error:" << session_;
}
LOG(INFO) << "End ascend InitGlobalResouces";
}
void DestroyGlobalResouces() {
LOG(INFO) << "Begin ascend DestroyGlobalResouces";
session_ = nullptr;
LOG(INFO) << "Begin ascend DestroyGlobalResouces";
}
static std::shared_ptr<AscendInstance> GetInstance() {
if (nullptr == ascend_instance_) {
ascend_instance_.reset(new paddle::framework::AscendInstance());
VLOG(1) << "Initialize AscendInstance Done";
}
return ascend_instance_;
}
void AddAscendSubgraph(int graph_idx, const AscendGraphDesc &graph) {
ge::Status status = session_->AddGraph(graph_idx, graph);
PADDLE_ENFORCE_EQ(status, ge::SUCCESS,
paddle::platform::errors::PreconditionNotMet(
"Calling addGraph of graph engine failed, please "
"check Ascend Log."));
VLOG(1) << "AddAscendSubgraph " << graph_idx << " Done";
}
ge::DataType VarTypeToGeType(proto::VarType::Type type) {
if (type == proto::VarType::FP16) {
return ge::DataType::DT_FLOAT16;
} else if (type == proto::VarType::FP32) {
return ge::DataType::DT_FLOAT;
} else if (type == proto::VarType::FP64) {
return ge::DataType::DT_DOUBLE;
} else if (type == proto::VarType::INT32) {
return ge::DataType::DT_INT32;
} else if (type == proto::VarType::INT64) {
return ge::DataType::DT_INT64;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Not support %s as tensor type.", DataTypeToString(type)));
}
}
int GeTypeSize(proto::VarType::Type type) {
if (type == proto::VarType::FP16) {
return 2;
} else if (type == proto::VarType::FP32) {
return 4;
} else if (type == proto::VarType::FP64) {
return 8;
} else if (type == proto::VarType::INT32) {
return 4;
} else if (type == proto::VarType::INT64) {
return 8;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Not support %s as tensor type.", DataTypeToString(type)));
}
}
ge::Tensor ConvertToGeTensor(const Tensor *tensor) {
auto numel = tensor->numel();
std::vector<int64_t> vec_dim;
auto dimen = arity(tensor->dims());
for (auto i = 0; i < dimen; ++i) {
vec_dim.push_back(tensor->dims()[i]);
}
// For Debug
// VLOG(1) << "input numel: " << numel << ", dimen is " << vec_dim.size() <<
// ", and shape is";
// for (const auto e : vec_dim) {
// VLOG(0) << e;
// }
ge::Shape shape(vec_dim);
ge::TensorDesc tensor_desc(shape, ge::Format::FORMAT_ND,
VarTypeToGeType(tensor->type()));
tensor_desc.SetRealDimCnt(vec_dim.size());
const uint8_t *data =
reinterpret_cast<const uint8_t *>(tensor->data<void>());
std::vector<uint8_t> dst(numel * GeTypeSize(tensor->type()));
memcpy(dst.data(), data, GeTypeSize(tensor->type()) * numel);
ge::Tensor ge_tensor(tensor_desc, dst);
return ge_tensor;
}
void RunAscendSubgraph(int graph_idx,
const std::vector<const Tensor *> &inputs,
std::vector<Tensor *> *outputs) {
VLOG(1) << "Ascend Graph[" << graph_idx << "] is about to run.";
// Convert paddle Tensor to GE Tensor
std::vector<ge::Tensor> ge_inputs;
for (const auto &e : inputs) {
ge_inputs.push_back(ConvertToGeTensor(e));
}
// Run Graph
std::vector<ge::Tensor> ge_outputs;
ge::Status status = session_->RunGraph(graph_idx, ge_inputs, ge_outputs);
PADDLE_ENFORCE_EQ(status, ge::SUCCESS,
paddle::platform::errors::PreconditionNotMet(
"Calling RunGraph of graph engine failed, please "
"check Ascend Log."));
VLOG(1) << "Run Ascend Graph[" << graph_idx << "] Done";
// change tensors back; note that the type of every tensor computed in GE is uint8
for (size_t i = 0; i < ge_outputs.size(); ++i) {
const uint8_t *ret_data = ge_outputs[i].GetData();
size_t size = ge_outputs[i].GetSize();
VLOG(1) << "GE Tensor size of the " << i << "th output var is " << size;
auto *dst = (*outputs)[i]->mutable_data<uint8_t>({(int64_t)size},
platform::CPUPlace());
memcpy(dst, ret_data, size);
// Following for debug:
// VLOG(0) << "output for " << i << " var: ";
// float *tmp = reinterpret_cast<float*>(dst);
// for (size_t j = 0; j < size / 4; ++j) {
// printf("%f ", tmp[j]);
// }
// printf("\n");
}
}
protected:
std::shared_ptr<ge::Session> session_;
private:
static std::shared_ptr<AscendInstance> ascend_instance_;
};
} // namespace framework
} // namespace paddle
#endif
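
A minimal usage sketch for the singleton above, assuming a GE graph has already been built elsewhere; the driver function is hypothetical, and the (sic) spellings InitGlobalResouces/DestroyGlobalResouces match the identifiers declared in the header:

// Hypothetical driver for AscendInstance; not part of this diff.
#include <vector>
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"

#ifdef PADDLE_WITH_ASCEND
void RunSubgraphOnce(const paddle::framework::AscendGraphDesc& graph,
                     const std::vector<const paddle::framework::Tensor*>& ins,
                     std::vector<paddle::framework::Tensor*>* outs) {
  auto ascend = paddle::framework::AscendInstance::GetInstance();
  ascend->InitGlobalResouces();             // creates the ge::Session
  ascend->AddAscendSubgraph(/*graph_idx=*/0, graph);
  ascend->RunAscendSubgraph(/*graph_idx=*/0, ins, outs);
  ascend->DestroyGlobalResouces();          // releases the session
}
#endif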

@@ -89,7 +89,8 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
: GarbageCollector(place, max_memory_size) {
platform::CUDADeviceGuard guard(place.device);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
callback_manager_.reset(new platform::StreamCallbackManager(stream_));
callback_manager_.reset(
new platform::StreamCallbackManager<cudaStream_t>(stream_));
}
StreamGarbageCollector::~StreamGarbageCollector() {

@@ -117,7 +117,8 @@ class StreamGarbageCollector : public GarbageCollector {
private:
cudaStream_t stream_;
std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
std::unique_ptr<platform::StreamCallbackManager<cudaStream_t>>
callback_manager_;
};
class CUDAPinnedGarbageCollector : public GarbageCollector {

@@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
return LibraryType::kPlain;
} else if (s == std::string("XPU")) {
return LibraryType::kPlain;
} else if (s == std::string("NPU")) {
return LibraryType::kPlain;
} else if (s == std::string("CUDA")) {
return LibraryType::kPlain;
} else {

@@ -304,6 +304,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
#define REGISTER_OP_NPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, NPU, ::paddle::platform::NPUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
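
For context, a hedged sketch of how the new REGISTER_OP_NPU_KERNEL macro is meant to be used from an operator's *_op_npu.cc file; the op name and kernel class below are hypothetical and the Compute body is elided. The file naming matters: the op_library changes in cmake/operators.cmake earlier in this diff discover NPU kernels by matching *_op_npu.cc and grepping for this macro.

// my_relu_op_npu.cc (hypothetical) -- registers an NPU kernel so that
// USE_OP_DEVICE_KERNEL(my_relu, NPU) can be emitted into pybind.h.
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename T>
class MyReluNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Build and launch the corresponding ACL operator here (elided).
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
// Expands to REGISTER_OP_KERNEL(my_relu, NPU, ::paddle::platform::NPUPlace, ...)
REGISTER_OP_NPU_KERNEL(my_relu, ops::MyReluNPUKernel<float>);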

@@ -212,6 +212,16 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_npu_place(place)) {
#ifndef PADDLE_WITH_ASCEND_CL
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with NPU support.",
place));
#else
auto dev_id = BOOST_GET_CONST(platform::NPUPlace, place).device;
platform::SetNPUDeviceId(dev_id);
#endif
}
@@ -1265,6 +1275,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound(

@@ -614,6 +614,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const BuildStrategy &build_strategy,
ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places, scope)) {
PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]),
platform::errors::Unavailable(
"NPU is not supported in ParallelExecutor"));
InitP2P(places);
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size());

@@ -125,25 +125,54 @@ TEST(Tensor, MutableData) {
float* p2 = nullptr;
// initialization
p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
platform::CUDAPlace());
platform::CUDAPlace(0));
auto p1_holder = src_tensor.Holder();
EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size
// memory is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 1024}),
platform::CUDAPlace());
platform::CUDAPlace(0));
auto p2_holder = src_tensor.Holder();
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1_holder.get(), p2_holder.get());
// set src_tensor a new dim with same size
// memory block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
platform::CUDAPlace());
platform::CUDAPlace(0));
EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size
// memory block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
platform::CUDAPlace());
platform::CUDAPlace(0));
EXPECT_EQ(p1, p2);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
{
framework::Tensor src_tensor;
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
platform::NPUPlace(0));
auto p1_holder = src_tensor.Holder();
EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size
// memory is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 1024}),
platform::NPUPlace(0));
auto p2_holder = src_tensor.Holder();
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1_holder.get(), p2_holder.get());
// set src_tensor a new dim with same size
// momery block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
platform::NPUPlace(0));
EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size
// memory block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
platform::NPUPlace(0));
EXPECT_EQ(p1, p2);
}
#endif
@@ -179,7 +208,17 @@ TEST(Tensor, ShareDataWith) {
framework::Tensor src_tensor;
framework::Tensor dst_tensor;
src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
platform::CUDAPlace());
platform::CUDAPlace(0));
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
{
framework::Tensor src_tensor;
framework::Tensor dst_tensor;
src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
platform::NPUPlace(0));
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
@@ -216,7 +255,34 @@ TEST(Tensor, Slice) {
{
framework::Tensor src_tensor;
src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
platform::CUDAPlace());
platform::CUDAPlace(0));
framework::Tensor slice_tensor = src_tensor.Slice(2, 6);
framework::DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
EXPECT_EQ(slice_dims[0], 4);
EXPECT_EQ(slice_dims[1], 9);
uintptr_t src_data_address =
reinterpret_cast<uintptr_t>(src_tensor.data<double>());
uintptr_t src_mutable_data_address =
reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
src_tensor.dims(), platform::CUDAPlace(0)));
uintptr_t slice_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
uintptr_t slice_mutable_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
slice_tensor.dims(), platform::CUDAPlace(0)));
EXPECT_EQ(src_data_address, src_mutable_data_address);
EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
{
framework::Tensor src_tensor;
src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
platform::NPUPlace(0));
framework::Tensor slice_tensor = src_tensor.Slice(2, 6);
framework::DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
@@ -227,12 +293,12 @@ TEST(Tensor, Slice) {
reinterpret_cast<uintptr_t>(src_tensor.data<double>());
uintptr_t src_mutable_data_address =
reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
src_tensor.dims(), platform::CUDAPlace()));
src_tensor.dims(), platform::NPUPlace(0)));
uintptr_t slice_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
uintptr_t slice_mutable_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
slice_tensor.dims(), platform::CUDAPlace()));
slice_tensor.dims(), platform::NPUPlace(0)));
EXPECT_EQ(src_data_address, src_mutable_data_address);
EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);

@@ -97,6 +97,42 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// TODO(zhiqiu): handle different conditions like the CUDA code below
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
stream);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size,
stream);
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
stream);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
@@ -304,6 +340,35 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { /* npu -> cpu*/
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
nullptr);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { /* cpu -> npu*/
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size,
nullptr);
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { /* npu -> npu*/
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
nullptr);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
@@ -431,6 +496,13 @@ class AnyVisitor : public boost::static_visitor<bool> {
return GetResultHelper(out, gpu);
}
bool GetResult(const framework::Tensor& out,
const platform::NPUPlace& npu) const {
PADDLE_THROW(
platform::errors::Unimplemented("Not supported on place (%s) ", npu));
// return GetResultHelper(out, npu);
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
@@ -633,6 +705,10 @@ struct BothFalseVisitor : public boost::static_visitor<> {
#endif
}
void VisitorImpl(const platform::NPUPlace& npu) const {
// TODO(zhiqiu)
}
void VisitorImpl(const platform::CPUPlace& cpu) const {
int num = in_.numel();
const bool* in_ptr = in_.data<bool>();
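
Tying the tensor_util.cc changes together, a hedged sketch of a round trip through the new synchronous NPU copy branches (requires an Ascend build; the helper function and shapes are illustrative):

// Illustrative only: exercises the cpu -> npu and npu -> cpu branches
// added to TensorCopySync above (both pass a nullptr stream, i.e. sync copy).
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/place.h"

#ifdef PADDLE_WITH_ASCEND_CL
void RoundTripThroughNPU() {
  namespace f = paddle::framework;
  f::Tensor cpu_in, npu, cpu_out;
  float* p = cpu_in.mutable_data<float>(f::make_ddim({2, 3}),
                                        paddle::platform::CPUPlace());
  for (int i = 0; i < 6; ++i) p[i] = static_cast<float>(i);
  f::TensorCopySync(cpu_in, paddle::platform::NPUPlace(0), &npu);  // cpu -> npu
  f::TensorCopySync(npu, paddle::platform::CPUPlace(), &cpu_out);  // npu -> cpu
}
#endif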

Some files were not shown because too many files have changed in this diff.