Compare commits

...

84 Commits

Author SHA1 Message Date
Leo Chen 463617d757
Revert "[NPU] add npu kernel for mean Op (#31562)"
4 years ago
OleNet 468ac6993b
[NPU] add npu kernel for mean Op (#31562)
4 years ago
Leo Chen 5118968d80
[NPU] add npu kernel for softmax_with_cross_entropy (#31656)
4 years ago
zhang wenhui 925432d85e
【NPU】Support npu kernel for mul op (#31584)
4 years ago
Leo Chen 1e956001ec
[NPU] add npu kernel for adam (#31644)
4 years ago
pangyoki 795b0f92d3
【NPU】Support NPU kernel for reduce_sum op v2 (#31620)
4 years ago
Leo Chen b541ca8795
[NPU] add npu kernel for sgd (#31639)
4 years ago
oyxuan-11 57220f594d
[NPU] Support NPU kernel cast op (#31635)
4 years ago
Leo Chen 3ca4bc1004
[NPU] fix allocator min chunk size (#31632)
4 years ago
oyxuan-11 11f788771c
[NPU] Support npu kernel scatter op (#31624)
4 years ago
Meiyim e3e15792a4
[NPU] support npu kernel for `less_than` (#31327)
4 years ago
Meiyim a3cc4a4a69
[NPU] Support npu op table_lookup_v2 and table_lookup_v2_grad (#31399)
4 years ago
oyjxer f250416029
[NPU] Support npu op elementwise_pow (#31576)
4 years ago
oyjxer 7241bc2210
[NPU] Support npu op elementwise_min (#31575)
4 years ago
oyjxer 9606a86b18
[NPU] Support npu op logicalnot_op (#31534)
4 years ago
oyjxer 47860ce20d
[NPU] Support npu op log, log_grad, sqrt, sqrt_grad, square, tanh and tanh_grad (#31600)
4 years ago
oyjxer de65486c19
【NPU】Support npu op elementwise_div and elementwise_div_grad (#31573)
4 years ago
OleNet ec2160a622
[NPU] add range op (#31560)
4 years ago
Leo Chen 0234693040
fix gather_grad bug (#31607)
4 years ago
Leo Chen 5e851bff42
[NPU] fix assign cmake (#31595)
4 years ago
oyjxer 382fc31f89
【NPU】Support npu op gelu and gelu_grad (#31530)
4 years ago
oyjxer 5d29a27c2e
[NPU] fix npu op elementwise_mul_grad (#31592)
4 years ago
OleNet 09bf2cfc0e
[NPU] add Assign OP (#31561)
4 years ago
xiayanming f1fdddfdc8
[NPU] Support npu kernel for c sync stream op (#31386)
4 years ago
yinhaofeng e1c33a6d69
[NPU] accuracy op (#31492)
4 years ago
xiayanming 3bf8a34c69
[NPU] Support npu kernel for amp_check_finite_and_unscale_npu op (#31457)
4 years ago
xiayanming d746197398
[NPU] Support npu kernel for gather op fix bug (#31541)
4 years ago
zhang wenhui 5d22e15b6e
【NPU】Support npu kernel for reshape2 op (#31524)
4 years ago
zhang wenhui 581e5460a0
【NPU】add relu op for npu (#31515)
4 years ago
oyjxer cfeeb4bc95
[NPU] Support npu op elementwise_max (#31574)
4 years ago
oyjxer e15ccafb84
[NPU] Support npu op elementwise_mul and elementwise_mul_grad (#31571)
4 years ago
zhang wenhui 29d50d2049
【NPU】Support npu kernel for matmul op (#31544)
4 years ago
xiayanming f400ce9f51
[NPU] Support npu kernel for reduceany op (#31422)
4 years ago
zhang wenhui 7524ac9345
【NPU】support npu kernel for fill_constant op (#31521)
4 years ago
Leo Chen 3f206e97c4
Support TensorFromVector, TensorToVector of bool type (#31518)
4 years ago
zhang wenhui 9df84bd693
【NPU】add scale op for npu (#31499)
4 years ago
xiayanming e19195f795
Support npu kernel for gather op (#31458)
4 years ago
lw921014 15823bb0df
[NPU] add npu kernel for communication op (#31437)
4 years ago
Reventon_L 388c69f27d
[NPU] squeeze and unsqueeze op for ascend (#31452)
4 years ago
Leo Chen c956c035dc
fix cmake of cryptopp to avoid downloading every time (#31451)
4 years ago
Leo Chen 83f81eb573
Fix pow, refine code (#31440)
4 years ago
Leo Chen 5fe3d596e4
Fix pow, use fillD instead of broadcast (#31433)
4 years ago
zhang wenhui ecc6e213d7
fix endif (#31431)
4 years ago
zhang wenhui b3c88e961c
[NPU] Support npu kernel for shape op (#31427)
4 years ago
Leo Chen ac3d821bc0
[NPU] add npu kernel for equal op (#31393)
4 years ago
Leo Chen 0310945f5c
[NPU] Support npu op layer_norm and layer_norm_grad (#31310)
4 years ago
Void Main 45765d6eb6
Refactor HCCLCommContext to be compatible with Paddle (#31359)
4 years ago
Leo Chen 8497e2aad3
[NPU] add npu kernel for elementwise_add_grad (#31347)
4 years ago
lw921014 9fcdaeba5e
add allreduce and broadcast without test (#31024)
4 years ago
Leo Chen 5618f14047
fix reading flags from env (#31329)
4 years ago
liym27 a1ddff81e3
[NPU] Support npu op: (1) slice (2) slice_grad (#31275)
4 years ago
Leo Chen d23bf89cf6
support list of list attribute for NPU (#31299)
4 years ago
liym27 77a0c41cb2
Fix pow npu fp16 test (#31256)
4 years ago
liym27 187248f568
[NPU] Support npu op pow and pow grad (#31247)
4 years ago
xiayanming 821c2f4ef8
add ascend unittest (#31249)
4 years ago
Leo Chen d45f5d787e
Fix typo of selected_npus (#31230)
4 years ago
xiayanming 387c1db4f1
Ascendrc (#31065)
4 years ago
Leo Chen ff4654e216
refactor npu device manager (#31154)
4 years ago
liym27 1435b4c096
[NPU] Support executor with NPU (#31057)
4 years ago
Leo Chen 678a3e8fed
support adding correct npu op in pybind.h (#31143)
4 years ago
Leo Chen 85cbd55648
Fix compilation problem (#31100)
4 years ago
Leo Chen 5cb20f30fc
add npu kernel for elementwise_sub and elementwise_sub_grad (#30973)
4 years ago
gongweibao c687edecd8
Fix reshape on GE graph. (#31084)
4 years ago
xiayanming a6edbc478b
support parsing ascend rank table file (#31000)
4 years ago
Leo Chen 1201cd2ef2
[feature] support npu allocator, part 2 (#30972)
4 years ago
Leo Chen 7e049108c5
[feature] support npu operator (#30951)
4 years ago
Leo Chen 81138239db
[feature] support npu allocator (#30840)
4 years ago
gongweibao ebef6601d5
Destroy session first. (#30954)
4 years ago
Leo Chen 500f28ec37
pass cxx_flags to gloo cmake (#30857)
4 years ago
gongweibao de42d19336
Add paddle ascend distribution training supported (#30796)
4 years ago
OleNet ebb5d181e8
Ascendrc add converted op : [range/equal/range/uniform_random/expand/squeeze], fix cast op bug (#30797)
4 years ago
dingsiyu 4a26729540
Merge ascend_optimizer and ascend_parser. (#30776)
4 years ago
gongweibao 636fefd9f8
code style (#30781)
4 years ago
Leo Chen 88dfd067bf
Dev/fix ascend string (#30749)
4 years ago
Leo Chen 6eabbc8076
fix compilation on ascend-20.1 (#30722)
4 years ago
Void Main 904cc44349
[Feature] Build parser to support distributed training (#30658)
4 years ago
gongweibao 5b77b259d8
cleanup (#30646)
4 years ago
gongweibao 7158061a29
Add startup bash files of test_ascend_group. (#30645)
4 years ago
gongweibao e4287ca60b
Add Hccl program group (#30642)
4 years ago
gongweibao f5aca8fbb4
Pass device_ids info from launch to trainer. (#30632)
4 years ago
Void Main d2404da768
Build parser for Hcom* operators (#30627)
4 years ago
gongweibao f9c97dd728
Add distribution supported (#30578)
4 years ago
gongweibao 1882f2ce2d
Fix compilation on CANN20.1 and older (#30494)
4 years ago
hutuxian 6dd52c5b25
Ascend rc (#30483)
4 years ago

@@ -31,9 +31,17 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
# NOTE(zhiqiu): WITH_ASCEND_CL can be compiled on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some acl related functionality on x86
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compiling GPU and XPU at the same time")
endif()
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compiling GPU and ASCEND at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
@@ -57,6 +65,10 @@ if(WITH_MUSL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
endif()
if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
if(WIN32)
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)

@@ -78,6 +78,14 @@ if(WITH_BOX_PS)
add_definitions(-DPADDLE_WITH_BOX_PS)
endif()
if(WITH_ASCEND)
add_definitions(-DPADDLE_WITH_ASCEND)
endif()
if(WITH_ASCEND_CL)
add_definitions(-DPADDLE_WITH_ASCEND_CL)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)
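
The two definitions added above are the switch points for every device-specific C++ path in this diff: GE graph-mode code is fenced with PADDLE_WITH_ASCEND, and ACL op-mode code with PADDLE_WITH_ASCEND_CL. A minimal sketch of the consuming side, assuming an ACL toolkit that provides acl/acl.h; the function and the aclInit call site are illustrative, not part of this change:

// Sketch only: NPU-specific code guards itself with the compile
// definition that CMake adds when WITH_ASCEND_CL=ON.
#ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h"  // ACL runtime header (assumed install layout)
#endif

void MaybeInitAscendRuntime() {
#ifdef PADDLE_WITH_ASCEND_CL
  // aclInit(nullptr) brings up the ACL runtime with its default config.
  aclInit(nullptr);
#endif
}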

@@ -0,0 +1,84 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: Logic is from
# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt
if(DEFINED ENV{ASCEND_CUSTOM_PATH})
set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH})
else()
set(ASCEND_DIR /usr/local/Ascend)
endif()
if(WITH_ASCEND)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
endif()
if(WITH_ASCEND_CL)
set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}")
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR})
ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
endif()

@@ -53,6 +53,7 @@ ExternalProject_Add(
"${CRYPTOPP_DOWNLOAD_CMD}"
PREFIX ${CRYPTOPP_PREFIX_DIR}
SOURCE_DIR ${CRYPTOPP_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND
COMMAND ${CMAKE_COMMAND} -E remove_directory "<SOURCE_DIR>/cmake/"
COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "<SOURCE_DIR>/cmake"

@@ -42,7 +42,7 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. && make
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make
&& mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"

@@ -198,8 +198,19 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
"-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}")
ENDIF()
SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
else()
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
# SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
# SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
endif()
cache_third_party(${TARGET_NAME}
REPOSITORY ${PROTOBUF_REPOSITORY}
@@ -234,7 +245,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
SET(PROTOBUF_VERSION 3.1.0)
SET(PROTOBUF_VERSION 3.8.0)
IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE)

@@ -16,7 +16,7 @@ INCLUDE(ExternalProject)
SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool)
SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool)
SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git)
SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040)
cache_third_party(extern_threadpool

@@ -17,8 +17,9 @@ INCLUDE(ExternalProject)
SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
@@ -52,7 +53,7 @@ ExternalProject_Add(
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}

@@ -151,6 +151,8 @@ set(COMMON_FLAGS
-Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
-Wimplicit-fallthrough=0 # Warning in tinyformat.h
-Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
-Wno-error=nonnull-compare # Warning in boost gcc 7.2
-Wno-error=address # Warning in boost gcc 7.2
${fsanitize}
)

@@ -440,9 +440,19 @@ function(cc_test TARGET_NAME)
cc_test_build(${TARGET_NAME}
SRCS ${cc_test_SRCS}
DEPS ${cc_test_DEPS})
cc_test_run(${TARGET_NAME}
COMMAND ${TARGET_NAME}
ARGS ${cc_test_ARGS})
# we don't test hcom ops, because they need complex configuration
# with more than one machine
if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test"))
cc_test_run(${TARGET_NAME}
COMMAND ${TARGET_NAME}
ARGS ${cc_test_ARGS})
endif()
endif()
endfunction(cc_test)
@@ -859,7 +869,7 @@ function(py_test TARGET_NAME)
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
if (WIN32)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
endif()

@@ -11,12 +11,16 @@ function(op_library TARGET)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(xpu_cc_srcs)
set(npu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(cudnn_cu_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function layer common_infer_shape_functions)
if (WITH_ASCEND_CL)
set(op_common_deps ${op_common_deps} npu_op_runner)
endif()
# Option `UNITY` is used to specify that operator `TARGET` will compile with Unity Build.
set(options UNITY)
set(oneValueArgs "")
@@ -84,6 +88,12 @@ function(op_library TARGET)
list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
endif()
endif()
if(WITH_ASCEND_CL)
string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (WITH_ROCM_PLATFORM AND ${src} MATCHES ".*\\.hip.cu$")
@@ -106,6 +116,8 @@ function(op_library TARGET)
list(APPEND cu_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
list(APPEND npu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
@@ -170,7 +182,7 @@ function(op_library TARGET)
# Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
if(WITH_UNITY_BUILD AND op_library_UNITY)
# Combine the cc source files.
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs})
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs})
if(TARGET ${UNITY_TARGET})
# If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
@@ -181,7 +193,7 @@ function(op_library TARGET)
# Add alias library to handle dependencies.
add_library(${TARGET} ALIAS ${UNITY_TARGET})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
endif()
@@ -201,6 +213,7 @@ function(op_library TARGET)
# For the registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to just add one operator to pybind in a *_op.cc file.
# And for detailed pybind information, please see the generated paddle/pybind/pybind.h.
set(ORIGINAL_TARGET ${TARGET})
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
# [ \t\r\n]* is used for blank characters
@@ -230,10 +243,11 @@ function(op_library TARGET)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH npu_cc_srcs npu_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
@@ -273,6 +287,26 @@ function(op_library TARGET)
if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
endif()
if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT)
# It is different from the logic above, be careful
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\(.*" multi_npu_register "${TARGET_NPU_CONTENT}")
# [ \t\r\n]* is used for blank characters
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_npu_register "${multi_npu_register}")
if (one_npu_register STREQUAL "")
string(REPLACE "_op" "" NPU_TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OP_NPU_KERNEL(" "" NPU_TARGET "${one_npu_register}")
string(REPLACE "," "" NPU_TARGET "${NPU_TARGET}")
# [ \t\r\n]+ is used for blank characters.
# Here we use '+' instead of '*' since it is a REPLACE operation.
string(REGEX REPLACE "[ \t\r\n]+" "" NPU_TARGET "${NPU_TARGET}")
endif()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
@@ -323,6 +357,7 @@ function(register_operators)
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE "_xpu" "" OPS "${OPS}")
string(REPLACE "_npu" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)

@@ -274,6 +274,16 @@ if(WITH_BOX_PS)
list(APPEND third_party_deps extern_box_ps)
endif(WITH_BOX_PS)
if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend)
if(WITH_ASCEND)
list(APPEND third_party_deps extern_ascend)
endif()
if(WITH_ASCEND_CL)
list(APPEND third_party_deps extern_ascend_cl)
endif()
endif ()
if (WITH_PSCORE)
include(external/snappy)
list(APPEND third_party_deps extern_snappy)

@@ -83,6 +83,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::NPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::NPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
::DLContext ctx;

@@ -466,6 +466,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
// TODO(ascendrc): Support garbage collector on NPUPlace
VLOG(4) << "Skip NPU gc because it is not implemented now.";
#else
PADDLE_THROW(platform::errors::Unimplemented(
"No NPU gc found in CPU/GPU/XPU paddle"));
#endif
}
}

@@ -31,3 +31,7 @@ endif(WITH_GLOO)
cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto)
cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
if(WITH_ASCEND)
cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph)
endif(WITH_ASCEND)

@@ -0,0 +1,22 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_ASCEND
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
namespace paddle {
namespace framework {
std::shared_ptr<AscendInstance> AscendInstance::ascend_instance_ = nullptr;
} // end namespace framework
} // end namespace paddle
#endif

@@ -0,0 +1,208 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_ASCEND
#include <glog/logging.h>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"
#include "ge/ge_api.h"
#include "ge/ge_api_types.h"
#include "graph/attr_value.h"
#include "graph/tensor.h"
#include "graph/types.h"
namespace paddle {
namespace framework {
typedef ge::Graph AscendGraphDesc;
#ifdef PADDLE_WITH_ASCEND_STRING
using AscendString = ge::AscendString;
#else
using AscendString = std::string;
#endif
class AscendInstance {
public:
virtual ~AscendInstance() {}
AscendInstance() {}
std::map<AscendString, AscendString> _GetDefaultInitOptions() {
std::map<AscendString, AscendString> init_options;
init_options["ge.exec.deviceId"] = "0";
init_options["ge.graphRunMode"] = "1";
return init_options;
}
std::map<AscendString, AscendString> _GetDefaultInitSessionOptions() {
std::map<AscendString, AscendString> init_options;
// init_options["a"] = "b";
// init_options["ge.trainFlag"] = "1";
return init_options;
}
ge::Status InitGEForUT() {
return ge::GEInitialize(_GetDefaultInitOptions());
}
void InitGlobalResouces() {
LOG(INFO) << "Begin ascend InitGlobalResouces";
session_.reset(new ge::Session(_GetDefaultInitSessionOptions()));
if (session_ == nullptr) {
LOG(FATAL) << "new session error:" << session_;
}
LOG(INFO) << "End ascend InitGlobalResouces";
}
void DestroyGlobalResouces() {
LOG(INFO) << "Begin ascend DestroyGlobalResouces";
session_ = nullptr;
LOG(INFO) << "Begin ascend DestroyGlobalResouces";
}
static std::shared_ptr<AscendInstance> GetInstance() {
if (nullptr == ascend_instance_) {
ascend_instance_.reset(new paddle::framework::AscendInstance());
VLOG(1) << "Initialize AscendInstance Done";
}
return ascend_instance_;
}
void AddAscendSubgraph(int graph_idx, const AscendGraphDesc &graph) {
ge::Status status = session_->AddGraph(graph_idx, graph);
PADDLE_ENFORCE_EQ(status, ge::SUCCESS,
paddle::platform::errors::PreconditionNotMet(
"Calling addGraph of graph engine failed, please "
"check Ascend Log."));
VLOG(1) << "AddAscendSubgraph " << graph_idx << " Done";
}
ge::DataType VarTypeToGeType(proto::VarType::Type type) {
if (type == proto::VarType::FP16) {
return ge::DataType::DT_FLOAT16;
} else if (type == proto::VarType::FP32) {
return ge::DataType::DT_FLOAT;
} else if (type == proto::VarType::FP64) {
return ge::DataType::DT_DOUBLE;
} else if (type == proto::VarType::INT32) {
return ge::DataType::DT_INT32;
} else if (type == proto::VarType::INT64) {
return ge::DataType::DT_INT64;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Not support %s as tensor type.", DataTypeToString(type)));
}
}
int GeTypeSize(proto::VarType::Type type) {
if (type == proto::VarType::FP16) {
return 2;
} else if (type == proto::VarType::FP32) {
return 4;
} else if (type == proto::VarType::FP64) {
return 8;
} else if (type == proto::VarType::INT32) {
return 4;
} else if (type == proto::VarType::INT64) {
return 8;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Not support %s as tensor type.", DataTypeToString(type)));
}
}
ge::Tensor ConvertToGeTensor(const Tensor *tensor) {
auto numel = tensor->numel();
std::vector<int64_t> vec_dim;
auto dimen = arity(tensor->dims());
for (auto i = 0; i < dimen; ++i) {
vec_dim.push_back(tensor->dims()[i]);
}
// For Debug
// VLOG(1) << "input numel: " << numel << ", dimen is " << vec_dim.size() <<
// ", and shape is";
// for (const auto e : vec_dim) {
// VLOG(0) << e;
// }
ge::Shape shape(vec_dim);
ge::TensorDesc tensor_desc(shape, ge::Format::FORMAT_ND,
VarTypeToGeType(tensor->type()));
tensor_desc.SetRealDimCnt(vec_dim.size());
const uint8_t *data =
reinterpret_cast<const uint8_t *>(tensor->data<void>());
std::vector<uint8_t> dst(numel * GeTypeSize(tensor->type()));
memcpy(dst.data(), data, GeTypeSize(tensor->type()) * numel);
ge::Tensor ge_tensor(tensor_desc, dst);
return ge_tensor;
}
void RunAscendSubgraph(int graph_idx,
const std::vector<const Tensor *> &inputs,
std::vector<Tensor *> *outputs) {
VLOG(1) << "Ascend Graph[" << graph_idx << "] is about to run.";
// Convert paddle Tensor to GE Tensor
std::vector<ge::Tensor> ge_inputs;
for (const auto &e : inputs) {
ge_inputs.push_back(ConvertToGeTensor(e));
}
// Run Graph
std::vector<ge::Tensor> ge_outputs;
ge::Status status = session_->RunGraph(graph_idx, ge_inputs, ge_outputs);
PADDLE_ENFORCE_EQ(status, ge::SUCCESS,
paddle::platform::errors::PreconditionNotMet(
"Calling RunGraph of graph engine failed, please "
"check Ascend Log."));
VLOG(1) << "Run Ascend Graph[" << graph_idx << "] Done";
// change tensors back; note that the type of every tensor computed in GE is uint8
for (size_t i = 0; i < ge_outputs.size(); ++i) {
const uint8_t *ret_data = ge_outputs[i].GetData();
size_t size = ge_outputs[i].GetSize();
VLOG(1) << "GE Tensor size of the " << i << "th output var is " << size;
auto *dst = (*outputs)[i]->mutable_data<uint8_t>({(int64_t)size},
platform::CPUPlace());
memcpy(dst, ret_data, size);
// Following for debug:
// VLOG(0) << "output for " << i << " var: ";
// float *tmp = reinterpret_cast<float*>(dst);
// for (size_t j = 0; j < size / 4; ++j) {
// printf("%f ", tmp[j]);
// }
// printf("\n");
}
}
protected:
std::shared_ptr<ge::Session> session_;
private:
static std::shared_ptr<AscendInstance> ascend_instance_;
};
} // namespace framework
} // namespace paddle
#endif
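
A minimal usage sketch for the singleton above, assuming a GE graph has already been built elsewhere; the driver function is hypothetical, and the (sic) spellings InitGlobalResouces/DestroyGlobalResouces match the identifiers declared in the header:

// Hypothetical driver for AscendInstance; not part of this diff.
#include <vector>
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"

#ifdef PADDLE_WITH_ASCEND
void RunSubgraphOnce(const paddle::framework::AscendGraphDesc& graph,
                     const std::vector<const paddle::framework::Tensor*>& ins,
                     std::vector<paddle::framework::Tensor*>* outs) {
  auto ascend = paddle::framework::AscendInstance::GetInstance();
  ascend->InitGlobalResouces();             // creates the ge::Session
  ascend->AddAscendSubgraph(/*graph_idx=*/0, graph);
  ascend->RunAscendSubgraph(/*graph_idx=*/0, ins, outs);
  ascend->DestroyGlobalResouces();          // releases the session
}
#endif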

@@ -89,7 +89,8 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
: GarbageCollector(place, max_memory_size) {
platform::CUDADeviceGuard guard(place.device);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
callback_manager_.reset(new platform::StreamCallbackManager(stream_));
callback_manager_.reset(
new platform::StreamCallbackManager<cudaStream_t>(stream_));
}
StreamGarbageCollector::~StreamGarbageCollector() {

@@ -117,7 +117,8 @@ class StreamGarbageCollector : public GarbageCollector {
private:
cudaStream_t stream_;
std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
std::unique_ptr<platform::StreamCallbackManager<cudaStream_t>>
callback_manager_;
};
class CUDAPinnedGarbageCollector : public GarbageCollector {

@@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
return LibraryType::kPlain;
} else if (s == std::string("XPU")) {
return LibraryType::kPlain;
} else if (s == std::string("NPU")) {
return LibraryType::kPlain;
} else if (s == std::string("CUDA")) {
return LibraryType::kPlain;
} else {

@@ -304,6 +304,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
#define REGISTER_OP_NPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, NPU, ::paddle::platform::NPUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
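
For context, a hedged sketch of how the new REGISTER_OP_NPU_KERNEL macro is meant to be used from an operator's *_op_npu.cc file; the op name and kernel class below are hypothetical and the Compute body is elided. The file naming matters: the op_library changes in cmake/operators.cmake earlier in this diff discover NPU kernels by matching *_op_npu.cc and grepping for this macro.

// my_relu_op_npu.cc (hypothetical) -- registers an NPU kernel so that
// USE_OP_DEVICE_KERNEL(my_relu, NPU) can be emitted into pybind.h.
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename T>
class MyReluNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Build and launch the corresponding ACL operator here (elided).
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
// Expands to REGISTER_OP_KERNEL(my_relu, NPU, ::paddle::platform::NPUPlace, ...)
REGISTER_OP_NPU_KERNEL(my_relu, ops::MyReluNPUKernel<float>);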

@@ -212,6 +212,16 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_npu_place(place)) {
#ifndef PADDLE_WITH_ASCEND_CL
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with NPU support.",
place));
#else
auto dev_id = BOOST_GET_CONST(platform::NPUPlace, place).device;
platform::SetNPUDeviceId(dev_id);
#endif
}
@@ -1265,6 +1275,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound(

@@ -614,6 +614,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const BuildStrategy &build_strategy,
ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places, scope)) {
PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]),
platform::errors::Unavailable(
"NPU is not supported in ParallelExecutor"));
InitP2P(places);
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size());

@@ -125,25 +125,54 @@ TEST(Tensor, MutableData) {
float* p2 = nullptr;
// initialization
p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
platform::CUDAPlace());
platform::CUDAPlace(0));
auto p1_holder = src_tensor.Holder();
EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size
// memory is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 1024}),
platform::CUDAPlace());
platform::CUDAPlace(0));
auto p2_holder = src_tensor.Holder();
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1_holder.get(), p2_holder.get());
// set src_tensor a new dim with same size
// memory block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
platform::CUDAPlace());
platform::CUDAPlace(0));
EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size
// memory block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
platform::CUDAPlace());
platform::CUDAPlace(0));
EXPECT_EQ(p1, p2);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
{
framework::Tensor src_tensor;
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
platform::NPUPlace(0));
auto p1_holder = src_tensor.Holder();
EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size
// memory is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 1024}),
platform::NPUPlace(0));
auto p2_holder = src_tensor.Holder();
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1_holder.get(), p2_holder.get());
// set src_tensor a new dim with same size
// momery block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
platform::NPUPlace(0));
EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size
// memory block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
platform::NPUPlace(0));
EXPECT_EQ(p1, p2);
}
#endif
@@ -179,7 +208,17 @@ TEST(Tensor, ShareDataWith) {
framework::Tensor src_tensor;
framework::Tensor dst_tensor;
src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
platform::CUDAPlace());
platform::CUDAPlace(0));
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
{
framework::Tensor src_tensor;
framework::Tensor dst_tensor;
src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
platform::NPUPlace(0));
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
@@ -216,7 +255,34 @@ TEST(Tensor, Slice) {
{
framework::Tensor src_tensor;
src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
platform::CUDAPlace());
platform::CUDAPlace(0));
framework::Tensor slice_tensor = src_tensor.Slice(2, 6);
framework::DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
EXPECT_EQ(slice_dims[0], 4);
EXPECT_EQ(slice_dims[1], 9);
uintptr_t src_data_address =
reinterpret_cast<uintptr_t>(src_tensor.data<double>());
uintptr_t src_mutable_data_address =
reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
src_tensor.dims(), platform::CUDAPlace(0)));
uintptr_t slice_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
uintptr_t slice_mutable_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
slice_tensor.dims(), platform::CUDAPlace(0)));
EXPECT_EQ(src_data_address, src_mutable_data_address);
EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
{
framework::Tensor src_tensor;
src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
platform::NPUPlace(0));
framework::Tensor slice_tensor = src_tensor.Slice(2, 6);
framework::DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
@@ -227,12 +293,12 @@ TEST(Tensor, Slice) {
reinterpret_cast<uintptr_t>(src_tensor.data<double>());
uintptr_t src_mutable_data_address =
reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
src_tensor.dims(), platform::CUDAPlace()));
src_tensor.dims(), platform::NPUPlace(0)));
uintptr_t slice_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
uintptr_t slice_mutable_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
slice_tensor.dims(), platform::CUDAPlace()));
slice_tensor.dims(), platform::NPUPlace(0)));
EXPECT_EQ(src_data_address, src_mutable_data_address);
EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);

@@ -97,6 +97,42 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// TODO(zhiqiu): handle different conditions like the CUDA code below
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
stream);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size,
stream);
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
stream);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
@@ -304,6 +340,35 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { /* npu -> cpu*/
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
nullptr);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { /* cpu -> npu*/
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size,
nullptr);
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { /* npu -> npu*/
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
nullptr);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
@@ -431,6 +496,13 @@ class AnyVisitor : public boost::static_visitor<bool> {
return GetResultHelper(out, gpu);
}
bool GetResult(const framework::Tensor& out,
const platform::NPUPlace& npu) const {
PADDLE_THROW(
platform::errors::Unimplemented("Not supported on place (%s) ", npu));
// return GetResultHelper(out, npu);
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
@@ -633,6 +705,10 @@ struct BothFalseVisitor : public boost::static_visitor<> {
#endif
}
void VisitorImpl(const platform::NPUPlace& npu) const {
// TODO(zhiqiu)
}
void VisitorImpl(const platform::CPUPlace& cpu) const {
int num = in_.numel();
const bool* in_ptr = in_.data<bool>();
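
Tying the tensor_util.cc changes together, a hedged sketch of a round trip through the new synchronous NPU copy branches (requires an Ascend build; the helper function and shapes are illustrative):

// Illustrative only: exercises the cpu -> npu and npu -> cpu branches
// added to TensorCopySync above (both pass a nullptr stream, i.e. sync copy).
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/place.h"

#ifdef PADDLE_WITH_ASCEND_CL
void RoundTripThroughNPU() {
  namespace f = paddle::framework;
  f::Tensor cpu_in, npu, cpu_out;
  float* p = cpu_in.mutable_data<float>(f::make_ddim({2, 3}),
                                        paddle::platform::CPUPlace());
  for (int i = 0; i < 6; ++i) p[i] = static_cast<float>(i);
  f::TensorCopySync(cpu_in, paddle::platform::NPUPlace(0), &npu);  // cpu -> npu
  f::TensorCopySync(npu, paddle::platform::CPUPlace(), &cpu_out);  // npu -> cpu
}
#endif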

Some files were not shown because too many files have changed in this diff.