Compare commits

...

125 Commits

Author SHA1 Message Date
Leo Chen bc7a3afa68
【NPU】fix bug of using temp vector (#31963)
4 years ago
Leo Chen a6343afc70
[NPU] support npu for memcpy op (#31808)
4 years ago
An Improved PeleeNet Algorithm with Feature Pyramid Networks for Image Detection 3ab39705ea
adapter npu (#31926)
4 years ago
Leo Chen ac89174e5a
[NPU] support GarbageCollector for npu (#31874)
4 years ago
Leo Chen 3c66b8721a
[NPU] add npu kernel for truncated_gaussian_random op (#31654)
4 years ago
liym27 4a823c5f63
[NPU] support fp16 of input for api pow (#31871)
4 years ago
Leo Chen f354e1d6d5
[NPU] fix some op bugs (#31855)
4 years ago
zhang wenhui 9754d0a7cb
【NPU】Add int dtype kernel for reshape2 op (#31864)
4 years ago
Leo Chen 48aa92234d
[NPU] support npu for conditional_block op (#31854)
4 years ago
Leo Chen a93488839d
fix compile problem (#31850)
4 years ago
pangyoki 0279486b02
【NPU】Support npu kernel for update_loss_scaling op (#31830)
4 years ago
pangyoki b2407af6e3
[NPU] support mixed precision input for npu layer norm (#31847)
4 years ago
Leo Chen d1a4c53eee
[NPU] support default stream (#31510)
4 years ago
Leo Chen fead563156
[NPU] fix bug of lookup_table_v2_grad (#31834)
4 years ago
liym27 149f76e636
[NPU] Support npu kernel for op elementwise_floordiv (#31822)
4 years ago
Leo Chen b3446670c1
[NPU] add npu kernel for concat op (#31695)
4 years ago
Leo Chen 03803f20fd
[NPU] support list of tensor input (#31801)
4 years ago
Leo Chen 6350528220
[NPU] support fp16 for npu accuracy op (#31797)
4 years ago
lw921014 c594f57685
add c_reduce_sum op (#31793)
4 years ago
lilong12 228bce12c8
Add 3d parallelism (#31796)
4 years ago
zhang wenhui 594bbcb189
【NPU】Fix reshape test & add grad test (#31776)
4 years ago
xiayanming fba994c28b
[NPU] fix the grad kernel diff bug of gather op (#31757)
4 years ago
oyjxer 02912ce7f2
【NPU】Fix npu kernel elementwise_div_grad (#31753)
4 years ago
Leo Chen e6af7c0dd8
[NPU] fix some bugs of npu op (#31739)
4 years ago
OleNet 17862b725f
[NPU] Support mean npu kernel (#31729)
4 years ago
Leo Chen 342252c902
[NPU] change transpose to transpose2 (#31734)
4 years ago
Void Main 7b450e7889
Add auto-increasing tag id for Hcom OPs (#31702)
4 years ago
Leo Chen 50bc11621f
[NPU] fix reshape npu op kernel (#31726)
4 years ago
liym27 c8729f2aec
[NPU] Remove redundant ctest of top_k_op_npu_test (#31718)
4 years ago
An Improved PeleeNet Algorithm with Feature Pyramid Networks for Image Detection f8e1f452c4
ascend_communicate (#31708)
4 years ago
oyxuan-11 faf40da585
[NPU] Support NPU kernel of stack op (#31711)
4 years ago
liym27 d55120d77f
[NPU] Support testing grad of NPU ops in OpTest (#31697)
4 years ago
liym27 e424712073
[NPU] Fix bug: Fix calculation errors of pow grad npu kernel (#31699)
4 years ago
OleNet 7ec8459c6c
[NPU] Support softmax npu kernel (#31564)
4 years ago
Meiyim 7875bcb8f7
[NPU] npu support `transpose` (#31486)
4 years ago
oyxuan-11 125201ee56
[NPU] Support NPU kernel sum op (#31671)
4 years ago
OleNet ef15544ee0
[NPU] add NPU add topk (#31596)
4 years ago
OleNet 743cc9b29b
[NPU] add Increment op (#31563)
4 years ago
Leo Chen 1de6daff82
[NPU] fix shape of dx in mul_grad (#31675)
4 years ago
Meiyim 3dd992e24f
[NPU] Support npu op `expand` (#31405)
4 years ago
pangyoki 444c285202
【NPU】Add TensorCopy to NPU kernel for reduce_sum op (#31667)
4 years ago
Leo Chen 8f08f160c6
Revert "[NPU] add npu kernel for mean Op (#31562)" (#31665)
4 years ago
OleNet 468ac6993b
[NPU] add npu kernel for mean Op (#31562)
4 years ago
Leo Chen 5118968d80
[NPU] add npu kernel for softmax_with_cross_entropy (#31656)
4 years ago
zhang wenhui 925432d85e
【NPU】Support npu kernel for mul op (#31584)
4 years ago
Leo Chen 1e956001ec
[NPU] add npu kernel for adam (#31644)
4 years ago
pangyoki 795b0f92d3
【NPU】Support NPU kernel for reduce_sum op v2 (#31620)
4 years ago
Leo Chen b541ca8795
[NPU] add npu kernel for sgd (#31639)
4 years ago
oyxuan-11 57220f594d
[NPU] Support NPU kernel cast op (#31635)
4 years ago
Leo Chen 3ca4bc1004
[NPU] fix allocator min chunk size (#31632)
4 years ago
oyxuan-11 11f788771c
[NPU] Support npu kernel scatter op (#31624)
4 years ago
Meiyim e3e15792a4
[NPU] support npu kernel for `less_than` (#31327)
4 years ago
Meiyim a3cc4a4a69
[NPU] Support npu op table_lookup_v2 and table_lookup_v2_grad (#31399)
4 years ago
oyjxer f250416029
[NPU] Support npu op elementwise_pow (#31576)
4 years ago
oyjxer 7241bc2210
[NPU] Support npu op elementwise_min (#31575)
4 years ago
oyjxer 9606a86b18
[NPU] Support npu op logicalnot_op (#31534)
4 years ago
oyjxer 47860ce20d
[NPU] Support npu op log, log_grad, sqrt, sqrt_grad, square, tanh and tanh_grad (#31600)
4 years ago
oyjxer de65486c19
【NPU】Support npu op elementwise_div and elementwise_div_grad (#31573)
4 years ago
OleNet ec2160a622
[NPU] add range op (#31560)
4 years ago
Leo Chen 0234693040
fix gather_grad bug (#31607)
4 years ago
Leo Chen 5e851bff42
[NPU] fix assgin cmake (#31595)
4 years ago
oyjxer 382fc31f89
【NPU】Support npu op gelu and gelu_grad (#31530)
4 years ago
oyjxer 5d29a27c2e
[NPU] fix npu op elementwise_mul_grad (#31592)
4 years ago
OleNet 09bf2cfc0e
[NPU] add Assign OP (#31561)
4 years ago
xiayanming f1fdddfdc8
[NPU] Support npu kernel for c sync stream op (#31386)
4 years ago
yinhaofeng e1c33a6d69
[NPU] accuracy op (#31492)
4 years ago
xiayanming 3bf8a34c69
[NPU] Support npu kernel for amp_check_finite_and_unscale_npu op (#31457)
4 years ago
xiayanming d746197398
[NPU] Support npu kernel for gather op fix bug (#31541)
4 years ago
zhang wenhui 5d22e15b6e
【NPU】Suppert npu kernel for reshape2 op (#31524)
4 years ago
zhang wenhui 581e5460a0
【NPU】add relu op for npu (#31515)
4 years ago
oyjxer cfeeb4bc95
[NPU] Support npu op elementwise_max (#31574)
4 years ago
oyjxer e15ccafb84
[NPU] Support npu op elementwise_mul and elementwise_mul_grad (#31571)
4 years ago
zhang wenhui 29d50d2049
【NPU】Support npu kernel for matmul op (#31544)
4 years ago
xiayanming f400ce9f51
[NPU] Support npu kernel for reduceany op (#31422)
4 years ago
zhang wenhui 7524ac9345
【NPU】support npu kernel for fill_constant op (#31521)
4 years ago
Leo Chen 3f206e97c4
Support TensorFormVector, TensorToVector of bool type (#31518)
4 years ago
zhang wenhui 9df84bd693
【NPU】add scale op for npu (#31499)
4 years ago
xiayanming e19195f795
Support npu kernel for gather op (#31458)
4 years ago
lw921014 15823bb0df
[NPU] add npu kernel for communication op (#31437)
4 years ago
Reventon_L 388c69f27d
[NPU] squeeze and unsqueeze op for ascend (#31452)
4 years ago
Leo Chen c956c035dc
fix cmake of cryptopp to avoid downloading every time (#31451)
4 years ago
Leo Chen 83f81eb573
Fix pow, refine code (#31440)
4 years ago
Leo Chen 5fe3d596e4
Fix pow, use fillD instead of broadcast (#31433)
4 years ago
zhang wenhui ecc6e213d7
fix endif (#31431)
4 years ago
zhang wenhui b3c88e961c
[NPU] Support npu kernel for shape op (#31427)
4 years ago
Leo Chen ac3d821bc0
[NPU] add npu kernel for equal op (#31393)
4 years ago
Leo Chen 0310945f5c
[NPU] Support npu op layer_norm and layer_norm_grad (#31310)
4 years ago
Void Main 45765d6eb6
Refactor HCCLCommContext to be compatible with Paddle (#31359)
4 years ago
Leo Chen 8497e2aad3
[NPU] add npu kernel for elementwise_add_grad (#31347)
4 years ago
lw921014 9fcdaeba5e
add allreduce and broadcast without test (#31024)
4 years ago
Leo Chen 5618f14047
fix reading flags from env (#31329)
4 years ago
liym27 a1ddff81e3
[NPU] Support npu op: (1) slice (2) slice_grad (#31275)
4 years ago
Leo Chen d23bf89cf6
support list of list attribute for NPU (#31299)
4 years ago
liym27 77a0c41cb2
Fix pow npu fp16 test (#31256)
4 years ago
liym27 187248f568
[NPU] Support npu op pow and pow grad (#31247)
4 years ago
xiayanming 821c2f4ef8
add ascend unittest (#31249)
4 years ago
Leo Chen d45f5d787e
Fix typo of selected_npus (#31230)
4 years ago
xiayanming 387c1db4f1
Ascendrc (#31065)
4 years ago
Leo Chen ff4654e216
refactor npu device manager (#31154)
4 years ago
liym27 1435b4c096
[NPU] Support executor with NPU (#31057)
4 years ago
Leo Chen 678a3e8fed
support adding correct npu op in pybind.h (#31143)
4 years ago
Leo Chen 85cbd55648
Fix compilation problem (#31100)
4 years ago
Leo Chen 5cb20f30fc
add npu kernel for elementwise_sub and elementwise_sub_grad (#30973)
4 years ago
gongweibao c687edecd8
Fix reshape on GE graph. (#31084)
4 years ago
xiayanming a6edbc478b
support parsing ascend rank table file (#31000)
4 years ago
Leo Chen 1201cd2ef2
[feature] support npu allocator, part 2 (#30972)
4 years ago
Leo Chen 7e049108c5
[feature] support npu operator (#30951)
4 years ago
Leo Chen 81138239db
[feature] support npu allocator (#30840)
4 years ago
gongweibao ebef6601d5
Destroy session first. (#30954)
4 years ago
Leo Chen 500f28ec37
pass cxx_flags to gloo cmake (#30857)
4 years ago
gongweibao de42d19336
Add paddle ascend distribution training supported (#30796)
4 years ago
OleNet ebb5d181e8
Ascendrc add converted op : [range/equal/range/uniform_random/expand/squeeze], fix cast op bug (#30797)
4 years ago
dingsiyu 4a26729540
Merge ascend_optimizer and ascend_parser. (#30776)
4 years ago
gongweibao 636fefd9f8
code style (#30781)
4 years ago
Leo Chen 88dfd067bf
Dev/fix ascend string (#30749)
4 years ago
Leo Chen 6eabbc8076
fix compilation on ascend-20.1 (#30722)
4 years ago
Void Main 904cc44349
[Feature] Build parser to support distributed training (#30658)
4 years ago
gongweibao 5b77b259d8
cleanup (#30646)
4 years ago
gongweibao 7158061a29
Add startup bash files of test_ascend_group. (#30645)
4 years ago
gongweibao e4287ca60b
Add Hccl program group (#30642)
4 years ago
gongweibao f5aca8fbb4
Pass device_ids info from launch to trainer. (#30632)
4 years ago
Void Main d2404da768
Build praser for Hcom* operators (#30627)
4 years ago
gongweibao f9c97dd728
Add distribution supported (#30578)
4 years ago
gongweibao 1882f2ce2d
Fix compilcation on CANN20.1 and older (#30494)
4 years ago
hutuxian 6dd52c5b25
Ascend rc (#30483)
4 years ago

@ -31,9 +31,17 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some acl related functionality on x86
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
endif()
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
@ -57,6 +65,10 @@ if(WITH_MUSL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
endif()
if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
if(WIN32)
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)

@ -78,6 +78,14 @@ if(WITH_BOX_PS)
add_definitions(-DPADDLE_WITH_BOX_PS)
endif()
if(WITH_ASCEND)
add_definitions(-DPADDLE_WITH_ASCEND)
endif()
if(WITH_ASCEND_CL)
add_definitions(-DPADDLE_WITH_ASCEND_CL)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)

@ -0,0 +1,84 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: The logic below is adapted from
# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt
# Locate the Ascend toolkit install prefix. The ASCEND_CUSTOM_PATH environment
# variable overrides the default location (/usr/local/Ascend).
if(DEFINED ENV{ASCEND_CUSTOM_PATH})
set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH})
else()
set(ASCEND_DIR /usr/local/Ascend)
endif()
# WITH_ASCEND: import the GE (graph engine) shared libraries from the
# ascend-toolkit tree and expose them as the `extern_ascend` target.
if(WITH_ASCEND)
# Driver-side library directories (not linked below; presumably kept for
# runtime LD_LIBRARY_PATH composition — TODO confirm).
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
# Atlas layout: the same components under ascend-toolkit/latest; the actual
# imported libraries below come from this tree.
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
# Newer toolkits ship graph/ascend_string.h; gate the string-API code path on
# its presence so older CANN versions still build.
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
# Wrap the prebuilt .so files as IMPORTED targets so the rest of the build
# can link them by name.
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
# Aggregate dependency target consumed by third_party.cmake.
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
endif()
# WITH_ASCEND_CL: import the ACL (Ascend Computing Language) runtime, HCCL
# collective library and the op compiler, exposed as `extern_ascend_cl`.
if(WITH_ASCEND_CL)
set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}")
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR})
ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
# NOTE(review): ascend_hccl is imported above but not listed in the DEPENDS
# below — confirm whether that is intentional.
add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
endif()

@ -53,6 +53,7 @@ ExternalProject_Add(
"${CRYPTOPP_DOWNLOAD_CMD}"
PREFIX ${CRYPTOPP_PREFIX_DIR}
SOURCE_DIR ${CRYPTOPP_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND
COMMAND ${CMAKE_COMMAND} -E remove_directory "<SOURCE_DIR>/cmake/"
COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "<SOURCE_DIR>/cmake"

@ -42,7 +42,7 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. && make
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make
&& mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"

@ -198,8 +198,19 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
"-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}")
ENDIF()
SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
else()
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
# SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
# SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
endif()
cache_third_party(${TARGET_NAME}
REPOSITORY ${PROTOBUF_REPOSITORY}
@ -234,7 +245,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
SET(PROTOBUF_VERSION 3.1.0)
SET(PROTOBUF_VERSION 3.8.0)
IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE)

@ -16,7 +16,7 @@ INCLUDE(ExternalProject)
SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool)
SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool)
SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git)
SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040)
cache_third_party(extern_threadpool

@ -17,8 +17,9 @@ INCLUDE(ExternalProject)
SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
@ -52,7 +53,7 @@ ExternalProject_Add(
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}

@ -151,6 +151,8 @@ set(COMMON_FLAGS
-Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
-Wimplicit-fallthrough=0 # Warning in tinyformat.h
-Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
-Wno-error=nonnull-compare # Warning in boost gcc 7.2
-Wno-error=address # Warning in boost gcc 7.2
${fsanitize}
)

@ -440,9 +440,20 @@ function(cc_test TARGET_NAME)
cc_test_build(${TARGET_NAME}
SRCS ${cc_test_SRCS}
DEPS ${cc_test_DEPS})
cc_test_run(${TARGET_NAME}
COMMAND ${TARGET_NAME}
ARGS ${cc_test_ARGS})
# we don't test hcom ops, because they need a complex configuration
# with more than one machine
if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test"))
cc_test_run(${TARGET_NAME}
COMMAND ${TARGET_NAME}
ARGS ${cc_test_ARGS})
endif()
endif()
endfunction(cc_test)
@ -859,7 +870,7 @@ function(py_test TARGET_NAME)
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
if (WIN32)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
endif()

@ -11,12 +11,16 @@ function(op_library TARGET)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(xpu_cc_srcs)
set(npu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(cudnn_cu_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function layer common_infer_shape_functions)
if (WITH_ASCEND_CL)
set(op_common_deps ${op_common_deps} npu_op_runner)
endif()
# Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
set(options UNITY)
set(oneValueArgs "")
@ -84,6 +88,12 @@ function(op_library TARGET)
list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
endif()
endif()
if(WITH_ASCEND_CL)
string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (WITH_ROCM_PLATFORM AND ${src} MATCHES ".*\\.hip.cu$")
@ -106,6 +116,8 @@ function(op_library TARGET)
list(APPEND cu_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
list(APPEND npu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
@ -170,7 +182,7 @@ function(op_library TARGET)
# Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
if(WITH_UNITY_BUILD AND op_library_UNITY)
# Combine the cc source files.
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs})
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs})
if(TARGET ${UNITY_TARGET})
# If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
@ -181,7 +193,7 @@ function(op_library TARGET)
# Add alias library to handle dependencies.
add_library(${TARGET} ALIAS ${UNITY_TARGET})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
endif()
@ -201,6 +213,7 @@ function(op_library TARGET)
# The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to just adding one operator to pybind in a *_op.cc file.
# And for detail pybind information, please see generated paddle/pybind/pybind.h.
set(ORIGINAL_TARGET ${TARGET})
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
# [ \t\r\n]* is used for blank characters
@ -230,10 +243,11 @@ function(op_library TARGET)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH npu_cc_srcs npu_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
@ -273,6 +287,26 @@ function(op_library TARGET)
if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
endif()
if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT)
# It is different from the logic above, so be careful
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\(.*" multi_npu_register "${TARGET_NPU_CONTENT}")
# [ \t\r\n]* is used for blank characters
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_npu_register "${multi_npu_register}")
if (one_npu_register STREQUAL "")
string(REPLACE "_op" "" NPU_TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OP_NPU_KERNEL(" "" NPU_TARGET "${one_npu_register}")
string(REPLACE "," "" NPU_TARGET "${NPU_TARGET}")
# [ \t\r\n]+ is used for blank characters.
# Here we use '+' instead of '*' since it is a REPLACE operation.
string(REGEX REPLACE "[ \t\r\n]+" "" NPU_TARGET "${NPU_TARGET}")
endif()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
@ -323,6 +357,7 @@ function(register_operators)
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE "_xpu" "" OPS "${OPS}")
string(REPLACE "_npu" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)

@ -274,6 +274,16 @@ if(WITH_BOX_PS)
list(APPEND third_party_deps extern_box_ps)
endif(WITH_BOX_PS)
if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend)
if(WITH_ASCEND)
list(APPEND third_party_deps extern_ascend)
endif()
if(WITH_ASCEND_CL)
list(APPEND third_party_deps extern_ascend_cl)
endif()
endif ()
if (WITH_PSCORE)
include(external/snappy)
list(APPEND third_party_deps extern_snappy)

@ -28,6 +28,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
@ -451,7 +452,7 @@ class HeterBoxWorker : public HogwildWorker {
virtual void CacheProgram(const ProgramDesc& main_program) {
new (&program_) ProgramDesc(main_program);
}
virtual void ProduceTasks() override;
void ProduceTasks() override;
virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
virtual void TrainFilesWithProfiler() {}
@ -550,7 +551,7 @@ class PSGPUWorker : public HogwildWorker {
virtual void CacheProgram(const ProgramDesc& main_program) {
new (&program_) ProgramDesc(main_program);
}
virtual void ProduceTasks() override;
void ProduceTasks() override;
virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
virtual void TrainFilesWithProfiler() {}
@ -633,7 +634,7 @@ class PSGPUWorker : public HogwildWorker {
};
#endif
#if defined(PADDLE_WITH_NCCL)
#if (defined PADDLE_WITH_NCCL) || (defined WITH_ASCEND_CL)
class SectionWorker : public DeviceWorker {
public:
SectionWorker() {}
@ -654,6 +655,9 @@ class SectionWorker : public DeviceWorker {
void SetDeviceIndex(int tid) override {}
void SetThreadIndex(int thread_id) { thread_id_ = thread_id; }
void SetMicrobatchNum(int num) { num_microbatches_ = num; }
void SetPipelineStageNum(int num) { num_pipeline_stages_ = num; }
void SetPipelineStage(int stage) { pipeline_stage_ = stage; }
void SetScheduleMode(int mode) { schedule_mode_ = mode; }
void SetMicrobatchScopes(const std::vector<Scope*>& scope) {
microbatch_scopes_ = scope;
}
@ -661,11 +665,23 @@ class SectionWorker : public DeviceWorker {
void SetSkipVars(const std::vector<std::string>& skip_vars) {
skip_vars_ = skip_vars;
}
void RunBackward(
int micro_id, std::unique_ptr<GarbageCollector>&,
std::unordered_map<const OperatorBase*, std::vector<std::string>>&);
void RunForward(
int micro_id, std::unique_ptr<GarbageCollector>&,
std::unordered_map<const OperatorBase*, std::vector<std::string>>&);
void RunUpdate(
std::unique_ptr<GarbageCollector>&,
std::unordered_map<const OperatorBase*, std::vector<std::string>>&);
protected:
int section_id_;
int thread_id_;
int num_microbatches_;
int num_pipeline_stages_;
int pipeline_stage_;
int schedule_mode_; // 0 for GPipe and 1 for deepspeed
std::vector<Scope*> microbatch_scopes_;
std::vector<std::string> skip_vars_;
const Scope* minibatch_scope_;

@ -76,7 +76,7 @@ REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker);
REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
#endif
#if defined(PADDLE_WITH_NCCL)
#if (defined PADDLE_WITH_NCCL) || (defined WITH_ASCEND_CL)
REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
#endif
} // namespace framework

@ -32,6 +32,14 @@ message ShardingConfig {
optional float fuse_broadcast_MB = 1 [ default = 32.0 ];
optional bool hybrid_dp = 2 [ default = false ];
optional int32 sharding_group_size = 3 [ default = 8 ];
optional bool as_outer_parallelism = 4 [ default = false ];
optional int32 parallelism = 5 [ default = 1 ];
optional bool use_pipeline = 6 [ default = false ];
optional int32 acc_steps = 7 [ default = 1 ];
optional int32 schedule_mode = 8 [ default = 0 ];
optional int32 pp_bz = 9 [ default = 1 ];
optional bool pp_allreduce_in_optimize = 10 [ default = false ];
optional bool optimize_offload = 11 [ default = false ];
}
message AMPConfig {
@ -44,6 +52,8 @@ message AMPConfig {
repeated string custom_white_list = 7;
repeated string custom_black_list = 8;
repeated string custom_black_varnames = 9;
optional bool use_pure_fp16 = 10 [ default = false ];
optional bool use_fp16_guard = 11 [ default = true ];
}
message LocalSGDConfig {
@ -117,6 +127,8 @@ message AsyncConfig {
message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; }
message ModelParallelConfig { optional int32 parallelism = 1 [ default = 1 ]; }
message DistributedStrategy {
// bool options
optional Mode mode = 1 [ default = COLLECTIVE ];
@ -140,12 +152,13 @@ message DistributedStrategy {
optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ];
optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ];
optional bool cudnn_exhaustive_search = 21 [ default = true ];
optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
optional int32 conv_workspace_size_limit = 22 [ default = 512 ];
optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
optional bool adaptive_localsgd = 24 [ default = false ];
optional bool fp16_allreduce = 25 [ default = false ];
optional bool sharding = 26 [ default = false ];
optional float last_comm_group_size_MB = 27 [ default = 1 ];
optional bool model_parallel = 28 [ default = false ];
optional RecomputeConfig recompute_configs = 101;
optional AMPConfig amp_configs = 102;
@ -158,6 +171,7 @@ message DistributedStrategy {
optional LambConfig lamb_configs = 109;
optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110;
optional ShardingConfig sharding_configs = 111;
optional ModelParallelConfig model_parallel_configs = 112;
optional BuildStrategy build_strategy = 201;
optional ExecutionStrategy execution_strategy = 202;
}

@ -83,6 +83,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::NPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::NPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
::DLContext ctx;

@ -466,6 +466,25 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for NPU.";
gc.reset(new NPUUnsafeFastGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Please set FLAGS_fast_eager_deletion_mode=true to use "
"GarbageCollector on NPU."));
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
VLOG(4) << "Use default stream gc for NPU.";
gc.reset(new NPUDefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
}
#else
PADDLE_THROW(
platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle"));
#endif
}
}

@ -31,3 +31,7 @@ endif(WITH_GLOO)
# Wrapper over the heterogeneous-device service; needs the framework proto and
# per-device context plus the heter service proto.
cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto)
cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
# Ascend (Huawei NPU) graph-engine wrapper is only built when WITH_ASCEND is on;
# it links against the GE API libraries (ascend_ge / ascend_graph).
if(WITH_ASCEND)
cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph)
endif(WITH_ASCEND)

@ -0,0 +1,22 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_ASCEND
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
namespace paddle {
namespace framework {
// Out-of-line definition of the process-wide AscendInstance singleton storage
// declared in ascend_wrapper.h; created lazily by AscendInstance::GetInstance().
std::shared_ptr<AscendInstance> AscendInstance::ascend_instance_ = nullptr;
}  // end namespace framework
}  // end namespace paddle
#endif

@ -0,0 +1,208 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_ASCEND
#include <glog/logging.h>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"
#include "ge/ge_api.h"
#include "ge/ge_api_types.h"
#include "graph/attr_value.h"
#include "graph/tensor.h"
#include "graph/types.h"
namespace paddle {
namespace framework {
// Alias for a Graph Engine computation graph description.
typedef ge::Graph AscendGraphDesc;
// Newer GE toolkits expose their own string type; older ones take std::string.
// The alias lets the option maps below compile against either API version.
#ifdef PADDLE_WITH_ASCEND_STRING
using AscendString = ge::AscendString;
#else
using AscendString = std::string;
#endif
// Process-wide wrapper around a Huawei Ascend Graph Engine (GE) session.
// Owns the ge::Session, registers subgraphs, and converts tensors between
// paddle's host Tensor and ge::Tensor for running those subgraphs.
class AscendInstance {
 public:
  virtual ~AscendInstance() {}
  AscendInstance() {}

  // Default global options passed to ge::GEInitialize.
  // "ge.graphRunMode" = "1" requests graph run (training) mode on device 0.
  std::map<AscendString, AscendString> _GetDefaultInitOptions() {
    std::map<AscendString, AscendString> init_options;
    init_options["ge.exec.deviceId"] = "0";
    init_options["ge.graphRunMode"] = "1";
    return init_options;
  }

  // Default options for constructing the ge::Session; currently empty.
  std::map<AscendString, AscendString> _GetDefaultInitSessionOptions() {
    std::map<AscendString, AscendString> init_options;
    // init_options["a"] = "b";
    // init_options["ge.trainFlag"] = "1";
    return init_options;
  }

  // Initialize the GE framework with the default options (used by unit tests).
  ge::Status InitGEForUT() {
    return ge::GEInitialize(_GetDefaultInitOptions());
  }

  // Create the GE session held by this instance.
  void InitGlobalResouces() {
    LOG(INFO) << "Begin ascend InitGlobalResouces";
    session_.reset(new ge::Session(_GetDefaultInitSessionOptions()));
    if (session_ == nullptr) {
      LOG(FATAL) << "new session error:" << session_;
    }
    LOG(INFO) << "End ascend InitGlobalResouces";
  }

  // Release the GE session (drops the last shared_ptr reference).
  void DestroyGlobalResouces() {
    LOG(INFO) << "Begin ascend DestroyGlobalResouces";
    session_ = nullptr;
    // BUG FIX: this line previously logged "Begin ..." a second time,
    // making start/end of teardown indistinguishable in the log.
    LOG(INFO) << "End ascend DestroyGlobalResouces";
  }

  // Lazily-created process-wide singleton.
  // NOTE(review): initialization is not thread-safe (plain check-then-reset);
  // confirm callers only invoke this from a single thread.
  static std::shared_ptr<AscendInstance> GetInstance() {
    if (nullptr == ascend_instance_) {
      ascend_instance_.reset(new paddle::framework::AscendInstance());
      VLOG(1) << "Initialize AscendInstance Done";
    }
    return ascend_instance_;
  }

  // Register `graph` with the GE session under the id `graph_idx`.
  // Aborts with PreconditionNotMet if the engine rejects the graph.
  void AddAscendSubgraph(int graph_idx, const AscendGraphDesc &graph) {
    ge::Status status = session_->AddGraph(graph_idx, graph);
    PADDLE_ENFORCE_EQ(status, ge::SUCCESS,
                      paddle::platform::errors::PreconditionNotMet(
                          "Calling addGraph of graph engine failed, please "
                          "check Ascend Log."));
    VLOG(1) << "AddAscendSubgraph " << graph_idx << " Done";
  }

  // Map a paddle var type to the corresponding GE tensor dtype.
  // Throws Unimplemented for types outside {FP16, FP32, FP64, INT32, INT64}.
  ge::DataType VarTypeToGeType(proto::VarType::Type type) {
    if (type == proto::VarType::FP16) {
      return ge::DataType::DT_FLOAT16;
    } else if (type == proto::VarType::FP32) {
      return ge::DataType::DT_FLOAT;
    } else if (type == proto::VarType::FP64) {
      return ge::DataType::DT_DOUBLE;
    } else if (type == proto::VarType::INT32) {
      return ge::DataType::DT_INT32;
    } else if (type == proto::VarType::INT64) {
      return ge::DataType::DT_INT64;
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Not support %s as tensor type.", DataTypeToString(type)));
    }
  }

  // Size in bytes of one element of the given var type; same supported set
  // as VarTypeToGeType.
  int GeTypeSize(proto::VarType::Type type) {
    if (type == proto::VarType::FP16) {
      return 2;
    } else if (type == proto::VarType::FP32) {
      return 4;
    } else if (type == proto::VarType::FP64) {
      return 8;
    } else if (type == proto::VarType::INT32) {
      return 4;
    } else if (type == proto::VarType::INT64) {
      return 8;
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Not support %s as tensor type.", DataTypeToString(type)));
    }
  }

  // Deep-copy a paddle Tensor into a ge::Tensor with FORMAT_ND layout.
  // The element buffer is copied byte-wise; dtype is mapped via VarTypeToGeType.
  ge::Tensor ConvertToGeTensor(const Tensor *tensor) {
    auto numel = tensor->numel();
    std::vector<int64_t> vec_dim;
    auto dimen = arity(tensor->dims());
    for (auto i = 0; i < dimen; ++i) {
      vec_dim.push_back(tensor->dims()[i]);
    }
    // For Debug
    // VLOG(1) << "input numel: " << numel << ", dimen is " << vec_dim.size() <<
    // ", and shape is";
    // for (const auto e : vec_dim) {
    //   VLOG(0) << e;
    // }
    ge::Shape shape(vec_dim);
    ge::TensorDesc tensor_desc(shape, ge::Format::FORMAT_ND,
                               VarTypeToGeType(tensor->type()));
    tensor_desc.SetRealDimCnt(vec_dim.size());
    const uint8_t *data =
        reinterpret_cast<const uint8_t *>(tensor->data<void>());
    std::vector<uint8_t> dst(numel * GeTypeSize(tensor->type()));
    memcpy(dst.data(), data, GeTypeSize(tensor->type()) * numel);
    ge::Tensor ge_tensor(tensor_desc, dst);
    return ge_tensor;
  }

  // Run the previously-added graph `graph_idx` on `inputs`, writing each GE
  // output back into `(*outputs)[i]` as a flat uint8 CPU buffer (all GE
  // results are treated as raw bytes; callers reinterpret the dtype).
  void RunAscendSubgraph(int graph_idx,
                         const std::vector<const Tensor *> &inputs,
                         std::vector<Tensor *> *outputs) {
    VLOG(1) << "Ascend Graph[" << graph_idx << "] is about to run.";
    // Convert paddle Tensor to GE Tensor
    std::vector<ge::Tensor> ge_inputs;
    for (const auto &e : inputs) {
      ge_inputs.push_back(ConvertToGeTensor(e));
    }
    // Run Graph
    std::vector<ge::Tensor> ge_outputs;
    ge::Status status = session_->RunGraph(graph_idx, ge_inputs, ge_outputs);
    PADDLE_ENFORCE_EQ(status, ge::SUCCESS,
                      paddle::platform::errors::PreconditionNotMet(
                          "Calling RunGraph of graph engine failed, please "
                          "check Ascend Log."));
    VLOG(1) << "Run Ascend Graph[" << graph_idx << "] Done";
    // change tensor back, note all tensor's type computed in GE is uint8
    for (size_t i = 0; i < ge_outputs.size(); ++i) {
      const uint8_t *ret_data = ge_outputs[i].GetData();
      size_t size = ge_outputs[i].GetSize();
      VLOG(1) << "GE Tensor size of the " << i << "th output var is " << size;
      auto *dst = (*outputs)[i]->mutable_data<uint8_t>(
          {static_cast<int64_t>(size)}, platform::CPUPlace());
      memcpy(dst, ret_data, size);
      // Following for debug:
      // VLOG(0) << "output for " << i << " var: ";
      // float *tmp = reinterpret_cast<float*>(dst);
      // for (size_t j = 0; j < size / 4; ++j) {
      //   printf("%f ", tmp[j]);
      // }
      // printf("\n");
    }
  }

 protected:
  std::shared_ptr<ge::Session> session_;

 private:
  static std::shared_ptr<AscendInstance> ascend_instance_;
};
} // namespace framework
} // namespace paddle
#endif

@ -89,7 +89,8 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
: GarbageCollector(place, max_memory_size) {
platform::CUDADeviceGuard guard(place.device);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
callback_manager_.reset(new platform::StreamCallbackManager(stream_));
callback_manager_.reset(
new platform::StreamCallbackManager<cudaStream_t>(stream_));
}
StreamGarbageCollector::~StreamGarbageCollector() {
@ -118,6 +119,32 @@ void CUDAPinnedGarbageCollector::ClearCallback(
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// GC for NPU that serializes deletions via the per-device default stream's
// callback queue, so buffers are freed only after in-flight work completes.
NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector(
const platform::NPUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
// Block until every callback queued on the NPU device context has run.
void NPUDefaultStreamGarbageCollector::Wait() const {
static_cast<platform::NPUDeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
}
// Defer `callback` (which performs the actual free) onto the device
// context's stream-callback queue instead of running it immediately.
void NPUDefaultStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
static_cast<platform::NPUDeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
}
// "Unsafe fast" GC: frees memory immediately with no stream synchronization.
// Safe only when the allocator/runtime guarantees the buffer is no longer
// in use by a pending kernel (hence the "unsafe" in the name).
NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector(
const platform::NPUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
// Run the free callback synchronously, right now.
void NPUUnsafeFastGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
#endif
int64_t GetEagerDeletionThreshold() {
return FLAGS_eager_delete_tensor_gb < 0
? -1

@ -117,7 +117,8 @@ class StreamGarbageCollector : public GarbageCollector {
private:
cudaStream_t stream_;
std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
std::unique_ptr<platform::StreamCallbackManager<cudaStream_t>>
callback_manager_;
};
class CUDAPinnedGarbageCollector : public GarbageCollector {
@ -130,6 +131,28 @@ class CUDAPinnedGarbageCollector : public GarbageCollector {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// Garbage collector that defers frees onto the NPU device context's
// default-stream callback queue; Wait() drains pending callbacks.
class NPUDefaultStreamGarbageCollector : public GarbageCollector {
public:
NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place,
size_t max_memory_size);
void Wait() const override;
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
// Garbage collector that frees NPU memory immediately, with no stream
// synchronization ("unsafe fast" mode, enabled via
// FLAGS_fast_eager_deletion_mode).
class NPUUnsafeFastGarbageCollector : public GarbageCollector {
public:
NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place,
size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
#endif
template <typename Container>
void GarbageCollector::Add(Container &&objs) {
Add(std::forward<Container>(objs), []() {});

@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
return LibraryType::kPlain;
} else if (s == std::string("XPU")) {
return LibraryType::kPlain;
} else if (s == std::string("NPU")) {
return LibraryType::kPlain;
} else if (s == std::string("CUDA")) {
return LibraryType::kPlain;
} else {

@ -304,6 +304,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
// Register kernel implementations of `op_type` for the Ascend NPU place,
// mirroring REGISTER_OP_XPU_KERNEL above.
#define REGISTER_OP_NPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, NPU, ::paddle::platform::NPUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
@ -340,6 +343,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
// Functor-style NPU kernel registration with the default customized type
// value, mirroring the CUDA/XPU *_FUNCTOR macros above.
#define REGISTER_OP_NPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, NPU, ::paddle::platform::NPUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
/**
* Macro to mark what Operator and Kernel
* we will use and tell the compiler to

@ -212,6 +212,16 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_npu_place(place)) {
#ifndef PADDLE_WITH_ASCEND_CL
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with NPU support.",
place));
#else
auto dev_id = BOOST_GET_CONST(platform::NPUPlace, place).device;
platform::SetNPUDeviceId(dev_id);
#endif
}
@ -1265,6 +1275,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound(

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save