Compare commits


460 Commits

Author SHA1 Message Date
wuhuanzhou 587d99ae44
update compilation with C++14 (#31815)
4 years ago
tianshuo78520a b09c1ce09a
fix whl package push pypi (#31585)
4 years ago
Thunderbrook 393b3bd6b7
fix split core (#31892)
4 years ago
wuhuanzhou 3a95a0bc26
update cmake minimum version to 3.15 (#31807)
4 years ago
taixiurong 52b05baca3
fix some bug in transformer training in xpu (#31918)
4 years ago
Wenyu 5394194e3a
support minus-int idx to LayerList (#31750)
4 years ago
furnace ef8323d49e
[ROCM] Add ROCm support for warpctc op (#31817)
4 years ago
Jiawei Wang 95f808c878
fix stack op grad nullptr (#31962)
4 years ago
liym27 57d4288ad4
[dynamic setitem] Fix bug of dynamic setitem: Decrease axes to do right broadcast (#31960)
4 years ago
石晓伟 0fa6c8a35c
fix a syntax error, test=develop (#31930)
4 years ago
Pei Yang 98e803e04f
map_matmul_to_mul_pass support 3dim (#31958)
4 years ago
wuhuanzhou a37a7f67e1
modify CI recommend information (#31395)
4 years ago
jakpiase 6dca7a1de7
Added int8 kernel for oneDNN LSTM op (#31894)
4 years ago
Pei Yang 14b7e3cf06
[Paddle-TRT] TRT inference support for BERT/Transformer in paddle 2.0 api (#31744)
4 years ago
Zhou Wei 245252b86e
fix bug when dtype of to_tensor is core.VarType (#31931)
4 years ago
Zhen Wang e1f931610e
Fix save/load error in imperative qat UT. (#31937)
4 years ago
Yiqun Liu e50bc2c2a6
Enhance cmake to support specifying CUDA_ARCH_NAME to Ampere. (#31923)
4 years ago
Zhou Wei 04a49b097e
[Custom OP]Remove old custom OP and reduce whl package volume (#31813)
4 years ago
wangguanzhong fe2848686b
add exclusive for test_conv2d_op, test=develop (#31936)
4 years ago
chajchaj 73a6fa3ed0
add deprecated for softmax_with_cross_entropy (#31722)
4 years ago
Shang Zhizhou 8084b7594b
fix batchnorm when input dims < 3 (#31933)
4 years ago
zlsh80826 64ee255ffd
[Paddle-TRT] yolobox (#31755)
4 years ago
Aurelius84 c4b60efabd
Fix segment Fault from set_value (#31891)
4 years ago
wuhuanzhou 17030ff28b
fix op benchmark ci error caused by missing test_pr branch, test=document_fix (#31920)
4 years ago
niuliling123 a71d72d921
relu forward and backward with vectortype (#31869)
4 years ago
tianshuo78520a 8829a309fe
Delete cudnn6 code (#31835)
4 years ago
wanghuancoder b48841ba2e
modify API nn.Bilinear's doc (#31889)
4 years ago
liym27 525c32e33c
Fix bug of set_value op: Decrease axes to do right broadcast (#31875)
4 years ago
ronnywang 123949eb48
[ROCM] added a cudnn switch of conv2d for rocm platform (#31836)
4 years ago
Shang Zhizhou 61805d8f0a
fix cmake model path (#31866)
4 years ago
Jiabin Yang 51eb29de18
[CustomOP] Add shape related constructor for Tensor (#31681)
4 years ago
zlsh80826 e3a38d790a
[Paddle-TRT] roi_align_plugin (#31732)
4 years ago
zlsh80826 bfb5cf5567
[Paddle-TRT] trt affine channel converter (#31628)
4 years ago
cc b47478efc2
[dygraph qat] Use layer to calculate output scale (#31861)
4 years ago
lilong12 c3974d0e2a
[3D-parallel] Reformat pipeline parallel (#31786)
4 years ago
zlsh80826 01aa252624
[Paddle-TRT] multiclass nms (#31742)
4 years ago
Wilber 70b67f1029
fix go api bug. (#31857)
4 years ago
tianshuo78520a e804f08559
delete include framework.pb.h (#31859)
4 years ago
Chengmo f58cb01864
【Paddle.Fleet】fix dataset zip py3 bug (#31441)
4 years ago
Kaipeng Deng bf09dcb346
add GPU tensor notice & update default_collate_fn/default_convert_fn. test=develop (#31763)
4 years ago
Chen Weihang 27f2d8df8e
Polish two error messages (#31852)
4 years ago
Zhou Wei 511e204e62
LRScheduler.get_lr should not update lr in LinearWarmup (#31843)
4 years ago
niuliling123 6472d62093
Revert "add relu forward kernel and backward kernel (#31613)" (#31853)
4 years ago
winter-wang e7f28d6c0d
fix runtime crash when rnn model inference, test=develop (#31833)
4 years ago
parap1uie-s 5d89ec36dc
Update pooling.py (#31829)
4 years ago
Huihuang Zheng 649868ffb2
[Dy2stat] Fix the bug that loop_body_func may return single element (#31806)
4 years ago
Wojciech Uss e5f7a834d4
fix cache key in concat oneDNN kernel (#31820)
4 years ago
Aurelius84 f2cfc0f46d
[CustomOp]Avoid raising warning while import paddle (#31804)
4 years ago
cc 84a551380e
[dygraph qat] Refine saving output scale to infer program (#31784)
4 years ago
Chen Weihang 68497e7b39
change trainable to stop_gradient in optimizer (#31823)
4 years ago
ronnywang 270699e647
[ROCM] fix test_matmul_v2_op (#31802)
4 years ago
Zhou Wei 1eb927f935
Restore the third-party library cache for windows (#31811)
4 years ago
Chen Weihang 3f66e7deab
add cmath header for bfloat (#31792)
4 years ago
Feiyu Chan 4046f1303a
add coalesce_tensor into white list when checking re-creation of parameters (#31800)
4 years ago
Zhou Wei a70de87d76
Update windows compiler and CI from VS2015 to VS2017 (#31652)
4 years ago
Wilber f4d9212de2
trt plugin upgrade to pluginv2ext (#31670)
4 years ago
niuliling123 372ac08a17
add relu forward kernel and backward kernel (#31613)
4 years ago
Wojciech Uss 814b38e30f
update scale collection and propagation algorithm (#31783)
4 years ago
tianshuo78520a 513641e153
Delete fast_check_nan_inf (#31788)
4 years ago
Shang Zhizhou 9d04ef7369
fix tensorrt output variable reshape (#31733)
4 years ago
Qi Li 46dd1d4aad
[ROCM] fix reduce_sum nan in ROCM platform, test=develop (#31780)
4 years ago
gongweibao f72d197ec5
fix launch ps ut test=develop (#31771)
4 years ago
Tao Luo 032de0bfd0
update approval (#31782)
4 years ago
zlsh80826 bfced39eb6
[Paddle-TRT] nearest_interp op (#31626)
4 years ago
arlesniak 7ccf6b6030
[oneDNN] Initial bf16 amp integration (#31093)
4 years ago
lilong12 a501a7b0ca
[3D-parallel] add 1f1b scheduler for pipeline (#31566)
4 years ago
guofei ed7956a816
Fix skip_quant in QAT (#31704)
4 years ago
ronnywang 8c19d7aa2f
[ROCM] fix test_conv2d_transpose_op (#31749)
4 years ago
Ouyang Chao a45c8ca69d
fix bug of DepthwiseConvTransposeGradKernel (#31762)
4 years ago
Jacek Czaja 25fc2a1fdb
[oneDNN] Added Elementwise Mul grad fp32/bf16 (#31647)
4 years ago
Chen Weihang 878e117b6d
[CustomOp] Support float16 in custom op (#31725)
4 years ago
ronnywang c9e1d9dc31
[ROCM] fix test_rnn_op (#31735)
4 years ago
zlsh80826 1c67cf0c98
run radix sort of proposals layer on context stream (#31631)
4 years ago
Chen Weihang e429deb0c4
[CustomOp] Support attribute in infershape function (#31713)
4 years ago
Adam Osewski a4a2b77def
[oneDNN] lookup_table op with support for BF16 data type. (#31558)
4 years ago
zlsh80826 c86e771e94
NMS Performance Optimization (#31634)
4 years ago
zlsh80826 50cafa0b0c
remove redundant sync, set collect/dist kernel to context stream, sub_lod memcpy opt (#31641)
4 years ago
cc 1d197f6c97
[dygraph qat] Refine calculating output scale of dygraph qat (#31710)
4 years ago
ronnywang 420527f0d9
[ROCM] fix layer_norm, norm, p_norm, test_sequence_softmax_op, test_math_op_patch_var_base (#31709)
4 years ago
Chen Weihang 87852616aa
[CustomOp] Support complex dtype in custom op (#31657)
4 years ago
zlsh80826 fe241fd02f
[Paddle-TRT] gather converter (#31640)
4 years ago
zlsh80826 4ea3427865
[Paddle-TRT] support batch axis concatenation when using dynamic shape (#31627)
4 years ago
Zhou Wei d4282ea97e
fix multi cuda environment bug (#31694)
4 years ago
Chengmo 09482ddec4
【Paddle.Fleet】Fix one ps gradient clip (#31664)
4 years ago
Kaipeng Deng 740359edaf
remove useless import (#31700)
4 years ago
Zhang Ting 7f50bb7ec1
support NHWC for temporal_shift op (#31642)
4 years ago
liym27 402288ad65
In __getitem__, convert integers to int64 Tensor not int32 to be compatible with Lite (#31658)
4 years ago
Chen Weihang 2fbe9b097a
[CustomOp] Remove Eigen dependencies of float16 (#31669)
4 years ago
cc 19592d2b71
Refine dygraph qat, test=develop (#31680)
4 years ago
Zhou Wei 4c0c55bba1
support Geforce RTX 30+ GPU (#31529)
4 years ago
YUNSHEN XIE cdc5a55ac1
turn off added ut check on windows (#31660)
4 years ago
Qi Li d9b50f664f
[ROCM] update ci scripts and dockerfile, test=develop (#31551)
4 years ago
YUNSHEN XIE 1a6e3b04cd
Second optimization of retry method (#31646)
4 years ago
wuhuanzhou 41e9ecfd1f
Optimize compilation with Ninja (#31449)
4 years ago
yiak c1b1ccfbf5
Update tinyformat.h (#31612)
4 years ago
gongweibao 9c624b16d5
Extend unittest time of (#31570)
4 years ago
YUNSHEN XIE 580442ceba
fix wget with no proxy on windows (#31505)
4 years ago
ronnywang da10c5cf8b
[ROCM] fix softmax_with_cross_entropy_op, test=develop (#31629)
4 years ago
LielinJiang 75433126df
Fix summary bug when calculating output shape (#31549)
4 years ago
ShenLiang c3634c6b0a
fix amp bug of fleet (#31532)
4 years ago
Chen Weihang 027b574a0e
[CustomOp] Remove the dependence of the underlying data types on eigen (#31602)
4 years ago
WangXi 9066b74f58
c_gen_nccl_id add SocketServer to persist server (#31589)
4 years ago
Kaipeng Deng a32e8bf1e7
DataLoader support dict str (#31481)
4 years ago
Chen Weihang 30a627aaf3
Normalized function parameter writing (#31588)
4 years ago
Pei Yang cac9635a67
[Paddle-TRT] Fix engine key in trt int8 calibration (#31513)
4 years ago
Shang Zhizhou 50ac7dbfd0
Trt elementwise plugin serialize (#31587)
4 years ago
guofei ef0dd3efed
Support loading parameters from checkpoint to save quantized model (#31419)
4 years ago
whs da9dda5c9b
Make CreateProgramDesc more robust (#31543)
4 years ago
hong 99dcd66508
try to fix imperative ocr unittest error; test=develop (#31568)
4 years ago
Qi Li 3d5aa9d10a
[ROCM] fix conv2d and conv3d op, test=develop (#31553)
4 years ago
YUNSHEN XIE f302bb4f8b
help timeout ut debug (#31500)
4 years ago
Chen Weihang 95cceb2dd7
[CustomOp] Support duplicable op input and output (#31535)
4 years ago
Aurelius84 def27bc801
[Dy2stat]Fix bug with static_convert_var_shape in locals scope (#31556)
4 years ago
YUNSHEN XIE 49c3d2a97b
modified show_ut_retry_result (#31528)
4 years ago
LielinJiang ac493f2c72
Update comments for API `RandomResizedCrop` (#31539)
4 years ago
lidanqing 0f1e7e3d52
[Bug fix] Different machine generate different binary file, remove md5 check (#31482)
4 years ago
jiangcheng 9ed6c895f1
optimize range op by place parameters on cpu rather than gpu, test=develop (#30811)
4 years ago
Thunderbrook 3789a69923
solve bug in heter mode (#31531)
4 years ago
chajchaj 6148b87f9d
add softmax_switch for softmax_with_cross_entropy_op, test=develop (#31428)
4 years ago
Aurelius84 f3959e9ddc
[save/load] Fix bug with input_spec=dict[InputSpec] in jit.save (#31517)
4 years ago
WangXi 83a2fb1f08
Add collective async wait op (#31463)
4 years ago
lilong12 0205e9f84e
remove the send/recv of tensor size (#31460)
4 years ago
Aurelius84 c8ae837d52
[CustomOp]Fix setup_install timeout (#31484)
4 years ago
furnace 910f377fa5
Bugfix rocm (#31490)
4 years ago
Qi Li 416e47edef
[ROCM] fix softmax with loss nan in HIP platform, test=develop (#31491)
4 years ago
Shang Zhizhou f57739be35
fix ernie_varlen when cutting head (#31497)
4 years ago
JamesLim 45c7d90564
Optimization of elementwise CUDA kernel (#30801)
4 years ago
YUNSHEN XIE 0b3c229606
Prec on mac (#31382)
4 years ago
Jacek Czaja 23d96cf221
[oneDNN] bumpup onednn 2.2 fixup version (#31473)
4 years ago
YUNSHEN XIE 390cebee15
Prec on windows exclude check_added_ut (#31372)
4 years ago
Zhou Wei 634a12b368
fix bug of windows Chinese msvc (#31493)
4 years ago
wangguanzhong 43d6abf0a5
update conv2d, test=develop (#31480)
4 years ago
wangguanzhong 50af0c2cbb
fix roi_align, test=develop (#31479)
4 years ago
ronnywang e03e46730c
[ROCM] fix gather_op, sigmoid_cross_entropy_with_logits_op, test=develop (#31467)
4 years ago
Qi Li b85c8e03be
[ROCM] fix reduce op, test=develop (#31478)
4 years ago
Jacek Czaja 39a5424ed1
[oneDNN] elementwise add bf16 grad kernel with broadcasting (#31385)
4 years ago
石晓伟 5f6213217b
update zero_copy_tensor_test.cc for build of gcc485, test=develop (#31470)
4 years ago
Qi Li 133a914bd0
[ROCM] fix test_dist_op ci test, test=develop (#31468)
4 years ago
Qi Li f9377965c4
[ROCM] fix dropout and remove hipcub, test=develop (#31455)
4 years ago
Aurelius84 fadabbe9b0
[CustomOp] Automatically specify PADDLE_WITH_MKLDNN & Remove Interpreter argument (#31391)
4 years ago
Leo Chen ffdd5b7773
Fix cmake of cryptopp to avoid downloading every time (#31447)
4 years ago
石晓伟 bc7632be73
upgrade inference tensor apis, test=develop (#31402)
4 years ago
JamesLim 8491ae9a02
Creating a CUDA function to find the minimum value in warp or block (#31191)
4 years ago
Pei Yang 30717a6cbc
fix trt serialization on windows (#31438)
4 years ago
Pei Yang 1321c47950
add more info in trt engine serialization (#31434)
4 years ago
liuyuhui 9ebf05b003
[Kunlun]Multi xpu dygraph performance optimization, add distributed.spawn support for multi xpu and some bug-fixes (#31130)
4 years ago
Qi Li 4d647ec137
[ROCM] update fluid platform for rocm (part5), test=develop (#31315)
4 years ago
liym27 522c91ec67
[Dy2Stat] Remove gast.Index for compatibility of gast 0.4.0 (#31358)
4 years ago
YUNSHEN XIE 62289fccc0
fix python full coverage decrease issue (#31429)
4 years ago
Wilber c9a7bfec89
prepare remove grad script and update PADDLE_CI_INFERENCE pipeline (#31149)
4 years ago
Zhang Ting 7d95e598c1
support float16 for temporal_shift op (#31432)
4 years ago
YUNSHEN XIE 3a8ef10e09
fix modified_retry_method_only_win (#31404)
4 years ago
Zhang Ting dcce54ea76
improve performance of depthwise_conv2d (#31099)
4 years ago
wuhuanzhou 4d6d2db812
Windows system supports Ninja compilation (#31161)
4 years ago
liym27 0fff930667
Fix bug for set_value op when input dtype is not float32 (#31411)
4 years ago
Huihuang Zheng c40b98e068
Fix comment (#31424)
4 years ago
Huihuang Zheng 6bf02a1261
[Dy2stat] Fix Read-Only Attribute as while_loop Output (#31415)
4 years ago
jakpiase 5b4f8aac82
Added LSTM BF16 and fixed GRU BF16 (#31234)
4 years ago
Qi Li 7cdf6ea770
[ROCM] update fluid elementwise op for rocm (part10), test=develop (#31361)
4 years ago
Qi Li 84639b6193
[ROCM] update fluid operators for rocm (part3), test=develop (#31213)
4 years ago
Qi Li 3b9db17199
[ROCM] update fluid operators for rocm (part7), test=develop (#31307)
4 years ago
Qi Li db50fb6766
[ROCM] fix softmax with loss and update python scripts, test=develop (#31373)
4 years ago
Pei Yang 32211fe9c4
TRT conv2d converter support SAME padding (#31379)
4 years ago
Qi Li e312a1ff6e
[ROCM] update fluid operators for rocm (part9), test=develop (#31338)
4 years ago
Qi Li 6626c6a6ad
fix bert cu file compiler error, test=develop (#31389)
4 years ago
wuhuanzhou c1bc223695
compile with VS2017, test=develop (#31388)
4 years ago
Zhou Wei 13e4280f82
[Custom OP]polish doc of custom OP (#31369)
4 years ago
Qi Li 946dbdae8c
[ROCM] update fluid operators for rocm (part6), test=develop (#31301)
4 years ago
wangna11BD 1cbccfa594
Add attrs `deformable_groups` for deformable_conv API (#31335)
4 years ago
Shang Zhizhou 77c44e2f1b
change prelu plugin to tensorRT layer (#30210)
4 years ago
YUNSHEN XIE 353dd0cd98
Modified retry method on windows (#31363)
4 years ago
Qi Li 59940cb383
[ROCM] update fluid operators for rocm (part8), test=develop (#31309)
4 years ago
tangwei12 5d7a8b05f8
fix sync training error (#31357)
4 years ago
Qi Li ec72f5b235
fix ELU output for nan, test=develop (#31132)
4 years ago
Qi Li 65bcaeb004
[ROCM] update fluid operators for rocm (part5), test=develop (#31258)
4 years ago
YUNSHEN XIE 2111d912d4
Decrease threshold for failed ut retry (#30903)
4 years ago
Pei Yang 2e9e3fad15
add n-d input support for trt scale converter (#31316)
4 years ago
Shang Zhizhou 6404c43814
support trt serialize when load model from memory (#31342)
4 years ago
chentianyu03 a2c0b60401
remove wlist_temp in wlist.json (#31356)
4 years ago
Gradie d79fdc3d62
lamb_op_xpu;test=kunlun (#31012)
4 years ago
danleifeng d1075df2e8
topo and memory performance for heterps (#30440)
4 years ago
Qi Li 72d99c5dcd
[ROCM] update fluid operators for rocm (part4), test=develop (#31225)
4 years ago
cucuzg 91635de390
opt matmul and matmul_v2 on kunlun, *test=kunlun (#31326)
4 years ago
Wilber e20234094c
Fix xpu compile and cipher symbol problem. (#31271)
4 years ago
wuhuanzhou 30858d8974
fix compilation errors for missing brpc header files, test=develop (#31325)
4 years ago
石晓伟 625482f752
inference modification for custom operator, test=develop (#31312)
4 years ago
wuhuanzhou a13f1d6930
optimize unity build (#31119)
4 years ago
jiangcheng 8f4ac6b525
optimize topk op through limit SortTopK kernel entrance, test=develop (#30403)
4 years ago
alncat bfb8a64234
updated conv bn fuse pass to make it compatible with latest batch_norm op (#31272)
4 years ago
Bin Lu a37658daff
Update transforms.py (#31252)
4 years ago
Chen Weihang 5610c1717e
fix dtype unmatched (#31305)
4 years ago
Chen Long f8bdb90917
fix readme test=document_fix (#31308)
4 years ago
Qi Li 9b016c7cb7
[ROCM] update fluid operators for rocm (part2), test=develop (#31211)
4 years ago
niuliling123 2fd999d979
Optimized the adaptive_avg_pool2d op when output_size == 1 (#31197)
4 years ago
Zhou Wei aebf223478
fix test_check_abi (#31288)
4 years ago
石晓伟 1da3280660
inference modification for custom operator, test=develop (#31283)
4 years ago
Zhou Wei cc89120a2c
[Custom OP]add MSVC compile check on Windows (#31265)
4 years ago
Zhou Wei af9066e89c
[Custom OP]add PD_THROW and PD_CHECK for User Error message (#31253)
4 years ago
石晓伟 8c94d8cb4c
[Custom OP] change the user header file format, test=develop (#31274)
4 years ago
Jiabin Yang 038ce70d69
[Custom OP] Support stream set on Custom Op (#31257)
4 years ago
Aurelius84 1dd40870fc
[Dy2Stat] Fix eval_if_exist_else_none bug (#31261)
4 years ago
pangyoki 6fafbdc39e
change np.int to int to fix paddle warning (#31221)
4 years ago
Jiabin Yang 0c38708a90
[Custom Op] Remove unsupport dtypes (#31232)
4 years ago
WangXi b8bce682e0
xpu support fuse allreduce (#31104)
4 years ago
Aurelius84 59b00e8c45
[CustomOP]Support Incremental compilation and Add Version management (#31228)
4 years ago
Chen Weihang 126633c50f
[CustomOp] Split build op marco & polish details (#31229)
4 years ago
Aurelius84 e8d24b546a
[CustomOp] Add Modeling with Custom op unittest (#31218)
4 years ago
tangwei12 903235945b
loglevel adjustment for distributed training (#31205)
4 years ago
Wilber c0bda9109f
fix xpu compile error. (#31223)
4 years ago
Qi Li 28b356b9a2
[ROCM] update fluid framework for rocm (part6), test=develop (#31015)
4 years ago
Qi Li c8fac5ee30
[ROCM] update fluid framework for rocm (part5), test=develop (#31014)
4 years ago
Qi Li 580447d019
[ROCM] update fluid framework for rocm (part4), test=develop (#31013)
4 years ago
Wilber 7d91974c91
enable lite ut. (#30890)
4 years ago
littletomatodonkey ad50fa710b
add int pad support for Pad1D/2D/3D (#31209)
4 years ago
Guanghua Yu d18c5e47f3
fix ignore_index check in softmax_with_cross_entropy (#31201)
4 years ago
chentianyu03 ca3b6bcf78
add cache for VariableWrapper (#30880)
4 years ago
wangchaochaohu f114c3f8ca
fix the branch of code choose (#31200)
4 years ago
joanna.wozna.intel d11602481c
Add bf16 gru model test (#31158)
4 years ago
jakpiase 2f1165342b
OneDNN hardswish integration (#30211)
4 years ago
Aurelius84 912022fa0c
[CustomOp]Add cpp_extension en doc (#31187)
4 years ago
Chen Weihang e8cdb49aa9
[CustomOp] Support attributes as func input in custom op (#31128)
4 years ago
Zhou Wei ffbf71359a
modify custom op dependent from paddle_framework to paddle_custom_op (#31195)
4 years ago
Leo Chen 0f1fde5102
fix the modification of set_expected_place (#31177)
4 years ago
lilong12 dc8dfba35b
align the default value of some configuration for fleet to that of single cards (#30740)
4 years ago
lilong12 a373aa7645
fix the bug in expand_v2 op (#30984)
4 years ago
Thunderbrook c4f279fe8d
support multi node in heterps (#31102)
4 years ago
liu zhengxi ae2be49f40
Add cublas_handle() to expose cublas_handle to ops (#31157)
4 years ago
Aurelius84 406f4a7513
[CustomOp] Support to specific extra_cflags and exctra_cuda_flags independently (#31059)
4 years ago
qingqing01 572cc8bd0f
Update doc for 2.0 API and some callback (#31180)
4 years ago
Pei Yang 00b09e86ac
[Paddle-TRT] support group_norm (#31040)
4 years ago
Chen Weihang c209751c8d
change test_multiprocess_reader_exception cmake (#31174)
4 years ago
YUNSHEN XIE 153121457f
fix ut timeout (#31061)
4 years ago
Chen Weihang 1ce96fa118
[CustomOp] Add new paddle custom op so (#31141)
4 years ago
tangwei12 ebbdf52557
fix entry (#31079)
4 years ago
Qi Li ee76ea72de
[ROCM] update fluid collective op for rocm, test=develop (#31075)
4 years ago
yaoxuefeng d8fa65a3a8
fix heter compile (#30518)
4 years ago
Aurelius84 dce2db4857
[CustomOp] Split build directory for each setup.py (#31124)
4 years ago
Zhou Wei 4b220550ef
[Custom OP]Fix problem of custom op unitests on Windows CI (#31114)
4 years ago
chentianyu03 70131b475f
add warning message when dtypes of operator are not same (#31136)
4 years ago
Zhou Wei be61c2d06b
support build whl and inference library nightly,test=windows3 (#30616)
4 years ago
alncat 5d6a8c7b73
added support for fake_quantize_dequantize_abs_max op in quantization… (#30896)
4 years ago
Chen Weihang e60fd1f6a8
[CustomOp] Split test and add inference test (#31078)
4 years ago
Jacek Czaja d3f09ad702
Update of onednn to 2.2 (#31067)
4 years ago
Guanghua Yu 24ba5ee05c
merge develop conflict (#31122)
4 years ago
xiemoyuan edacb6293c
Optimization of Transformer API (#30957)
4 years ago
WeiXin ee1801c1ad
Save load/save pickle protocol (#31044)
4 years ago
Qi Li cced930b61
[ROCM] update fluid operators for rocm (part1), test=develop (#31077)
4 years ago
yukavio 99fd9815b6
fix flops api (#31081)
4 years ago
wangchaochaohu 364cfa2686
fix windows for optimization of elementwise_add Op (#31068)
4 years ago
joanna.wozna.intel 781df300d0
Unification of BF16 enablement process (#31034)
4 years ago
Zhong Hui 16fe11d71e
fix softmax cross entropy integer overflow (#30590)
4 years ago
Zhou Wei 44ee251fde
fix UNIX cmake problem (#31113)
4 years ago
Qi Li a60d93fb77
[ROCM] update fluid framework for rocm (part2), test=develop (#31010)
4 years ago
Thunderbrook 565354f676
support save multi sparse table in one path (#31108)
4 years ago
Qi Li 50967135a5
[ROCM] update fluid framework for rocm (part3), test=develop (#31011)
4 years ago
Huihuang Zheng cf43a321a8
[Dy2stat] Refactoring tensor_shape_transformer.py to Fix Change after Assign Bug (#31082)
4 years ago
tangwei12 0e4b154298
fix dist fleet ctr ut (#31087)
4 years ago
Qi Li 8fe09faf14
[ROCM] update fluid framework for rocm (part1), test=develop (#31009)
4 years ago
Qi Li 334296306c
[ROCM] update fluid platform for rocm39 (part4), test=develop (#30936)
4 years ago
Shang Zhizhou a5c56d83a1
update trt int8 calibrator to IEntropyCalibratorV2 (#31060)
4 years ago
Zhou Wei adaec0073d
[2.0Custom OP]Support New Custom OP on Windows (#31063)
4 years ago
Chen Weihang 2168f08ac8
add optional for param attr args, test=document_fix (#31105)
4 years ago
Qi Li 1d996637e6
[ROCM] update fluid imperative for rocm (part1), test=develop (#31017)
4 years ago
JamesLim b95eb38b8a
fix the bug in backward OP of index_sample. (#31026)
4 years ago
Chengmo 6b3371e0c7
Remove PE special profiler (#30886)
4 years ago
Chen Weihang 6beeafe797
[CustomOp] Add more dispatch marco for users (#31058)
4 years ago
TTerror d5323dab41
add squeeze_op/unsqueeze_op on kunlun; fix conv op and parallel executor; optimize lookup_table op (#31056)
4 years ago
123malin 16b4260b2f
test=develop, save/load, shrink (#30625)
4 years ago
Shibo Tao 4424aac608
export paddle.static.normalize_program method. (#31072)
4 years ago
Jiabin Yang 628451af06
hide useless headers and add complex support (#31074)
4 years ago
Wilber 463eae0383
update paddle_fluid.so to paddle_inference.so (#30850)
4 years ago
tangwei12 a2170a0866
change fleet reviewer (#31069)
4 years ago
liym27 5b367dab44
[static setitem] Support the index is Tensor; step>1; step<0. (#30949)
4 years ago
Qi Li eb3050fa9a
[ROCM] update fluid inference for rocm (part1), test=develop (#31018)
4 years ago
Jack Zhou 6df1ca54c8
add detail about states index in rnn result, test=document_fix (#31048)
4 years ago
Huihuang Zheng ef627ac5b9
Fix that convert_var_shape doesn't support slice like [0:], test=develop (#31051)
4 years ago
Jacek Czaja f7465641c3
Added reshape grad bf16 (#31035)
4 years ago
Aurelius84 4dbe16c48f
[CustomOp] Refine name argument in setup (#31049)
4 years ago
Aurelius84 f2dc29a9fa
[CustomOp] Support output dtypes in generated Python API (#31045)
4 years ago
Wojciech Uss 615d8a2264
Modify relu native implementation 2 (#30996)
4 years ago
ShenLiang 9401173e3a
Remove scale loss before reduce in dygraph (#30807)
4 years ago
Wilber 0020d91506
fix python pass builder error. (#30946)
4 years ago
Wilber 39aeaa160e
fix jetson problem (#30939)
4 years ago
Wilber 01ccfbcde9
update trt error message when input height or width is -1 (#31019)
4 years ago
Wilber cf8b8f9c5e
resolve memory leak in cudnn8.0 (#31029)
4 years ago
Kaipeng Deng c4ddc3ab0d
fix dataloader collate return list mix tensor and numpy array (#30904)
4 years ago
Guanghua Yu 5b267474a9
add offset parameter in roi_align,generate_proposals.etc ops (#30864)
4 years ago
Chen Weihang 75f81233ae
fix regex error & simplify marco name (#31031)
4 years ago
Zhang Ting f0ee159280
enable exhaustive_search for forward and backward algos when dtype is float16 (#30959)
4 years ago
Pei Yang 9b54fe4154
add trt transpose and flatten converter (#31022)
4 years ago
Aurelius84 4c9f96c902
[CustomOp] Support Compile multi ops at same time (#30920)
4 years ago
joanna.wozna.intel caf9d39839
Add Conv Transpose BF16 (#30877)
4 years ago
Huihuang Zheng cbbe127483
Refine fake_interface Error Message (#30981)
4 years ago
Huihuang Zheng c137578341
Add Support for Tuple in for Loop (#30998)
4 years ago
Wojciech Uss 2497f4392f
Handle missing symlink method on Windows (#31006)
4 years ago
Aurelius84 5653c3a488
[CustomOp] Check Compiler ABI compatibility (#30869)
4 years ago
huangjun12 20e300e2df
fix lrn bug in reshape size, test=develop (#30968)
4 years ago
WeiXin 8ab29f4bea
delay timeout of unittest 'test_static_save_load'. (#30975)
4 years ago
Chen Weihang f649442ddd
New custom operator extension mechanism (#30690)
4 years ago
Zhou Wei 5c0332714f
fix bug of Linux UT parallel level (#30971)
4 years ago
chajchaj f5ca2db2cc
support label with float input of cross_entropy, test=develop (#30929)
4 years ago
pangyoki 52edaecc5d
modify dockerfile: support cuda11 and delete gcc8.2 in cpu version (#30746)
4 years ago
wuhuanzhou 9b3c80c8ab
update eigen version on Windows (#30573)
4 years ago
ShenLiang dae3e1f337
Solve inconsistent order in each card in dynamic graph (#30931)
4 years ago
WangXi 14d039e4a1
Fix the problem that the number of ops executed by xpu is wrong (#30961)
4 years ago
Huihuang Zheng 8e72e031fc
Update gast requirement, test=develop (#30932)
4 years ago
Chen Weihang 010f2caa23
try to fix reader and signal test failed (#30960)
4 years ago
Adam Osewski 3ba69809bf
Fix LayerNorm tester for gcc4.8 (#30962)
4 years ago
Qi Li 93c1d9e761
[ROCM] update fluid platform for rocm39 (part3), test=develop (#30913)
4 years ago
QingshuChen 15297a065c
fix depends of kunlun bkcl (#30945)
4 years ago
liym27 12c15bebe4
[Static setitem] Support index is ellipsis for setitem in static mode (#30836)
4 years ago
liym27 97f7a70c01
Add error message for slice op(#30851)
4 years ago
liuyuhui 87197f8c2e
[kunlun]fix sync in multi kunlun xpu dygraph training. (#30943)
4 years ago
wuhuanzhou 99bf6228b8
op benchmark ci retry with specified id (#30743)
4 years ago
石晓伟 99bd16eb4e
bug fix of xpu lite engine, test=develop (#30918)
4 years ago
tianshuo78520a 2e93233899
Add WITH_XPU_BKCL in Kunlun-CI (#30919)
4 years ago
wanghuancoder 823f499a8a
fix a bug of Sequential::__getitem__ (#30899)
4 years ago
Qi Li 34f1628ce8
[ROCM] update fluid platform for rocm39 (part2), test=develop (#30774)
4 years ago
wanghuancoder 5ded39f226
fix cpplint cfg, test=develop (#30924)
4 years ago
Jacek Czaja 9e527d9956
[oneDNN] Added basic changes for elementwise_add_grad bf16 (#30925)
4 years ago
Chengmo c98f144fbc
add truncated gaussian random (#30922)
4 years ago
liuyuhui 4a8b8b4547
[Kunlun] add gen_bkcl_id_op, support multi XPU cards training using multiprocess (#30858)
4 years ago
liym27 39f41cb47f
Performance optimization for dynamic setitem: Call op set_value to speed up because the original call to TensorToPyArray will introduce unnecessary data copy. (#30817)
4 years ago
liuyuhui bef46ccfc8
[Kunlun]fix include files of gen_comm_id_helper.cc (#30917)
4 years ago
wanghuancoder 90d92111cf
let LayerList could add [None], test=develop (#30911)
4 years ago
wanghuancoder aab3a3012e
add include for heterbox_trainer.cc, develop=test (#30910)
4 years ago
taixiurong 24873f4f77
dyngraph (#30892)
4 years ago
Zhen Wang 71acde9afc
Use correct master weights in AdamW. (#30895)
4 years ago
LielinJiang 79fa8fb0df
rm test_datasets from file parallel_UT_relu.py (#30907)
4 years ago
Adam Osewski 092a2b1413
More UT for LayerNormFuse pass (#30891)
4 years ago
tianshuo78520a a80fe67f84
Change cmake/third_party files for CI (#30833)
4 years ago
Jacek Czaja abfa822650
[oneDNN]Extended adaptive pooling support for oneDNN pool kernel (#30757)
4 years ago
joanna.wozna.intel 73cdea01d4
Add bf16 fast performance verification (#30551)
4 years ago
Shang Zhizhou e6095bc2ce
fix split trt plugin initialize (#30875)
4 years ago
WangXi 6e3856d3fb
fix xpu dygraph place (#30868)
4 years ago
wanghuancoder 35c5b23f68
use iwyu clean include second time, test=develop (#30829)
4 years ago
Zhang Ting e97905c5fa
improve performance of momentum (#30881)
4 years ago
GT-Zhang 4b2d52a001
Update README.md (#30873)
4 years ago
fluffyrita 635e168c22
Update README_cn.md (#30867)
4 years ago
cucuzg ac2e2e6b7f
add clip_by_norm on kunlun, *test=kunlun (#30862)
4 years ago
Kaipeng Deng 302427170f
remove numpy array check in single-process dataloader. test=develop (#30861)
4 years ago
wawltor b7560a59ab
fix the broadcast for the large second input (#30818)
4 years ago
JamesLim 6e1e036a75
Implement cuda kernel for index_sample. (#30380)
4 years ago
AshburnLee 666efc2336
Call new cudnn batch norm API regardless of data type and data layout (#30157)
4 years ago
QingshuChen 5c8455d6ea
try again if kunlun memory malloc failed (#30855)
4 years ago
石晓伟 2ac4143b6c
support xpu with analysis predictor, test=develop (#30832)
4 years ago
joejiong 05d2b7a37f
Update paddle.static.Print with paddle2.0 api (#30846)
4 years ago
Aurelius84 e49d0746dd
[CustomOp] Support install as Package and Add load interface (#30798)
4 years ago
liuyuhui 2cb55eff57
fix WITH_XPU_BKCL in CMakeLists.txt (#30854)
4 years ago
Adam Osewski 4f066e316e
Layer normalization fuse pass. (#30721)
4 years ago
WangXi b1026f64af
【kunlun】dygraph supports multi xpu card training (#30671)
4 years ago
LielinJiang 3a3ff75c52
Fix unittest random failed of test_datasets (#30804)
4 years ago
joanna.wozna.intel 04532b8a83
Update Xbyak to v5.81 (#30809)
4 years ago
Shang Zhizhou b909450994
fix trt plugin clone and initialize bugs in TRT7.1+ (#30709)
4 years ago
Wilber b08ae368bb
ci compilation depends on a stable release (#30755)
4 years ago
Shang Zhizhou 200ee33df8
fix unittest random error (#30808)
4 years ago
xiemoyuan db87087283
Optimize the encoder of Transformer. (#30439)
4 years ago
Thunderbrook cb66c53c2d
dump to cpu (#30750)
4 years ago
Chengmo d3fac0ea85
fix int64 bug (#30780)
4 years ago
Qi Li 69875dc42c
[ROCM] update fluid memory for rocm35 (part1), test=develop (#30758)
4 years ago
tianshuo78520a 5b1ab51ca4
Change PR-CI-PY3 cc version (#30771)
4 years ago
QingshuChen c35a9880f9
fix malloc L3 failed bug for kunlun (#30745)
4 years ago
WangXi 31ed9c9eed
Fleet distributed strategy support pure fp16 (#30754)
4 years ago
Zhen Wang 53d01afed6
Fix the nan bug when passing all zero values into clip_by_norm_op. (#30777)
4 years ago
ShenLiang 3858f458ea
rm Singleton of reducer (#30775)
4 years ago
Aurelius84 2c974cc316
【CustomOp】support setup.py to compile custom op (#30753)
4 years ago
Jiaqi Liu 65a9744cfd
fix paddle.static.acc and auc sample code bug, test=document_fix (#30715)
4 years ago
Chen Long 3fa2e2c67c
update readme links (#30756)
4 years ago
Qi Li f89da4ab45
[ROCM] update fluid platform for rocm35 (part1), test=develop (#30639)
4 years ago
Wojciech Uss fc00240575
A fix for oneDNN matmul kernel. Fixes issue #30309 (#30723)
4 years ago
lidanqing 46989e889b
Fix python3 incompatibility issues (#30698)
4 years ago
tianshuo78520a a12b6bb9cb
add readme in whl package (#30726)
4 years ago
alncat 5b59499e57
fixed compilation error on gcc 4.8.x due to the usage of isfinite (#30733)
4 years ago
Chengmo 78d37c3f75
【Paddle.Fleet】Fix brpc get hostname (#30703)
4 years ago
WeiXin 3491acfb1e
Split unittest. (#30727)
4 years ago
taixiurong caf3680bbc
fix bugs in transformer predict in xpu place (#30730)
4 years ago
liu zhengxi a87d78f1a9
update gather_tree doc (#30693)
4 years ago
liu zhengxi fef3654b4e
upgrade gather_tree to core.ops (#30697)
4 years ago
jakpiase f8da5536ed
REUPLOAD Added vanilla LSTM and LSTM with peepholes oneDNN fp32 kernel (#30719)
4 years ago
liuyuhui 67abfc1588
[Kunlun] fix dead lock for exec_op_count_ (#30718)
4 years ago
liym27 13ef444fa6
[Dy2Stat] Fix error message when the message has more than one lines. (#30714)
4 years ago
alncat 5ace20fc3f
modified conv+bn fuse pass to fix wrong mask in mask rcnn (#30704)
4 years ago
Tao Luo 824a79d383
Revert "Added vanilla LSTM and LSTM with peepholes oneDNN fp32 kernel (#30661)" (#30708)
4 years ago
lilong12 7fbc68a2c0
update, test=develop (#30692)
4 years ago
jakpiase d834f4e6e8
Added vanilla LSTM and LSTM with peepholes oneDNN fp32 kernel (#30661)
4 years ago
Leo Chen 1a13626f5f
polish printing dtype (#30682)
4 years ago
arlesniak 5bf25d1e8b
More precise mkldnn kernel rules in GetExpectedKernelType (#29840)
4 years ago
WangXi a28a202603
fix test_gen_nccl_id_op failed (#30686)
4 years ago
123malin 164275704d
test=develop, fix nonzero astuple=true (#30647)
4 years ago
yingshengBD 0eea5d714f
post quantize support insert fake_quantize_dequantize node before the OPs that will be used in VIS's faceid models (#30659)
4 years ago
123malin 06a3e31148
test=develop, fix test_lookahead (#30677)
4 years ago
Qi Li 846ce40604
[ROCM] update eigen cmake and patch, test=develop (#30602)
4 years ago
Jacek Czaja 173660be7b
[oneDNN] Cache oneDNN stream not to recreate in each oneDNN op (#30358)
4 years ago
Shang Zhizhou ae0f88a988
add DLA support:C++&&Python api (#30165)
4 years ago
yukavio 8c5f158172
remove PrettyTable dependence from paddle.flops (#30675)
4 years ago
chentianyu03 fb7fbc7a5d
fix abs bug and add abs test case (#30637)
4 years ago
tianshuo78520a 37926611a6
clean dockerfile (#30650)
4 years ago
wuhuanzhou f400bd7084
set WITH_INFERENCE_API_TEST=ON on Windows with GPU (#30090)
4 years ago
石晓伟 39fac847cd
delete the lite meta info because of ccache, test=develop (#30644)
4 years ago
ShenLiang 9514b4aa5f
Fix scatter grad bug (#30604)
4 years ago
Qi Li 1f5841c2a0
[ROCM] update cmake and dockerfile, test=develop (#30598)
4 years ago
Pei Yang cf9bdb9404
extend trt ut timeout threshold (#30537)
4 years ago
Thunderbrook 1bebc09253
solve build gpu task core (#30626)
4 years ago
石晓伟 33bf6eb753
revert external gflags, test=develop (#30623)
4 years ago
Zhen Wang 4a9de931a2
Fix the bug in fleet amp_init. (#30606)
4 years ago
cnn 7e9f336b58
update document of paddle.vision.dataset, test=document (#30414)
4 years ago
Jacek Czaja dfdb0359ea
- Disabling oneDNN inplace pass (#30588)
4 years ago
guofei 430f8449f1
Fix the error of save_quantized_model (#30583)
4 years ago
TTerror 10271ddfc4
support reduce_max op on kunlun (#30581)
4 years ago
QingshuChen 5013c67644
fix softmax bug for multi_card in kunlun (#30600)
4 years ago
wuhuanzhou 7e671c07b6
optimize unity build (#30195)
4 years ago
liuyuhui e5b0d9e1fc
[Kunlun] Add condition_variable and notify() in BindThreadedSSAGraphExecutor (#30586)
4 years ago
WeiXin ca33821475
Extend timeout of unittest 'test_static_save_load' (#30599)
4 years ago
Zhou Wei 9674e440e2
optimize windows CI, clear tp cache,polish code,improve level of msvc log (#30579)
4 years ago
wanghuancoder 90773473a0
use nvtx push pop in timeline (#30567)
4 years ago
chentianyu03 358106fcb0
make abs op support complex types (#30375)
4 years ago
huangxu96 138620084c
Add fleet amp_init() (#30572)
4 years ago
Wilber 2d5758c456
update. (#30585)
4 years ago
wanghuancoder 27a5c0cff6
fix layers train eval bug (#30580)
4 years ago
lilong12 8126a41d73
fix the bug of all_reduce pipeline gradient multiple times (#30437)
4 years ago
Aurelius84 621bc4f771
[Dy2static]Fix paddle prefix in is_paddle_api (#30569)
4 years ago
Tao Luo 9dd71c74df
disable test_analyzer_detect (#30541)
4 years ago
tangwei12 c9e78a22c5
add trainers for pserver (#30523)
4 years ago
Aurelius84 5067e3a8d2
[Dy2Static]Enhance check of TracedLayers out vars (#30576)
4 years ago
wanghuancoder d1b25ed9d7
add some RecordEvent, for dygraph timeline (#30299)
4 years ago
YUNSHEN XIE bbea5a1fa9
The new unit test cannot have the same name as the existing unit test (#29878)
4 years ago
liym27 ff25c5b36f
Fix bug: GetAttrValue should deal with attr with attrType vector<double> (#30536)
4 years ago
WangXi 572c466d19
[Prepare for MultiProcess xpu] unified gen nccl id, refine imperative reducer (#30455)
4 years ago
ykkk2333 549855ac20
add rmsprop_op_xpu test=kunlun (#30493)
4 years ago
Zhou Wei fb20ec9a4e
fix bug of multicard grad ncclAllReduce (#30553)
4 years ago
Zhen Wang f30d00553a
Fix the compiling error of update_loss_scaling when using cuda9. (#30538)
4 years ago
Leo Chen 81217a94d8
unify calling cudaSetDevice (#30470)
4 years ago
pangyoki 00554b3f6b
fix error message of Inplace strategy (#30520)
4 years ago
QingshuChen d849ecc0ae
update kunlun dependence for aarch64 & sunway platform (#30516)
4 years ago
Leo Chen 7043b8cfc6
support layer_norm fp16 in dygraph amp (#30430)
4 years ago
wuhuanzhou 28eb7b6589
fix logs dir error with auto retry, test=document_fix (#30466)
4 years ago
Zhang Ting 66c514ce83
[2.0 API] device guard (#30307)
4 years ago
WangXi 7a0a576e51
fix adamw lr_to_coeff is fixed when dygraph (#30526)
4 years ago
wanghuancoder 59ad6ff3e3
delete empty line of pybing.cc, test=develop (#30529)
4 years ago
cc ce6777fcdf
Fix bug of supporting channelwise dygraph quantized model, test=develop (#30531)
4 years ago
WeiXin c0fb03a0dc
Supplement PR29988(https://github.com/PaddlePaddle/Paddle/pull/29988) (#30507)
4 years ago
hutuxian 9fec1618d2
Ascend Framework Part3: Ascend Parser (#30391)
4 years ago
hutuxian e207fe6385
Ascend Framework Part2: pybind files (#30410)
4 years ago
hutuxian 40ede12631
Ascend Framework Part1: OP & Wrapper (#30281)
4 years ago
Zhang Ting 34bf8dfc40
avoid calling cast twice (#30527)
4 years ago
gongweibao bdae7ed326
Fix potential port conflicts. (#30508)
4 years ago
liuyuhui 843dc3cdbd
[Kunlun]PR3: add xpu executor, multi xpu card train function optimization (#30317)
4 years ago
QingshuChen 8489d4f76f
optimize batch_norm & pool op for kunlun (#30490)
4 years ago
wanghuancoder bd97192274
if pybind.cc changed, generate total report, test=develop (#30514)
4 years ago
taixiurong 5e5c2827a3
fix range op crash in dygraph xpu place (#30469)
4 years ago
WeiXin 18ecd433f5
Avoid bug on 'MAC python3.5/6'. (#30485)
4 years ago
JZ-LIANG 16ba0abc79
Recompute Offload: fixed bug in memcpy (#30484)
4 years ago
lijianshe02 d8a9ba56ef
fix random seed in nll_loss unittest test=develop (#30468)
4 years ago
cc 5d8d463cf7
Collect weight threshold for lstm op in post_training_quantization (#28701)
4 years ago
guofei 11e78ebaa3
Modify the calculation logic of LambOptimizer (#29313)
4 years ago
Adam Osewski c5ffad126c
[oneDNN] Refactor fuse pass helper functions to one place. (#30460)
4 years ago
LielinJiang 1d7bf1de2b
Update voc dataset url (#30450)
4 years ago
Zhang Ting c9a334e1b3
add VecCastCUDAKernel (#30296)
4 years ago
pangyoki 13d757362c
Add Inplace strategy (Output reuse Input Varbase) in dygraph (#30103)
4 years ago
Yang Zhang 008b0a8b56
Fix float64 bug in layer norm (#30452)
4 years ago

@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License
cmake_minimum_required(VERSION 3.10)
cmake_minimum_required(VERSION 3.15)
cmake_policy(VERSION 3.10)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
@@ -31,13 +32,12 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
"You can use cmake 3.16 (recommended), 3.10, 3.11, 3.15 or 3.17. Please refer to the install document: https://cmake.org/install/")
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
endif()
if(WITH_GPU AND NOT APPLE)
@@ -57,18 +57,31 @@ if(WITH_MUSL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
endif()
if(WIN32)
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj")
if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /Zc:inline")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zc:inline")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline")
endif()
if (MSVC_STATIC_CRT)
message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
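The hunk above gates the static C runtime behind the MSVC_STATIC_CRT option (default ON) and adds /Zc:inline when the Ninja generator is used. A minimal configure sketch exercising both branches; the generator choice and out-of-source build directory are assumptions for illustration, not part of the diff:

```
# Sketch: Windows configure with Ninja, keeping the default static CRT.
# This routes through the /Zc:inline block and the /MTd, /MT flag branches above.
cmake .. -GNinja -DMSVC_STATIC_CRT=ON
```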
@@ -79,7 +92,9 @@ if(WIN32)
endif()
endforeach(flag_var)
endif()
# NOTE(Avin0323): Less parallel count result in faster compilation.
math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
# windows build turn off warnings, use parallel compiling.
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
@@ -87,13 +102,7 @@ if(WIN32)
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
# NOTE(Avin0323): Less parallel count result in faster compilation with
# Unity Build on GPU.
if(WITH_UNITY_BUILD AND WITH_GPU)
set(${flag_var} "${${flag_var}} /MP8")
else()
set(${flag_var} "${${flag_var}} /MP")
endif()
set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
endforeach(flag_var)
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
set(${flag_var} "${${flag_var}} /w")
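As a worked example of the new parallel-compile setting (core count assumed, not from the diff): on a 12-core builder, PROCESS_MAX = 12 * 2 / 3 = 8, so every flag set gains /MP8 in place of the earlier unbounded /MP (or the fixed /MP8 previously used only with Unity Build on GPU).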
@@ -111,6 +120,10 @@ if(WIN32)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838")
foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS)
set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221")
endforeach(flag_var)
if (WITH_WIN_DUMP_DBG)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi")
@@ -148,8 +161,9 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF)
################################ Internal Configurations #######################################
option(WITH_ROCM_PLATFORM "Compile PaddlePaddle with ROCM platform" OFF)
option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF)
option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
option(WITH_RCCL "Compile PaddlePaddle with RCCL support" OFF)
option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(WITH_INCREMENTAL_COVERAGE "Generate coverage reports only for incremental code" OFF)
@@ -278,19 +292,25 @@ include(configure) # add paddle env configuration
include_directories("${PADDLE_SOURCE_DIR}")
if(NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
else()
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
if(WITH_ROCM)
include(hip)
endif(WITH_ROCM)
if (NOT WITH_ROCM AND WITH_RCCL)
MESSAGE(WARNING
"Disable RCCL when compiling without GPU. Force WITH_RCCL=OFF.")
set(WITH_NCCL OFF CACHE STRING
"Disable RCCL when compiling without GPU" FORCE)
endif()
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
if(WITH_ROCM_PLATFORM)
find_package(HIP)
include(hip)
endif(WITH_ROCM_PLATFORM)
if(WITH_RCCL)
add_definitions("-DPADDLE_WITH_RCCL")
include(rccl)
else()
if(WITH_ROCM)
MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.")
endif()
endif()
if(WITH_NV_JETSON)
set(WITH_ARM ON CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE)
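These hunks replace WITH_ROCM_PLATFORM and the explicit find_package(HIP) with a single include(hip) under WITH_ROCM, and warn that multi-card runs additionally need WITH_RCCL. A hedged configure sketch using only the option names from the diff (everything else is assumed):

```
# Sketch: ROCm build with RCCL enabled for multi-card training.
# ROCM_PATH defaults to /opt/rocm when the env var is unset (per the diff).
cmake .. -DWITH_ROCM=ON -DWITH_RCCL=ON
```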
@@ -323,6 +343,8 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
add_definitions(-DPADDLE_DLL_EXPORT)
if(ON_INFER)
# you can turn off the paddle fluid and inference lib by setting ON_INFER=OFF
message(STATUS "On inference mode, will take place some specific optimization.")

@@ -8,8 +8,8 @@
English | [简体中文](./README_cn.md)
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.8/beginners_guide/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/index_cn.html)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://paddlepaddle.org.cn/documentation/docs/en/guides/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -22,23 +22,21 @@ PaddlePaddle is originated from industrial practices with dedication and commitm
## Installation
### Latest PaddlePaddle Release: [v1.8](https://github.com/PaddlePaddle/Paddle/tree/release/1.8)
### Latest PaddlePaddle Release: [v2.0](https://github.com/PaddlePaddle/Paddle/tree/release/2.0)
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.
### Install Latest Stable Release:
```
# Linux CPU
# CPU
pip install paddlepaddle
# Linux GPU cuda10cudnn7
# GPU
pip install paddlepaddle-gpu
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu==1.8.5.post97
```
It is recommended to read [this doc](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/install/index_en.html) on our website.
For more information about installation, please see [Quick Install](https://www.paddlepaddle.org.cn/install/quick)
Now our developers can acquire Tesla V100 online computing resources for free. If you create a program by AI Studio, you will obtain 12 hours to train models online per day. If you can insist on that for five consecutive days, then you will receive an extra 48 hours. [Click here to start](http://ai.baidu.com/support/news?action=detail&id=981).
Now our developers can acquire Tesla V100 online computing resources for free. If you create a program on AI Studio, you will obtain 10 hours per day to train models online. [Click here to start](https://aistudio.baidu.com/aistudio/index).
## FOUR LEADING TECHNOLOGIES
@@ -67,38 +65,30 @@ Now our developers can acquire Tesla V100 online computing resources for free. I
## Documentation
We provide [English](http://www.paddlepaddle.org.cn/documentation/docs/en/1.8/beginners_guide/index_en.html) and
[Chinese](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/index_cn.html) documentation.
We provide [English](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/index_en.html) and
[Chinese](https://www.paddlepaddle.org.cn/documentation/docs/zh/guide/index_cn.html) documentation.
- [Basic Deep Learning Models](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)
- [Guides](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/index_en.html)
You might want to start by learning how to implement deep learning basics with PaddlePaddle.
- [User Guides](https://www.paddlepaddle.org.cn/documentation/docs/en/user_guides/index_en.html)
Once you have got the hang of the Beginners' Guide, you may wish to model practical problems and build your own original networks.
- [Advanced User Guides](https://www.paddlepaddle.org.cn/documentation/docs/en/advanced_guide/index_en.html)
- [Practice](https://www.paddlepaddle.org.cn/documentation/docs/zh/tutorial/index_cn.html)
By now you should be familiar with Fluid, and the next step is to build a more efficient model or invent your own Operator.
- [API Reference](https://www.paddlepaddle.org.cn/documentation/docs/en/api/index_en.html)
Our new API enables much shorter programs.
- [How to Contribute](https://www.paddlepaddle.org.cn/documentation/docs/en/advanced_guide/addon_development/contribute_code/index_en.html)
- [How to Contribute](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/08_contribution/index_en.html)
We appreciate your contributions!
## Communication
- [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc.
- QQ discussion group: 796771754 (PaddlePaddle).
- [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
- QQ discussion group: 778260830 (PaddlePaddle).
- [Forums](https://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
## Copyright and License
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).

@@ -8,8 +8,8 @@
[English](./README.md) | 简体中文
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.8/beginners_guide/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/index_cn.html)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://paddlepaddle.org.cn/documentation/docs/en/guides/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -19,23 +19,20 @@
## 安装
### PaddlePaddle最新版本: [v1.8](https://github.com/PaddlePaddle/Paddle/tree/release/1.8)
### PaddlePaddle最新版本: [v2.0](https://github.com/PaddlePaddle/Paddle/tree/release/2.0)
跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
### 安装最新稳定版本:
```
# Linux CPU
# CPU
pip install paddlepaddle
# Linux GPU cuda10cudnn7
# GPU
pip install paddlepaddle-gpu
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu==1.8.5.post97
```
更多安装信息详见官网 [安装说明](https://www.paddlepaddle.org.cn/install/quick)
PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型更高效。**每日登陆即送12小时**,**连续五天运行再加送48小时**,[前往使用免费算力](https://ai.baidu.com/support/news?action=detail&id=981)。
PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型更高效。**每日登陆即送10小时**,[前往使用免费算力](https://aistudio.baidu.com/aistudio/index)。
## 四大领先技术
@@ -64,38 +61,30 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型
## 文档
我们提供 [英文](http://www.paddlepaddle.org.cn/documentation/docs/en/1.8/beginners_guide/index_en.html) 和
[中文](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/index_cn.html) 文档
我们提供 [英文](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/index_en.html) 和
[中文](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html) 文档
- [深度学习基础教程](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/index_cn.html)
- [使用指南](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html)
或许您想从深度学习基础开始学习飞桨
- [应用实践](https://www.paddlepaddle.org.cn/documentation/docs/zh/tutorial/index_cn.html)
- [典型案例](https://www.paddlepaddle.org.cn/documentation/docs/zh/user_guides/index_cn.html)
或许您已经掌握了新手入门阶段的内容,期望可以针对实际问题建模、搭建自己网络
- [进阶指南](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/index_cn.html)
或许您已比较熟练使用PaddlePaddle来完成常规任务,期望获得更高效的模型或者定义自己的Operator
- [API Reference](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/index_cn.html)
- [API Reference](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/index_cn.html)
新的API支持代码更少更简洁的程序
- [贡献方式](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/addon_development/contribute_code/index_cn.html)
- [贡献方式](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/08_contribution/index_cn.html)
欢迎您的贡献!
## 交流与反馈
- 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议
- QQ群: 796771754 (PaddlePaddle)
- [论坛](http://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围
- QQ群: 778260830 (PaddlePaddle)
- [论坛](https://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围
## 版权和许可证
PaddlePaddle由[Apache-2.0 license](LICENSE)提供

@@ -78,6 +78,10 @@ if(WITH_BOX_PS)
add_definitions(-DPADDLE_WITH_BOX_PS)
endif()
if(WITH_ASCEND)
add_definitions(-DPADDLE_WITH_ASCEND)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)
@@ -126,14 +130,10 @@ if(WITH_GPU)
endif()
include_directories(${TENSORRT_INCLUDE_DIR})
endif()
elseif(WITH_ROCM_PLATFORM)
elseif(WITH_ROCM)
add_definitions(-DPADDLE_WITH_HIP)
add_definitions(-DEIGEN_USE_GPU)
add_definitions(-DEIGEN_USE_HIP)
add_definitions(-D__HIP_PLATFORM_HCC__)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP")
set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP)
else()
add_definitions(-DHPPL_STUB_FUNC)
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)

@@ -63,7 +63,10 @@ endfunction()
if(WITH_COVERAGE)
if (WITH_INCREMENTAL_COVERAGE)
if (NOT ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL ""))
# if *.h changed, generate coverage report totally.
# if pybind.cc changed, generate coverage report totally.
# Because if only pybind.cc gets '-g -O0 -fprofile-arcs -ftest-coverage', some testcases will fail.
if ( (NOT ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL "")) OR ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" MATCHES "pybind.cc") )
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
endif()
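For illustration, the incremental-coverage path above is driven entirely by environment variables plus two CMake options; a hedged sketch of how CI might trigger it (the file path is illustrative):
```
# Changed files are exported by CI before configuring
export PADDLE_GIT_DIFF_H_FILE=""
export PADDLE_GIT_DIFF_CC_FILE="paddle/fluid/pybind/pybind.cc"
# Coverage flags are then injected by the logic above
cmake .. -DWITH_COVERAGE=ON -DWITH_INCREMENTAL_COVERAGE=ON
```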

@ -74,7 +74,7 @@ endfunction()
# select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable)
# List of arch names
set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "All" "Manual")
set(archs_name_default "Auto")
list(APPEND archs_names "Auto")
@ -91,7 +91,7 @@ function(select_nvcc_arch_flags out_variable)
if(${CUDA_ARCH_NAME} STREQUAL "Manual")
set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
set(CUDA_ARCH_PTX "" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
else()
unset(CUDA_ARCH_BIN CACHE)
@ -108,6 +108,8 @@ function(select_nvcc_arch_flags out_variable)
set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
set(cuda_arch_bin "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
set(cuda_arch_bin "80")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@ -175,14 +177,22 @@ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) # CUDA 9.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.2) # CUDA 11.0/11.1
set(paddle_known_gpu_archs ${paddle_known_gpu_archs11})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+
set(paddle_known_gpu_archs "${paddle_known_gpu_archs11} 86")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
endif()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
@ -198,14 +208,11 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}")
message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}")
# Set C++11 support
# Set C++14 support
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
if (NOT WIN32) # Windows MSVC 2015 supports C++11 natively.
# -std=c++11 -fPIC is not recognized by msvc; -Xcompiler will be added by cmake.
set(CMAKE_CUDA_STANDARD 11)
endif(NOT WIN32)
set(CMAKE_CUDA_STANDARD 14)
# (Note) For Windows, if /W[1-4] is deleted, /W1 is added by default and conflicts with -w
# So replace /W[1-4] with /W0
@ -216,6 +223,8 @@ endif(WIN32)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
# Set :expt-extended-lambda to enable HOSTDEVICE annotation on lambdas
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
if(WIN32)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"/wd4244 /wd4267 /wd4819 \"")

@ -94,7 +94,7 @@ macro(find_cudnn_version cudnn_header_file)
"${CUDNN_MAJOR_VERSION} * 1000 +
${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
message(STATUS "Current cuDNN header is ${cudnn_header_file} "
"Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ")
"Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. ")
endif()
endif()
endmacro()

@ -0,0 +1,61 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
SET(ASCEND_PROJECT "extern_ascend")
IF((NOT DEFINED ASCEND_VER) OR (NOT DEFINED ASCEND_URL))
MESSAGE(STATUS "use pre defined download url")
SET(ASCEND_VER "0.1.1" CACHE STRING "" FORCE)
SET(ASCEND_NAME "ascend" CACHE STRING "" FORCE)
SET(ASCEND_URL "http://paddle-ascend.bj.bcebos.com/ascend.tar.gz" CACHE STRING "" FORCE)
ENDIF()
MESSAGE(STATUS "ASCEND_NAME: ${ASCEND_NAME}, ASCEND_URL: ${ASCEND_URL}")
SET(ASCEND_SOURCE_DIR "${THIRD_PARTY_PATH}/ascend")
SET(ASCEND_DOWNLOAD_DIR "${ASCEND_SOURCE_DIR}/src/${ASCEND_PROJECT}")
SET(ASCEND_DST_DIR "ascend")
SET(ASCEND_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(ASCEND_INSTALL_DIR ${ASCEND_INSTALL_ROOT}/${ASCEND_DST_DIR})
SET(ASCEND_ROOT ${ASCEND_INSTALL_DIR})
SET(ASCEND_INC_DIR ${ASCEND_ROOT}/include)
SET(ASCEND_LIB_DIR ${ASCEND_ROOT}/lib)
SET(ASCEND_LIB ${ASCEND_LIB_DIR}/libge_runner.so)
SET(ASCEND_GRAPH_LIB ${ASCEND_LIB_DIR}/libgraph.so)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ASCEND_ROOT}/lib")
INCLUDE_DIRECTORIES(${ASCEND_INC_DIR})
FILE(WRITE ${ASCEND_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(ASCEND)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${ASCEND_NAME}/include ${ASCEND_NAME}/lib \n"
" DESTINATION ${ASCEND_DST_DIR})\n")
ExternalProject_Add(
${ASCEND_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${ASCEND_SOURCE_DIR}
DOWNLOAD_DIR ${ASCEND_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${ASCEND_URL} -c -q -O ${ASCEND_NAME}.tar.gz
&& tar zxvf ${ASCEND_NAME}.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ASCEND_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ASCEND_INSTALL_ROOT}
)
ADD_LIBRARY(ascend SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend PROPERTY IMPORTED_LOCATION ${ASCEND_LIB})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${ASCEND_GRAPH_LIB})
ADD_DEPENDENCIES(ascend ascend_graph ${ASCEND_PROJECT})
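A configure line exercising this new dependency might look like the following sketch; note the guard above forces the default URL unless both ASCEND_VER and ASCEND_URL are supplied:
```
# Default: download the prebuilt Ascend package
cmake .. -DWITH_ASCEND=ON
# Override the download, supplying both variables together
cmake .. -DWITH_ASCEND=ON -DASCEND_VER=0.1.1 -DASCEND_URL=http://paddle-ascend.bj.bcebos.com/ascend.tar.gz
```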

@ -22,6 +22,13 @@ SET(CRYPTOPP_TAG CRYPTOPP_8_2_0)
IF(WIN32)
SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE)
# There is a compilation parameter 'FI\"winapifamily.h\"' that can't be used correctly
# with Ninja on Windows. The only difference between the patch file and the original
# file is that the compilation parameter is changed to 'FIwinapifamily.h'. This
# patch command can be removed when upgrading to a newer version.
if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
set(CRYPTOPP_PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different "${PADDLE_SOURCE_DIR}/patches/cryptopp/CMakeLists.txt" "<SOURCE_DIR>/")
endif()
ELSE(WIN32)
SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE)
ENDIF(WIN32)
@ -53,11 +60,13 @@ ExternalProject_Add(
"${CRYPTOPP_DOWNLOAD_CMD}"
PREFIX ${CRYPTOPP_PREFIX_DIR}
SOURCE_DIR ${CRYPTOPP_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND
COMMAND ${CMAKE_COMMAND} -E remove_directory "<SOURCE_DIR>/cmake/"
COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "<SOURCE_DIR>/cmake"
COMMAND cd "<SOURCE_DIR>/cmake" && git checkout tags/${CRYPTOPP_TAG} -b ${CRYPTOPP_TAG}
COMMAND ${CMAKE_COMMAND} -E copy_directory "<SOURCE_DIR>/cmake/" "<SOURCE_DIR>/"
COMMAND ${CRYPTOPP_PATCH_COMMAND}
INSTALL_DIR ${CRYPTOPP_INSTALL_DIR}
CMAKE_ARGS ${CRYPTOPP_CMAKE_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CRYPTOPP_INSTALL_DIR}

@ -20,28 +20,28 @@ set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3)
set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git)
set(EIGEN_TAG 4da2c6b1974827b1999bab652a3d4703e1992d26)
# recent versions of eigen cause a compilation error on windows
if(WIN32)
set(EIGEN_REPOSITORY ${GIT_URL}/eigenteam/eigen-git-mirror.git)
set(EIGEN_TAG 917060c364181f33a735dc023818d5a54f60e54c)
endif()
# eigen on cuda9.1 is missing the header math_functions.hpp
# https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
if(WITH_ROCM_PLATFORM)
set(EIGEN_REPOSITORY ${GIT_URL}/sabreshao/hipeigen.git)
set(EIGEN_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e)
endif()
cache_third_party(extern_eigen3
REPOSITORY ${EIGEN_REPOSITORY}
TAG ${EIGEN_TAG}
DIR EIGEN_SOURCE_DIR)
if(WIN32)
add_definitions(-DEIGEN_STRONG_INLINE=inline)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst)
set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y)
# For Windows,
# the build will hit a compilation error in Tensor:74:
# "can not open file 'unistd.h'",
# so use the following patch to solve the compilation error on Windows.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Tensor native_src2)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/Tensor native_dst2)
# For VS2015,
# the build will hit a compilation error in TensorBlock.h:1028:
# "syntax error",
# so use the following patch to solve the compilation error on Windows.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorBlock.h native_src3)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h native_dst3)
set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y && copy ${native_src2} ${native_dst2} /Y && copy ${native_src3} ${native_dst3} /Y)
elseif(LINUX)
# For gxx=4.8, __GXX_ABI_VERSION is less than 1004
# which will cause a compilation error in Geometry_SSE.h:38:
@ -56,43 +56,38 @@ elseif(LINUX)
# add patch to avoid compilation error in c++11
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/MathFunctions.h native_src2)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/MathFunctions.h native_dst2)
set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2})
if(WITH_ROCM)
# For HIPCC, Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC,
# which causes a compiler error from using a __host__ function in __host__ __device__ code
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src3)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst3)
# For HIPCC, Eigen::internal::scalar_sum_op<bool,bool> is not EIGEN_DEVICE_FUNC,
# which causes a compiler error from using a __host__ function in __host__ __device__ code
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/BinaryFunctors.h native_src4)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/functors/BinaryFunctors.h native_dst4)
set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2} && cp ${native_src3} ${native_dst3} && cp ${native_src4} ${native_dst4})
else()
set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2})
endif()
endif()
set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR})
INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
if(WITH_AMD_GPU)
ExternalProject_Add(
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
"${EIGEN_DOWNLOAD_CMD}"
PREFIX ${EIGEN_PREFIX_DIR}
SOURCE_DIR ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND ${EIGEN_PATCH_COMMAND}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
else()
ExternalProject_Add(
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
"${EIGEN_DOWNLOAD_CMD}"
PREFIX ${EIGEN_PREFIX_DIR}
SOURCE_DIR ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND ${EIGEN_PATCH_COMMAND}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
endif()
ExternalProject_Add(
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
"${EIGEN_DOWNLOAD_CMD}"
PREFIX ${EIGEN_PREFIX_DIR}
SOURCE_DIR ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND ${EIGEN_PATCH_COMMAND}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
add_library(eigen3 INTERFACE)

@ -30,8 +30,6 @@ ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
set(GFLAGS_NAMESPACE "paddle_gflags")
cache_third_party(extern_gflags
REPOSITORY ${GFLAGS_REPOSITORY}
TAG ${GFLAGS_TAG}
@ -59,7 +57,6 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DGFLAGS_NAMESPACE=${GFLAGS_NAMESPACE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON

@ -58,6 +58,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
-DLITE_BUILD_EXTRA=ON
-DLITE_WITH_XPU=${LITE_WITH_XPU}
-DXPU_SDK_ROOT=${XPU_SDK_ROOT}
-DLITE_WITH_CODE_META_INFO=OFF
-DLITE_WITH_ARM=ON)
ExternalProject_Add(
${LITE_PROJECT}
@ -99,6 +100,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME}
-DLITE_WITH_XPU=${LITE_WITH_XPU}
-DXPU_SDK_ROOT=${XPU_SDK_ROOT}
-DLITE_WITH_CODE_META_INFO=OFF
-DLITE_WITH_ARM=OFF)
ExternalProject_Add(

@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git)
SET(MKLDNN_TAG a18f78f1f058437e9efee403655d671633360f98)
SET(MKLDNN_TAG 72efa005effb49595933e033cc732f215ef0445a)
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR

@ -250,5 +250,8 @@ IF(NOT PROTOBUF_FOUND)
SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE}
CACHE FILEPATH "protobuf executable." FORCE)
# `EXTERN_PROTOBUF_DEPEND` is used in the cmake function `proto_library` to ensure
# `protoc.exe` exists before it is called.
set(EXTERN_PROTOBUF_DEPEND extern_protobuf)
PROMPT_PROTOBUF_LIB(extern_protobuf)
ENDIF(NOT PROTOBUF_FOUND)

@ -14,11 +14,15 @@
INCLUDE(ExternalProject)
IF(WITH_ROCM)
add_definitions(-DWARPCTC_WITH_HIP)
ENDIF()
SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8)
set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
@ -49,14 +53,15 @@ ExternalProject_Add(
BUILD_ALWAYS 1
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=$<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>
-DCMAKE_C_FLAGS_DEBUG=$<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>
-DCMAKE_C_FLAGS_RELEASE=$<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>
-DCMAKE_CXX_FLAGS=$<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>
-DCMAKE_CXX_FLAGS_RELEASE=$<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>
-DCMAKE_CXX_FLAGS_DEBUG=$<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-DWITH_GPU=${WITH_GPU}
-DWITH_ROCM=${WITH_ROCM}
-DWITH_OMP=${USE_OMP}
-DWITH_TORCH=OFF
-DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON

@ -20,7 +20,7 @@ SET(XBYAK_SOURCE_DIR ${THIRD_PARTY_PATH}/xbyak/src/extern_xbyak)
set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak)
set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include)
set(XBYAK_REPOSITORY ${GIT_URL}/herumi/xbyak.git)
set(XBYAK_TAG v5.661) # Jul 26th
set(XBYAK_TAG v5.81) # Dec 19, 2019
include_directories(${XBYAK_INC_DIR})
include_directories(${XBYAK_INC_DIR}/xbyak)
@ -44,7 +44,7 @@ ExternalProject_Add(
DEPENDS ""
PREFIX ${XBYAK_PREFIX_DIR}
SOURCE_DIR ${XBYAK_SOURCE_DIR}
UPDATE_COMMAND ""
# UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
)

@ -4,49 +4,55 @@ endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
if (WITH_AARCH64)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/aarch64/xpu_2020_1229.tar.gz" CACHE STRING "" FORCE)
elseif(WITH_SUNWAY)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2020_1227.tar.gz" CACHE STRING "" FORCE)
else()
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
endif()
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
SET(XPU_API_LIB_NAME "libxpuapi.so")
SET(XPU_RT_LIB_NAME "libxpurt.so")
SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
if(NOT XPU_SDK_ROOT)
if (WITH_AARCH64)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/aarch64/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
elseif(WITH_SUNWAY)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
else()
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_03_30.tar.gz" CACHE STRING "" FORCE)
endif()
INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(XPU)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY xpu/include xpu/lib \n"
" DESTINATION ${XPU_INSTALL_DIR})\n")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
ExternalProject_Add(
${XPU_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${XPU_SOURCE_DIR}
DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
&& tar xvf xpu.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
)
FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(XPU)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY xpu/include xpu/lib \n"
" DESTINATION ${XPU_INSTALL_DIR})\n")
ExternalProject_Add(
${XPU_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${XPU_SOURCE_DIR}
DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
&& tar xvf xpu.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
)
else()
SET(XPU_API_INC_DIR "${XPU_SDK_ROOT}/XTDK/include/")
SET(XPU_API_LIB "${XPU_SDK_ROOT}/XTDK/shlib/libxpuapi.so")
SET(XPU_RT_LIB "${XPU_SDK_ROOT}/XTDK/runtime/shlib/libxpurt.so")
SET(XPU_LIB_DIR "${XPU_SDK_ROOT}/XTDK/shlib/")
endif()
INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
@ -69,4 +75,14 @@ else(WITH_XPU_BKCL)
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} )
endif(WITH_XPU_BKCL)
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
if(NOT XPU_SDK_ROOT)
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
else()
ADD_CUSTOM_TARGET(extern_xpu DEPENDS xpulib)
endif()
# Ensure that xpu/api.h can be included without dependency errors.
file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "")
add_library(xpu_headers_dummy STATIC ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc)
add_dependencies(xpu_headers_dummy extern_xpu)
link_libraries(xpu_headers_dummy)
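Both branches of this file can be selected from the configure line; a sketch (the local SDK path is illustrative):
```
# Default: download the prebuilt XPU toolkit for the current platform
cmake .. -DWITH_XPU=ON
# Alternative: reuse a local XTDK installation instead of downloading
cmake .. -DWITH_XPU=ON -DXPU_SDK_ROOT=/opt/xtdk
```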

@ -4,10 +4,10 @@ include(CheckCCompilerFlag)
include(CheckCXXSymbolExists)
include(CheckTypeSize)
function(CheckCompilerCXX11Flag)
function(CheckCompilerCXX14Flag)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4)
message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.")
elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2)
message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2")
endif()
@ -20,23 +20,15 @@ function(CheckCompilerCXX11Flag)
message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.")
endif()
else()
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4)
message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.")
endif()
endif()
endif()
endfunction()
CheckCompilerCXX11Flag()
if (WITH_GPU)
if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
endif()
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
endif()
CheckCompilerCXX14Flag()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
# safe_set_flag
#
# Set a compile flag only if compiler is support
@ -155,7 +147,7 @@ set(COMMON_FLAGS
)
if(NOT APPLE)
if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM_PLATFORM AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 7.3))
if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM))
set(COMMON_FLAGS
${COMMON_FLAGS}
-Wno-format-truncation # Warning in boost gcc 8.2
@ -213,5 +205,17 @@ foreach(flag ${GPU_COMMON_FLAGS})
safe_set_nvflag(${flag})
endforeach()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
if(WITH_GPU)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
endif()
if(WITH_ROCM)
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
endif()
# Disable -Werror, otherwise the compile will fail for rocblas_gemm_ex
if(WITH_ROCM)
string (REPLACE "-Werror" "-Wno-error" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
string (REPLACE "-Werror" "-Wno-error" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
endif()
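Since WITH_ROCM_PLATFORM is renamed to WITH_ROCM throughout this change, a minimal ROCm configure now reads (a sketch; RCCL is optional):
```
# HIP/ROCm build; WITH_RCCL additionally enables multi-GPU communication
cmake .. -DWITH_ROCM=ON -DWITH_RCCL=ON
```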

@ -260,15 +260,16 @@ function(merge_static_libs TARGET_NAME)
# msvc puts the library under "/Release/xxxlib" by default
# COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}"
COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib${TARGET_NAME}.lib ${libfiles}
COMMAND cmake -E make_directory $<TARGET_FILE_DIR:${TARGET_NAME}>
COMMAND lib /OUT:$<TARGET_FILE:${TARGET_NAME}> ${libfiles}
)
endif(WIN32)
endfunction(merge_static_libs)
function(check_coverage_opt TARGET_NAME SRCS)
if(WITH_COVERAGE AND WITH_INCREMENTAL_COVERAGE)
if ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL "")
# Because if only pybind.cc gets '-g -O0 -fprofile-arcs -ftest-coverage', some test cases will fail.
if ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL "" AND (NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" MATCHES "pybind.cc")))
if (NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" STREQUAL ""))
string(REPLACE "," ";" CC_FILE_LIST $ENV{PADDLE_GIT_DIFF_CC_FILE})
set(use_coverage_opt FALSE)
@ -381,6 +382,9 @@ function(cc_binary TARGET_NAME)
endif()
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
if(WITH_ROCM)
target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
endif()
check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS})
@ -402,6 +406,9 @@ function(cc_test_build TARGET_NAME)
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
common_link(${TARGET_NAME})
if(WITH_ROCM)
target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
endif()
endif()
check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS})
@ -485,7 +492,7 @@ function(nv_library TARGET_NAME)
message(FATAL "Please specify source file or library in nv_library.")
endif()
endif(nv_library_SRCS)
if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
endif()
endif()
@ -503,7 +510,7 @@ function(nv_binary TARGET_NAME)
add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
common_link(${TARGET_NAME})
endif()
if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
endif()
endif()
@ -530,40 +537,31 @@ function(nv_test TARGET_NAME)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
endif()
endif()
endfunction(nv_test)
function(hip_library TARGET_NAME)
if (WITH_ROCM_PLATFORM)
if (WITH_ROCM)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(_sources ${hip_library_SRCS})
set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
if(_source_files)
list(REMOVE_ITEM _sources ${_source_files})
endif()
if(hip_library_SRCS)
# FindHIP.cmake defines hip_add_library; HIP_SOURCE_PROPERTY_FORMAT is required if no .cu files are found
if(NOT ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators")
set_source_files_properties(${hip_library_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
endif()
if (hip_library_SHARED OR hip_library_shared) # build *.so
add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
hip_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS})
else()
add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(${TARGET_NAME} ${ROCM_PATH}/hip/lib/libhip_hcc.so)
hip_add_library(${TARGET_NAME} STATIC ${hip_library_SRCS})
find_fluid_modules(${TARGET_NAME})
endif()
if("${hip_library_DEPS}" MATCHES "ARCHIVE_START")
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
# WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END)
else()
if (hip_library_DEPS)
add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
endif()
# cpplint code style
@ -573,72 +571,27 @@ function(hip_library TARGET_NAME)
list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
check_coverage_opt(${TARGET_NAME} ${hip_library_SRCS})
else(hip_library_SRCS)
if (hip_library_DEPS)
merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
list(REMOVE_DUPLICATES hip_library_DEPS)
generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:hip_library")
target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
else()
message(FATAL "Please specify source file or library in nv_library.")
message(FATAL "Please specify source file or library in hip_library.")
endif()
endif(hip_library_SRCS)
endif()
endfunction(hip_library)
function(hip_library_ops TARGET_NAME)
if (WITH_ROCM_PLATFORM)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(hip_library_ops "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(_sources ${hip_library_ops_SRCS})
HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
if(_source_files)
list(REMOVE_ITEM _sources ${_source_files})
endif()
if(hip_library_ops_SRCS)
if (hip_library_ops_SHARED OR hip_library_ops_shared) # build *.so
add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
else()
add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(${TARGET_NAME} ${ROCM_PATH}/hip/lib/libhip_hcc.so)
find_fluid_modules(${TARGET_NAME})
endif()
if("${hip_library_ops_DEPS}" MATCHES "ARCHIVE_START")
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
# WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
target_circle_link_libraries(${TARGET_NAME} ${hip_library_ops_DEPS})
list(REMOVE_ITEM hip_library_ops_DEPS ARCHIVE_START ARCHIVE_END)
else()
target_link_libraries(${TARGET_NAME} ${hip_library_ops_DEPS})
endif()
# cpplint code style
foreach(source_file ${hip_library_ops_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND hip_library_ops_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
else(hip_library_ops_SRCS)
if (hip_library_ops_DEPS)
merge_static_libs(${TARGET_NAME} ${hip_library_ops_DEPS})
else()
message(FATAL "Please specify source file or library in nv_library.")
endif()
endif(hip_library_ops_SRCS)
endif()
endfunction(hip_library_ops)
function(hip_binary TARGET_NAME)
if (WITH_ROCM_PLATFORM)
if (WITH_ROCM)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# FindHIP.cmake defines hip_add_executable; HIP_SOURCE_PROPERTY_FORMAT is required for .cc files
hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS})
if(hip_binary_DEPS)
target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
@ -646,34 +599,29 @@ function(hip_binary TARGET_NAME)
common_link(${TARGET_NAME})
endif()
endif()
check_coverage_opt(${TARGET_NAME} ${hip_binary_SRCS})
endfunction(hip_binary)
function(hip_test TARGET_NAME)
if (WITH_ROCM_PLATFORM AND WITH_TESTING)
set(options "")
# The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation
# and execution of tests in CI. `CI_SKIP_CPP_TEST` is set to ON when no files
# other than *.py are modified.
if (WITH_ROCM AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(_sources ${hip_test_SRCS})
set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
if(_source_files)
list(REMOVE_ITEM _sources ${_source_files})
endif()
add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
# FindHIP.cmake defines hip_add_executable; HIP_SOURCE_PROPERTY_FORMAT is required for .cc files
hip_add_executable(${TARGET_NAME} ${hip_test_SRCS})
# "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE
target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt)
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags ${os_dependency_modules})
add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME})
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
check_coverage_opt(${TARGET_NAME} ${hip_test_SRCS})
endfunction(hip_test)
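`CI_SKIP_CPP_TEST` is read from the environment when CMake runs, so skipping C++ tests is a matter of exporting it before configuring; a sketch:
```
# Skip building and running C++ tests (CI sets this when only *.py files changed)
export CI_SKIP_CPP_TEST=ON
cmake .. -DWITH_ROCM=ON -DWITH_TESTING=ON
```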
function(go_library TARGET_NAME)
@ -805,7 +753,8 @@ function(paddle_protobuf_generate_cpp SRCS HDRS)
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-I${CMAKE_CURRENT_SOURCE_DIR}
--cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL}
DEPENDS ${ABS_FIL} protoc
# Set `EXTERN_PROTOBUF_DEPEND` only if need to compile `protoc.exe`.
DEPENDS ${ABS_FIL} ${EXTERN_PROTOBUF_DEPEND}
COMMENT "Running C++ protocol buffer compiler on ${FIL}"
VERBATIM )
endforeach()
@ -854,8 +803,7 @@ function(py_test TARGET_NAME)
else()
add_test(NAME ${TARGET_NAME}
COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
FLAGS_cpu_deterministic=true
PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
FLAGS_cpu_deterministic=true ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()

@ -1,104 +1,87 @@
if(NOT WITH_ROCM_PLATFORM)
if(NOT WITH_ROCM)
return()
endif()
include_directories("${ROCM_PATH}/include")
include_directories("${ROCM_PATH}/hip/include")
include_directories("${ROCM_PATH}/miopen/include")
include_directories("${ROCM_PATH}/hipblas/include")
include_directories("${ROCM_PATH}/rocblas/include")
include_directories("${ROCM_PATH}/hiprand/include")
include_directories("${ROCM_PATH}/rocrand/include")
include_directories("${ROCM_PATH}/rccl/include")
include_directories("${ROCM_PATH}/rocthrust/include/")
include_directories("${ROCM_PATH}/hipcub/include/")
include_directories("${ROCM_PATH}/rocprim/include/")
include_directories("${ROCM_PATH}/hipsparse/include/")
include_directories("${ROCM_PATH}/rocsparse/include/")
include_directories("${ROCM_PATH}/rocfft/include/")
set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "")
set(HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS "")
# now default is clang
set(HIP_COMPILER "clang")
list(APPEND EXTERNAL_LIBS "-L${ROCM_PATH}/lib/ -lhip_hcc")
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -DEIGEN_USE_HIP -DEIGEN_USE_GPU -D__HIP_NO_HALF_CONVERSIONS__ -std=c++11 --amdgpu-target=gfx906" )
if(WITH_RCCL)
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_RCCL")
if(NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed")
else()
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed")
endif()
if(NOT WITH_PYTHON)
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_NO_PYTHON")
endif(NOT WITH_PYTHON)
if(WITH_DSO)
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_USE_DSO")
endif(WITH_DSO)
if(WITH_TESTING)
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_TESTING")
endif(WITH_TESTING)
if(WITH_DISTRIBUTE)
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE")
endif(WITH_DISTRIBUTE)
if(WITH_GRPC)
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_GRPC")
endif(WITH_GRPC)
if(WITH_MKLDNN)
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_MKLDNN")
endif(WITH_MKLDNN)
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
find_package(HIP REQUIRED)
include_directories(${ROCM_PATH}/include)
message(STATUS "HIP version: ${HIP_VERSION}")
message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}")
macro(find_package_and_include PACKAGE_NAME)
find_package("${PACKAGE_NAME}" REQUIRED)
include_directories("${ROCM_PATH}/${PACKAGE_NAME}/include")
message(STATUS "${PACKAGE_NAME} version: ${${PACKAGE_NAME}_VERSION}")
endmacro()
find_package_and_include(miopen)
find_package_and_include(rocblas)
find_package_and_include(hiprand)
find_package_and_include(rocrand)
find_package_and_include(rccl)
find_package_and_include(rocthrust)
find_package_and_include(hipcub)
find_package_and_include(rocprim)
find_package_and_include(hipsparse)
find_package_and_include(rocsparse)
find_package_and_include(rocfft)
# set CXX flags for HIP
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP")
set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP)
# define HIP_CXX_FLAGS
list(APPEND HIP_CXX_FLAGS -fPIC)
list(APPEND HIP_CXX_FLAGS -D__HIP_PLATFORM_HCC__=1)
# Note(qili93): HIP has compile conflicts in float16.h, as platform::float16 overloads std::is_floating_point and std::is_integer
list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1)
list(APPEND HIP_CXX_FLAGS -Wno-macro-redefined)
list(APPEND HIP_CXX_FLAGS -Wno-inconsistent-missing-override)
list(APPEND HIP_CXX_FLAGS -Wno-exceptions)
list(APPEND HIP_CXX_FLAGS -Wno-shift-count-negative)
list(APPEND HIP_CXX_FLAGS -Wno-shift-count-overflow)
list(APPEND HIP_CXX_FLAGS -Wno-unused-command-line-argument)
list(APPEND HIP_CXX_FLAGS -Wno-duplicate-decl-specifier)
list(APPEND HIP_CXX_FLAGS -Wno-implicit-int-float-conversion)
list(APPEND HIP_CXX_FLAGS -Wno-pass-failed)
list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
list(APPEND HIP_CXX_FLAGS -std=c++14)
if(CMAKE_BUILD_TYPE MATCHES Debug)
list(APPEND HIP_CXX_FLAGS -g2)
list(APPEND HIP_CXX_FLAGS -O0)
list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling)
endif(CMAKE_BUILD_TYPE MATCHES Debug)
set(HIP_HCC_FLAGS ${HIP_CXX_FLAGS})
set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
# Ask hcc to generate device code during compilation so we can use
# host linker to link.
list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc)
list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906)
list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906)
if(HIP_COMPILER STREQUAL clang)
set(hip_library_name amdhip64)
else()
set(hip_library_name hip_hcc)
endif()
message(STATUS "HIP library name: ${hip_library_name}")
if("${HIP_COMPILER}" STREQUAL "hcc")
if("x${HCC_HOME}" STREQUAL "x")
set(HCC_HOME "${ROCM_PATH}/hcc")
endif()
set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -ldl --amdgpu-target=gfx906 ")
set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared --amdgpu-target=gfx906")
set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared --amdgpu-target=gfx906")
elseif("${HIP_COMPILER}" STREQUAL "clang")
if("x${HIP_CLANG_PATH}" STREQUAL "x")
set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin")
endif()
# The number of parallel jobs defaults to 1
if(NOT DEFINED HIP_CLANG_NUM_PARALLEL_JOBS)
set(HIP_CLANG_NUM_PARALLEL_JOBS 1)
endif()
# Add support for parallel build and link
if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
check_cxx_compiler_flag("-parallel-jobs=1" HIP_CLANG_SUPPORTS_PARALLEL_JOBS)
endif()
if(HIP_CLANG_NUM_PARALLEL_JOBS GREATER 1)
if(${HIP_CLANG_SUPPORTS_PARALLEL_JOBS})
set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "-parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS} -Wno-format-nonliteral")
set(HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS "-parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS}")
else()
message("clang compiler doesn't support parallel jobs")
endif()
endif()
# Set the CMake Flags to use the HIP-Clang Compiler.
set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES> --amdgpu-target=gfx906")
set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <LINK_LIBRARIES> -shared --amdgpu-target=gfx906" )
set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -ldl --amdgpu-target=gfx906")
endif()
# set HIP link libs
find_library(ROCM_HIPRTC_LIB ${hip_library_name} HINTS ${HIP_PATH}/lib)
message(STATUS "ROCM_HIPRTC_LIB: ${ROCM_HIPRTC_LIB}")

@ -137,7 +137,7 @@ function(copy_part_of_thrid_party TARGET DST)
endfunction()
# inference library for only inference
set(inference_lib_deps third_party paddle_fluid paddle_fluid_c paddle_fluid_shared paddle_fluid_c_shared)
set(inference_lib_deps third_party paddle_inference paddle_inference_c paddle_inference_shared paddle_inference_c_shared)
add_custom_target(inference_lib_dist DEPENDS ${inference_lib_deps})
@ -164,20 +164,20 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR})
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
if(WIN32)
if(WITH_STATIC_LIB)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib
${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.*)
set(paddle_inference_lib $<TARGET_FILE_DIR:paddle_inference>/libpaddle_inference.lib
$<TARGET_FILE_DIR:paddle_inference>/paddle_inference.*)
else()
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.dll
${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib)
set(paddle_inference_lib $<TARGET_FILE_DIR:paddle_inference_shared>/paddle_inference.dll
$<TARGET_FILE_DIR:paddle_inference_shared>/paddle_inference.lib)
endif()
copy(inference_lib_dist
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib}
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
else(WIN32)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
set(paddle_inference_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_inference.*)
copy(inference_lib_dist
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib}
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
endif(WIN32)
@ -189,6 +189,19 @@ copy(inference_lib_dist
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include/*
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex64.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
# CAPI inference library for only inference
set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING
"A path setting CAPI paddle inference shared")
@ -196,13 +209,13 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR})
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
if(WIN32)
set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/${CMAKE_BUILD_TYPE}/paddle_fluid_c.*)
set(paddle_inference_c_lib $<TARGET_FILE_DIR:paddle_inference_c>/paddle_inference_c.*)
else(WIN32)
set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_inference_c.*)
endif(WIN32)
copy(inference_lib_dist
SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_fluid_c_lib}
SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_inference_c_lib}
DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib)
# fluid library for both train and inference
@ -213,12 +226,12 @@ set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid")
set(module "inference")
if(WIN32)
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_inference_lib}
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
else()
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_inference_lib}
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
endif()

@ -18,6 +18,10 @@ if(NOT WIN32)
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
else()
# This props file can specify CUDA compile flags manually;
# it is used to remove /Zi and so reduce the GPU static library size. This is risky,
# because NVIDIA may update CUDA, and the flags could then cause errors.
# Currently it is applied for CUDA [10.0, 10.2]
set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
endif()

@ -7,13 +7,16 @@ function(op_library TARGET)
# for ops.
set(cc_srcs)
set(cu_srcs)
set(hip_cu_srcs)
set(miopen_hip_cc_srcs)
set(hip_srcs)
set(cu_cc_srcs)
set(hip_cc_srcs)
set(xpu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(miopen_cu_cc_srcs)
set(cudnn_cu_srcs)
set(miopen_cu_srcs)
set(CUDNN_FILE)
set(MIOPEN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function layer common_infer_shape_functions)
@ -30,46 +33,44 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND cc_srcs ${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
endif()
string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu)
list(APPEND cudnn_cu_srcs ${CUDNN_FILE}.cu)
if(WITH_GPU)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
endif()
string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu)
list(APPEND cudnn_cu_srcs ${CUDNN_FILE}.cu)
endif()
endif()
if(WITH_ROCM_PLATFORM)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu.cc)
list(APPEND hip_cu_cc_srcs ${TARGET}.hip.cu.cc)
if(WITH_ROCM)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND hip_cc_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND hip_srcs ${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.hip.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.hip.cu
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
list(APPEND hip_cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.hip.cu)
list(APPEND hip_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
endif()
string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cu.cc)
list(APPEND miopen_hip_cu_cc_srcs ${MIOPEN_FILE}.hip.cu.cc)
string(REPLACE "_op" "_cudnn_op" MIOPEN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.cu.cc)
list(APPEND miopen_cu_cc_srcs ${MIOPEN_FILE}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cu)
list(APPEND miopen_hip_cu_srcs ${MIOPEN_FILE}.hip.cu)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.cu)
list(APPEND miopen_cu_srcs ${MIOPEN_FILE}.cu)
endif()
endif()
if(WITH_MKLDNN)
@ -86,20 +87,20 @@ function(op_library TARGET)
endif()
else()
foreach(src ${op_library_SRCS})
if (WITH_ROCM_PLATFORM AND ${src} MATCHES ".*\\.hip.cu$")
list(APPEND hip_cu_srcs ${src})
elseif(WITH_ROCM_PLATFORM AND ${src} MATCHES ".*\\.hip.cu.cc$")
list(APPEND hip_cu_cc_srcs ${src})
if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$")
list(APPEND miopen_cu_srcs ${src})
elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu$")
list(APPEND hip_srcs ${src})
elseif(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND miopen_cu_cc_srcs ${src})
elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu.cc$")
list(APPEND hip_cc_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu$")
list(APPEND cudnn_cu_srcs ${src})
elseif (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_ROCM_PLATFORM AND ${src} MATCHES ".*_miopen_op.hip.cc$")
list(APPEND miopen_hip_cc_srcs ${src})
elseif(WITH_ROCM_PLATFORM AND ${src} MATCHES ".*_miopen_op.hip.cu$")
list(APPEND miopen_hip_cu_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
@ -163,8 +164,13 @@ function(op_library TARGET)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
elseif (WITH_ROCM_PLATFORM)
hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
elseif (WITH_ROCM)
list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc")
list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
list(REMOVE_ITEM hip_srcs "correlation_op.cu")
list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
# Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
@ -191,7 +197,7 @@ function(op_library TARGET)
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
"skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
"skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op"
"fused_bn_add_activation_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
@ -227,13 +233,14 @@ function(op_library TARGET)
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH hip_srcs hip_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH hip_cc_srcs hip_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
@ -248,26 +255,26 @@ function(op_library TARGET)
endif()
endif()
# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len)
if (WITH_GPU AND ${cudnn_cu_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
list(LENGTH miopen_hip_cu_cc_srcs miopen_hip_cu_cc_srcs_len)
if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_cc_srcs_len} GREATER 0)
list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
if (WITH_ROCM AND ${miopen_cu_cc_srcs_len} GREATER 0)
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
endif()
# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len)
if (WITH_GPU AND ${cudnn_cu_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
list(LENGTH miopen_hip_cu_srcs miopen_hip_cu_srcs_len)
if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
list(LENGTH miopen_cu_srcs miopen_cu_srcs_len)
if (WITH_ROCM AND ${miopen_cu_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)

@ -15,7 +15,7 @@
<Warning>InheritFromHost</Warning>
<BaseCommandLineTemplate>-ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions]</BaseCommandLineTemplate>
<BuildCommandLineTemplate>--use-local-env --cl-version $(CudaClVersion)</BuildCommandLineTemplate>
<BuildCommandLineTemplate>--use-local-env $(CudaClVersion)</BuildCommandLineTemplate>
<BuildDynamicCommandLineTemplate>[CodeGeneration]</BuildDynamicCommandLineTemplate>
<CleanCommandLineTemplate>-clean</CleanCommandLineTemplate>
<!-- <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] $(CudaForceSynchronousPdbWrites) /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->

@ -0,0 +1,28 @@
if(NOT WITH_ROCM)
return()
endif()
# RCCL is not supported on Windows yet
if(WIN32)
return()
endif()
if(WITH_RCCL)
set(RCCL_ROOT ${ROCM_PATH}/rccl CACHE PATH "RCCL ROOT")
find_path(RCCL_INCLUDE_DIR rccl.h
PATHS ${RCCL_ROOT} ${RCCL_ROOT}/include ${RCCL_ROOT}/local/include
$ENV{RCCL_ROOT} $ENV{RCCL_ROOT}/include $ENV{RCCL_ROOT}/local/include
NO_DEFAULT_PATH
)
file(READ ${RCCL_INCLUDE_DIR}/rccl.h RCCL_VERSION_FILE_CONTENTS)
string(REGEX MATCH "define NCCL_VERSION_CODE +([0-9]+)"
RCCL_VERSION "${RCCL_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define NCCL_VERSION_CODE +([0-9]+)" "\\1"
RCCL_VERSION "${RCCL_VERSION}")
# e.g. 2604 for ROCm 3.5 and 2708 for ROCm 3.9
message(STATUS "Current RCCL header is ${RCCL_INCLUDE_DIR}/rccl.h. "
"Current RCCL version is v${RCCL_VERSION}. ")
endif()
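The version probe above can be mirrored by hand to see which RCCL header the build would pick up (assuming the default ROCM_PATH of /opt/rocm):
```
# Print the NCCL_VERSION_CODE line the regex above extracts from
grep -m1 "define NCCL_VERSION_CODE" /opt/rocm/rccl/include/rccl.h
```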
