@@ -29,8 +29,12 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/transform.h"
 
+#if defined(__NVCC__) || defined(__HIPCC__)
 #ifdef __NVCC__
 #include <cuda.h>
+#elif defined(__HIPCC__)
+#include <hip/hip_runtime.h>
+#endif
 #include <thrust/iterator/iterator_adaptor.h>
 
 #include "paddle/fluid/platform/cuda_device_function.h"
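
Read as plain code rather than a diff, the include block produced by this hunk is shown below; the new outer guard admits both device compilers, and the inner #ifdef/#elif selects the matching runtime header (the outer guard's closing #endif sits further down in the header, outside this hunk):

    #if defined(__NVCC__) || defined(__HIPCC__)  // any GPU device compiler
    #ifdef __NVCC__
    #include <cuda.h>                  // CUDA toolchain (nvcc)
    #elif defined(__HIPCC__)
    #include <hip/hip_runtime.h>       // ROCm/HIP toolchain (hipcc)
    #endif
    #include <thrust/iterator/iterator_adaptor.h>

    #include "paddle/fluid/platform/cuda_device_function.h"
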
@@ -196,7 +200,7 @@ void CommonForwardBroadcastCPU(const framework::Tensor *x,
   }
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename Functor, typename T, typename OutType>
 __global__ void ElementwiseKernel(const T *x, const T *y, OutType *out, int pre,
                                   int n, int post, int total, Functor func) {
@@ -310,7 +314,7 @@ void CommonForwardBroadcastCUDA(
       y_data, out_data, out_size, max_dim, func, is_xsize_larger);
 }
 
-#endif // __NVCC__
+#endif // __NVCC__ or __HIPCC__
 
 template <typename T, typename DX_OP, typename DY_OP>
 void CommonGradBroadcastCPU(
@@ -382,7 +386,7 @@ inline void ComputeBroadcastTranspositionArray(const int *x_one_indexs,
   }
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T, typename DX_OP, typename DY_OP>
 static __global__ void ElemwiseGradBroadcast1CUDAKernel(
     const T *x, const T *y, const T *out, const T *dout, int h, int w,
@@ -1212,7 +1216,7 @@ void CommonGradBroadcastCUDA(
   }
 }
 
-#endif // __NVCC__
+#endif // __NVCC__ or __HIPCC__
 
 inline framework::DDim trim_trailing_singular_dims(
     const framework::DDim &dims) {
@@ -1339,7 +1343,7 @@ class MidWiseTransformIterator<T, platform::CPUDeviceContext>
   int64_t post_;
 };
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T>
 class RowwiseTransformIterator<T, platform::CUDADeviceContext>
     : public thrust::iterator_adaptor<
@@ -1504,10 +1508,10 @@ static void ElemwiseGradBroadcast1CPU(const T *x, const T *y, const T *out,
   }
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 
 template <typename T, typename DX_OP, typename DY_OP>
-static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x,
+static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream, const T *x,
                                        const T *y, const T *out, const T *dout,
                                        int h, int w, bool is_xsize_larger,
                                        DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) {
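
ElemwiseGradBroadcast1CUDA now takes gpuStream_t instead of cudaStream_t, so one signature serves both toolchains. The patch does not show where gpuStream_t is defined; it is presumably an alias supplied by a platform header (the gpu_info.h included above is a plausible home). A minimal sketch of such an alias, with the PADDLE_WITH_HIP/PADDLE_WITH_CUDA macros and the header location being assumptions rather than something this diff shows:

    // Hypothetical platform-neutral stream alias: resolves to the native
    // stream type of whichever GPU runtime the build targets.
    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    using gpuStream_t = hipStream_t;
    #elif defined(PADDLE_WITH_CUDA)
    #include <cuda_runtime.h>
    using gpuStream_t = cudaStream_t;
    #endif
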
@@ -1577,7 +1581,7 @@ static void ElemwiseGradBroadcast2CPU(const T *x, const T *y, const T *out,
   }
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T, typename DX_OP, typename DY_OP>
 static __global__ void ElemwiseGradBroadcast2CUDAKernel(
     const T *x, const T *y, const T *out, const T *dout, int pre, int n,
@@ -1646,7 +1650,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
 }
 
 template <typename T, typename DX_OP, typename DY_OP>
-static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T *x,
+static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream, const T *x,
                                        const T *y, const T *out, const T *dout,
                                        int pre, int n, int post,
                                        bool is_xsize_larger, DX_OP dx_op,
@@ -1686,7 +1690,7 @@ void CommonElementwiseBroadcastBackward(
           << " ydim:" << framework::make_ddim(y_dims_array);
 
   if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     CommonGradBroadcastCUDA<T, DX_OP, DY_OP>(
         x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(),
         out_dims_array.data(), max_dim,
@@ -1769,7 +1773,7 @@ void ElemwiseGradComputeWithBroadcast(
   }
   if (post == 1) {
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       ElemwiseGradBroadcast1CUDA(
           ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
           y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, is_xsize_larger,
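
Every call site patched this way pairs a runtime check (platform::is_gpu_place) with a compile-time guard: this header is compiled by nvcc, by hipcc, and by plain host compilers, and only the first two understand __global__ and the <<< >>> launch syntax, so the GPU branch must disappear from host-only builds. A self-contained illustration of that pattern (generic example code, not Paddle's; the error handling in the host-only branch is an assumption):

    #include <cstdio>

    #if defined(__NVCC__) || defined(__HIPCC__)
    // Device code exists only when a GPU device compiler builds this file.
    __global__ void FillOne(float *out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = 1.0f;
    }
    #endif

    // 'out' must be device-accessible when on_gpu is true.
    void Fill(bool on_gpu, float *out, int n) {
      if (on_gpu) {
    #if defined(__NVCC__) || defined(__HIPCC__)
        FillOne<<<(n + 255) / 256, 256>>>(out, n);  // default stream
    #else
        std::fprintf(stderr, "GPU path requested in a host-only build\n");
    #endif
      } else {
        for (int i = 0; i < n; ++i) out[i] = 1.0f;
      }
    }
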
@@ -1786,7 +1790,7 @@ void ElemwiseGradComputeWithBroadcast(
     }
   } else {
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       ElemwiseGradBroadcast2CUDA(
           ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
           y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post,
@@ -1830,7 +1834,7 @@ void CommonElementwiseBroadcastForward(
                          axis);
 
   if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     CommonForwardBroadcastCUDA<Functor, T, OutType>(
         x, y, z, x_dims_array.data(), y_dims_array.data(),
         out_dims_array.data(), max_dim,
@@ -1942,7 +1946,7 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
   }
 
   if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     ComputeElementwiseCUDA<Functor, T, OutType>(
         x, y, z, pre, n, post,
         ctx.template device_context<platform::CUDADeviceContext>(), func,
@@ -2066,7 +2070,7 @@ static void FusedElemwiseAndActBroadcast2CPU(const T *x, const T *y, int pre,
   }
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T, typename CompoundFunctor, bool BcastY,
           bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
 static __global__ void FusedElemwiseAndActBroadcast1CUDAKernel(
@@ -2107,7 +2111,7 @@ static __global__ void FusedElemwiseAndActBroadcast1CUDAKernel(
 
 template <typename T, typename CompoundFunctor, bool BcastY,
           bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActBroadcast1CUDA(cudaStream_t stream, const T *x,
+static void FusedElemwiseAndActBroadcast1CUDA(gpuStream_t stream, const T *x,
                                               const T *y,
                                               CompoundFunctor compound_functor,
                                               int h, int w, T *out,
@@ -2164,7 +2168,7 @@ static __global__ void FusedElemwiseAndActBroadcast2CUDAKernel(
 
 template <typename T, typename CompoundFunctor, bool BcastY,
           bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActBroadcast2CUDA(cudaStream_t stream, const T *x,
+static void FusedElemwiseAndActBroadcast2CUDA(gpuStream_t stream, const T *x,
                                               const T *y, int pre, int n,
                                               int post,
                                               CompoundFunctor compound_functor,
@@ -2219,7 +2223,7 @@ void FusedElemwiseAndActComputeWithBroadcast(
     int h = pre;
     int w = n;
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       FusedElemwiseAndActBroadcast1CUDA<T, CompoundFunctor, BcastY,
                                         KeepIntermediateOut,
                                         SameShapeOfIntermediateOutAndOut>(
@@ -2242,7 +2246,7 @@ void FusedElemwiseAndActComputeWithBroadcast(
     }
   } else {
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       FusedElemwiseAndActBroadcast2CUDA<T, CompoundFunctor, BcastY,
                                         KeepIntermediateOut,
                                         SameShapeOfIntermediateOutAndOut>(
@@ -2493,7 +2497,7 @@ static void FusedElemwiseAndActGradBroadcast2CPU(
   }
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
           bool UseIntermediateOut, bool BcastY,
           bool SameShapeOfIntermediateOutAndOut>
@@ -2593,7 +2597,7 @@ template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
           bool UseIntermediateOut, bool BcastY,
           bool SameShapeOfIntermediateOutAndOut>
 static void FusedElemwiseAndActGradBroadcast1CUDA(
-    cudaStream_t stream, const T *x, const T *y, const T *intermediate_out,
+    gpuStream_t stream, const T *x, const T *y, const T *intermediate_out,
     const T *out, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op,
     DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) {
   int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
@@ -2708,7 +2712,7 @@ template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
           bool UseIntermediateOut, bool BcastY,
           bool SameShapeOfIntermediateOutAndOut>
 static void FusedElemwiseAndActGradBroadcast2CUDA(
-    cudaStream_t stream, const T *x, const T *y, const T *intermediate_out,
+    gpuStream_t stream, const T *x, const T *y, const T *intermediate_out,
     const T *out, const T *dout, int pre, int n, int post, DX_OP dx_op,
     DY_OP dy_op, DIntermediate_OP dintermediate_op, T *dx, T *dy,
     T *dintermediate) {
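
The gradient wrappers above now accept gpuStream_t and are themselves compiled under the same two-compiler guard; both nvcc and hipcc accept the triple-chevron launch syntax, so a single definition can forward the stream to the kernel on either backend. A hedged sketch of a wrapper in that style (ScaleKernel/LaunchScale are illustrative names, not Paddle APIs, and the sketch reuses the gpuStream_t alias sketched earlier):

    #if defined(__NVCC__) || defined(__HIPCC__)
    template <typename T>
    __global__ void ScaleKernel(const T *in, T *out, T factor, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = in[i] * factor;
    }

    // One host wrapper for both backends: the stream parameter is the
    // neutral alias, and <<< >>> is understood by nvcc and hipcc alike.
    template <typename T>
    void LaunchScale(gpuStream_t stream, const T *in, T *out, T factor, int n) {
      const int threads = 256;
      const int blocks = (n + threads - 1) / threads;
      ScaleKernel<T><<<blocks, threads, 0, stream>>>(in, out, factor, n);
    }
    #endif  // __NVCC__ or __HIPCC__
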
@@ -2748,7 +2752,7 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
     int w = n;
 
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       FusedElemwiseAndActGradBroadcast1CUDA<T, DX_OP, DY_OP, DIntermediate_OP,
                                             UseIntermediateOut, BcastY,
                                             SameShapeOfIntermediateOutAndOut>(
@@ -2774,7 +2778,7 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
     }
   } else {
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       FusedElemwiseAndActGradBroadcast2CUDA<T, DX_OP, DY_OP, DIntermediate_OP,
                                             UseIntermediateOut, BcastY,
                                             SameShapeOfIntermediateOutAndOut>(