|
|
|
@ -19,20 +19,58 @@
|
|
|
|
|
|
|
|
|
|
// Accumulates the element-wise squares of two input buffers into two float
// workspace scalars. A single flattened grid-stride index space covers both
// inputs: indices [0, size/2) map to input 0, [size/2, size) to input 1.
// Accumulation happens in float (not T) so that half-precision inputs do not
// lose precision while summing; the float input case is handled by the
// explicit specialization below. NOTE(review): the __half2float call means
// this generic template is only instantiated with T == half — confirm no
// other instantiations exist.
template <typename T>
__global__ void SquareSumAllKernel(const size_t size, const T* input_addr_0, const T* input_addr_1,
                                   float* ws_addr_0, float* ws_addr_1) {
  // Hoist loop invariants out of the grid-stride loop.
  const size_t split = size / 2;  // boundary between the two inputs
  const float power = 2.0f;      // float literal: avoid silent double promotion
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    if (i < split) {
      // Square in float precision, then atomically fold into the workspace.
      float ret = powf(__half2float(input_addr_0[i]), power);
      MsAtomicAdd(ws_addr_0, ret);
    } else {
      float ret = powf(__half2float(input_addr_1[i - split]), power);
      MsAtomicAdd(ws_addr_1, ret);
    }
  }
}
|
|
|
|
|
|
|
|
|
|
// Float specialization: inputs are already float, so no half conversion is
// needed; squares are accumulated directly into the float workspaces.
// Same flattened index layout as the generic template: [0, size/2) -> input 0,
// [size/2, size) -> input 1.
template <>
__global__ void SquareSumAllKernel(const size_t size, const float* input_addr_0, const float* input_addr_1,
                                   float* ws_addr_0, float* ws_addr_1) {
  // Hoist loop invariants out of the grid-stride loop.
  const size_t split = size / 2;  // boundary between the two inputs
  const float power = 2.0f;      // float literal: avoid silent double promotion
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    if (i < split) {
      float ret = powf(input_addr_0[i], power);
      MsAtomicAdd(ws_addr_0, ret);
    } else {
      float ret = powf(input_addr_1[i - split], power);
      MsAtomicAdd(ws_addr_1, ret);
    }
  }
}
|
|
|
|
|
|
|
|
|
|
// Copies the two float workspace scalars into the typed outputs, converting
// float -> half for the generic case (the float case is handled by the
// specialization below). The host launcher invokes this with size == 1, so
// the grid-stride guard admits only the single thread with global index 0.
template <typename T>
__global__ void AssignKernel(const size_t size, T* output_addr_0, T* output_addr_1,
                             float* ws_addr_0, float* ws_addr_1) {
  const size_t stride = gridDim.x * blockDim.x;
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += stride) {
    output_addr_0[0] = __float2half(ws_addr_0[0]);
    output_addr_1[0] = __float2half(ws_addr_1[0]);
  }
}
|
|
|
|
|
|
|
|
|
|
// Float specialization of AssignKernel: the workspace already holds float,
// so the values are stored into the outputs without conversion. Invoked with
// size == 1, so only the thread with global index 0 performs the writes.
template <>
__global__ void AssignKernel(const size_t size, float* output_addr_0, float* output_addr_1,
                             float* ws_addr_0, float* ws_addr_1) {
  const size_t stride = gridDim.x * blockDim.x;
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += stride) {
    output_addr_0[0] = ws_addr_0[0];
    output_addr_1[0] = ws_addr_1[0];
  }
}
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
__global__ void InitOutput(const size_t size, T *output) {
|
|
|
|
|
T zero = 0;
|
|
|
|
@ -44,15 +82,19 @@ __global__ void InitOutput(const size_t size, T *output) {
|
|
|
|
|
|
|
|
|
|
// Host launcher: computes sum(input_0[i]^2) into output_addr_0 and
// sum(input_1[i]^2) into output_addr_1, where each input holds input_size_
// elements. All accumulation happens in the float workspaces ws_addr_0 /
// ws_addr_1 (zero-initialized first), and the results are then assigned to
// the typed outputs — this keeps half-precision outputs from accumulating
// rounding error. All launches are enqueued on cuda_stream; the caller is
// responsible for synchronization and for checking launch errors.
template <typename T>
void SquareSumAll(const size_t input_size_, const T* input_addr_0, const T* input_addr_1,
                  T* output_addr_0, T* output_addr_1,
                  float* ws_addr_0, float* ws_addr_1, cudaStream_t cuda_stream) {
  // Zero the float accumulators before the atomic adds.
  InitOutput<<<GET_BLOCKS(1), GET_THREADS, 0, cuda_stream>>>(1, ws_addr_0);
  InitOutput<<<GET_BLOCKS(1), GET_THREADS, 0, cuda_stream>>>(1, ws_addr_1);
  // One flattened index space over both inputs:
  // [0, input_size_) -> input 0, [input_size_, 2*input_size_) -> input 1.
  size_t size = input_size_ * 2;
  SquareSumAllKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr_0, input_addr_1,
                                                                        ws_addr_0, ws_addr_1);
  // Copy the float accumulators into the typed outputs (half conversion for T == half).
  AssignKernel<<<GET_BLOCKS(1), GET_THREADS, 0, cuda_stream>>>(1, output_addr_0, output_addr_1, ws_addr_0, ws_addr_1);
}
|
|
|
|
|
|
|
|
|
|
// Explicit instantiation for half inputs/outputs (float workspaces).
template void SquareSumAll(const size_t input_size_, const half* input_addr_0, const half* input_addr_1,
                           half* output_addr_0, half* output_addr_1, float* ws_addr_0, float* ws_addr_1,
                           cudaStream_t cuda_stream);
|
|
|
|
|
// Explicit instantiation for float inputs/outputs (float workspaces).
template void SquareSumAll(const size_t input_size_, const float* input_addr_0, const float* input_addr_1,
                           float* output_addr_0, float* output_addr_1, float* ws_addr_0, float* ws_addr_1,
                           cudaStream_t cuda_stream);
|
|
|
|
|