|
|
|
@ -18,31 +18,27 @@
|
|
|
|
|
#include "backend/kernel_compiler/gpu/cuda_impl/scatter_add_impl.cuh"
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
__global__ void ScatterAdd(const int inner_size, const int updates_size, const bool use_locking, const int *indices,
|
|
|
|
|
const T *updates, T *output) {
|
|
|
|
|
__global__ void ScatterAdd(const int inner_size, const int updates_size, const int *indices, const T *updates,
|
|
|
|
|
T *output) {
|
|
|
|
|
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) {
|
|
|
|
|
const size_t index = pos / inner_size;
|
|
|
|
|
const size_t offset = pos % inner_size;
|
|
|
|
|
const size_t current_pos = indices[index] * inner_size + offset;
|
|
|
|
|
if (use_locking) {
|
|
|
|
|
MsAtomicAdd(&output[current_pos], updates[pos]);
|
|
|
|
|
} else {
|
|
|
|
|
output[current_pos] += updates[pos];
|
|
|
|
|
}
|
|
|
|
|
MsAtomicAdd(&output[current_pos], updates[pos]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
void CalScatterAdd(const int &inner_size, const int &indices_size, const bool &use_locking, const int *indices,
|
|
|
|
|
const T *updates, T *output, cudaStream_t cuda_stream) {
|
|
|
|
|
void CalScatterAdd(const int &inner_size, const int &indices_size, const int *indices, const T *updates, T *output,
|
|
|
|
|
cudaStream_t cuda_stream) {
|
|
|
|
|
const int updates_size = inner_size * indices_size;
|
|
|
|
|
ScatterAdd<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(inner_size, updates_size, use_locking,
|
|
|
|
|
indices, updates, output);
|
|
|
|
|
ScatterAdd<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(inner_size, updates_size, indices, updates,
|
|
|
|
|
output);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template void CalScatterAdd<float>(const int &inner_size, const int &indices_size, const bool &use_locking,
|
|
|
|
|
const int *indices, const float *updates, float *output, cudaStream_t cuda_stream);
|
|
|
|
|
template void CalScatterAdd<half>(const int &inner_size, const int &indices_size, const bool &use_locking,
|
|
|
|
|
const int *indices, const half *updates, half *output, cudaStream_t cuda_stream);
|
|
|
|
|
template void CalScatterAdd<int>(const int &inner_size, const int &indices_size, const bool &use_locking,
|
|
|
|
|
const int *indices, const int *updates, int *output, cudaStream_t cuda_stream);
|
|
|
|
|
template void CalScatterAdd<float>(const int &inner_size, const int &indices_size, const int *indices,
|
|
|
|
|
const float *updates, float *output, cudaStream_t cuda_stream);
|
|
|
|
|
template void CalScatterAdd<half>(const int &inner_size, const int &indices_size, const int *indices,
|
|
|
|
|
const half *updates, half *output, cudaStream_t cuda_stream);
|
|
|
|
|
template void CalScatterAdd<int>(const int &inner_size, const int &indices_size, const int *indices, const int *updates,
|
|
|
|
|
int *output, cudaStream_t cuda_stream);
|
|
|
|
|