!4958 Fix GPU-ArgMaxWithValue

Merge pull request !4958 from 34bunny/GPU-argmaxwithvalue-fix
pull/4958/MERGE
mindspore-ci-bot 5 years ago committed by Gitee
commit 660aa8e60d

@ -18,39 +18,34 @@
#include "runtime/device/gpu/cuda_common.h"
#include "include/cuda_fp16.h"
template <typename T, typename S>
__global__ void ArgmaxWithValue(const T* input, const int bound, int outerSize, int innerSize, S* index,
T* output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (outerSize); pos += blockDim.x * gridDim.x) {
int inputOutterOffset = pos * innerSize * bound;
int outputOutterOffset = pos * innerSize;
for (int j = 0; j < innerSize; j++) {
auto outputInnerOffset = outputOutterOffset + j;
S idx = 0;
T maxData = input[j + inputOutterOffset];
for (S c = 0; c < bound; c++) {
int offset = j + c * innerSize;
auto inputData = input[inputOutterOffset + offset];
idx = inputData > maxData ? c : idx;
maxData = inputData > maxData ? inputData : maxData;
}
output[outputInnerOffset] = maxData;
index[outputInnerOffset] = idx;
}
__global__ void ArgmaxWithValue(const T *input, const int bound, int outerSize, int innerSize, S *index, T *output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < outerSize * innerSize; pos += gridDim.x * blockDim.x) {
int x = pos / innerSize % outerSize;
int y = pos % innerSize;
S idx = 0;
int InputOffset = x * bound * innerSize + 0 * innerSize + y;
T maxData = input[InputOffset];
for (int i = 0; i < bound; i++) {
InputOffset = x * bound * innerSize + i * innerSize + y;
auto inputData = input[InputOffset];
idx = inputData > maxData ? i : idx;
maxData = inputData > maxData ? inputData : maxData;
}
output[pos] = maxData;
index[pos] = idx;
}
return;
}
template <typename T, typename S>
void CalArgmaxWithValue(const T* input, const int bound_, const int outerSize_, const int innerSize_,
S* index, T* output, cudaStream_t cuda_stream) {
ArgmaxWithValue<<<GET_BLOCKS(outerSize_), GET_THREADS, 0, cuda_stream>>>(input, bound_, outerSize_, innerSize_,
index, output);
void CalArgmaxWithValue(const T *input, const int bound_, const int outerSize_, const int innerSize_, S *index,
T *output, cudaStream_t cuda_stream) {
ArgmaxWithValue<<<GET_BLOCKS(outerSize_), GET_THREADS, 0, cuda_stream>>>(input, bound_, outerSize_, innerSize_, index,
output);
return;
}
template void CalArgmaxWithValue<float, int>(const float* input, const int bound_, const int outerSize_,
const int innerSize_, int* index, float* output,
cudaStream_t cuda_stream);
template void CalArgmaxWithValue<half, int>(const half* input, const int bound_, const int outerSize_,
const int innerSize_, int* index, half* output,
cudaStream_t cuda_stream);
template void CalArgmaxWithValue<float, int>(const float *input, const int bound_, const int outerSize_,
const int innerSize_, int *index, float *output, cudaStream_t cuda_stream);
template void CalArgmaxWithValue<half, int>(const half *input, const int bound_, const int outerSize_,
const int innerSize_, int *index, half *output, cudaStream_t cuda_stream);

Loading…
Cancel
Save