|
|
|
@ -19,19 +19,26 @@ template <typename T>
|
|
|
|
|
// Fills output[0, count) with samples drawn by curand_normal, converted to T.
//
//   seed        - seed passed to curand_init for every element.
//   globalState - per-element generator states; entry i is (re)initialised
//                 here, so it must hold at least `count` entries.
//   output      - destination buffer with at least `count` elements.
//   count       - number of samples to produce.
//
// Any 1-D launch shape is accepted: the grid-stride loop covers every index
// below `count` regardless of how many threads are launched.
// For integral T the cast truncates the float sample toward zero.
__global__ void NormalKernel(int seed, curandState *globalState, T *output, size_t count) {
  // blockIdx/blockDim/gridDim components are 32-bit; widen before multiplying
  // so the start index and the stride cannot wrap on very large launches.
  const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t step = static_cast<size_t>(blockDim.x) * gridDim.x;
  for (size_t i = first; i < count; i += step) {
    // Element index doubles as the cuRAND sequence number, offset 0.
    curand_init(seed, i, 0, &globalState[i]);
    // Single draw, single store per element.
    output[i] = (T)curand_normal(&globalState[i]);
  }
}
|
|
|
|
|
|
|
|
|
|
// Fills output[0, count) with uniform samples scaled into the interval
// described by input1[0] (lower bound) and input2[0] (upper bound).
//
//   seed          - seed passed to curand_init for every element.
//   globalState   - per-element generator states; entry i is (re)initialised
//                   here, so it must hold at least `count` entries.
//   input1        - buffer whose element 0 is the lower bound.
//   input_size_1  - not read by this kernel.
//   input2        - buffer whose element 0 is the upper bound.
//   input_size_2  - not read by this kernel.
//   output        - destination buffer with at least `count` elements.
//   count         - number of samples to produce.
//
// Any 1-D launch shape is accepted thanks to the grid-stride loop.
// curand_uniform yields values in (0, 1], so the upper bound itself can be
// produced when the draw is exactly 1.0.
template <typename T>
__global__ void UniformIntKernel(int seed, curandState *globalState, T *input1, size_t input_size_1,
                                 T *input2, size_t input_size_2, T *output, size_t count) {
  // blockIdx/blockDim/gridDim components are 32-bit; widen before multiplying
  // so the start index and the stride cannot wrap on very large launches.
  const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t step = static_cast<size_t>(blockDim.x) * gridDim.x;
  for (size_t i = first; i < count; i += step) {
    // Element index doubles as the cuRAND sequence number, offset 0.
    curand_init(seed, i, 0, &globalState[i]);
    // Scale the float draw by the range BEFORE converting to T. Converting
    // the raw draw first would truncate it to 0 for integral T and collapse
    // nearly every output to the lower bound.
    output[i] = (T)(curand_uniform(&globalState[i]) * (input2[0] - input1[0])) + input1[0];
  }
}
|
|
|
|
|
|
|
|
|
|
// Fills output[0, count) with samples drawn by curand_uniform, converted to T.
//
//   seed        - seed passed to curand_init for every element.
//   globalState - per-element generator states; entry i is (re)initialised
//                 here, so it must hold at least `count` entries.
//   output      - destination buffer with at least `count` elements.
//   count       - number of samples to produce.
//
// Any 1-D launch shape is accepted thanks to the grid-stride loop.
// For integral T the cast truncates the float sample toward zero.
template <typename T>
__global__ void UniformRealKernel(int seed, curandState *globalState, T *output, size_t count) {
  // blockIdx/blockDim/gridDim components are 32-bit; widen before multiplying
  // so the start index and the stride cannot wrap on very large launches.
  const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t step = static_cast<size_t>(blockDim.x) * gridDim.x;
  for (size_t i = first; i < count; i += step) {
    // Element index doubles as the cuRAND sequence number, offset 0.
    curand_init(seed, i, 0, &globalState[i]);
    output[i] = (T)curand_uniform(&globalState[i]);
  }
}
|
|
|
|
@ -51,16 +58,46 @@ void StandardNormal(int seed, int seed2, curandState *globalState, T *output, si
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Host launcher for UniformIntKernel.
//
// Seed selection: a non-zero `seed2` is used first, otherwise a non-zero
// `seed`; when both are zero the current wall-clock time is used, so results
// differ from run to run.
//
//   globalState                - device buffer of generator states, forwarded
//                                to the kernel unchanged.
//   input1 / input_size_1      - lower-bound buffer and its size, forwarded.
//   input2 / input_size_2      - upper-bound buffer and its size, forwarded.
//   output                     - device destination buffer, forwarded.
//   count                      - number of samples to generate.
//   cuda_stream                - stream the kernel is enqueued on.
template <typename T>
void UniformInt(int seed, int seed2, curandState *globalState, T *input1, size_t input_size_1,
                T *input2, size_t input_size_2, T *output, size_t count, cudaStream_t cuda_stream) {
  int RNG_seed = 0;
  if (seed2 != 0) {
    RNG_seed = seed2;
  } else if (seed != 0) {
    RNG_seed = seed;
  } else {
    // time_t is narrowed to int on purpose; only the low bits matter as a seed.
    RNG_seed = static_cast<int>(time(NULL));
  }
  UniformIntKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>
                  (RNG_seed, globalState, input1, input_size_1, input2, input_size_2, output, count);
  return;
}
|
|
|
|
|
|
|
|
|
|
// Host launcher for UniformRealKernel.
//
// The effective seed is `seed2` when it is non-zero, else `seed` when that is
// non-zero, else the current wall-clock time.
//
//   globalState - device buffer of generator states, forwarded to the kernel.
//   output      - device destination buffer, forwarded to the kernel.
//   count       - number of samples to generate.
//   cuda_stream - stream the kernel is enqueued on.
template <typename T>
void UniformReal(int seed, int seed2, curandState *globalState, T *output, size_t count, cudaStream_t cuda_stream) {
  int effective_seed = seed2;
  if (effective_seed == 0) {
    // The clock is consulted only when neither seed was supplied.
    effective_seed = (seed != 0) ? seed : static_cast<int>(time(NULL));
  }
  UniformRealKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(effective_seed, globalState, output, count);
}
|
|
|
|
|
|
|
|
|
|
// Explicit instantiations of the host launchers for the element types
// emitted by this translation unit.
template void StandardNormal<float>(int seed, int seed2, curandState *globalState,
                                    float *output, size_t count, cudaStream_t cuda_stream);
template void StandardNormal<int>(int seed, int seed2, curandState *globalState,
                                  int *output, size_t count, cudaStream_t cuda_stream);
template void UniformInt<float>(int seed, int seed2, curandState *globalState, float *input1, size_t input_size_1,
                                float *input2, size_t input_size_2, float *output, size_t count,
                                cudaStream_t cuda_stream);
template void UniformInt<int>(int seed, int seed2, curandState *globalState, int *input1, size_t input_size_1,
                              int *input2, size_t input_size_2, int *output, size_t count,
                              cudaStream_t cuda_stream);
template void UniformReal<float>(int seed, int seed2, curandState *globalState,
                                 float *output, size_t count, cudaStream_t cuda_stream);
template void UniformReal<int>(int seed, int seed2, curandState *globalState,
                               int *output, size_t count, cudaStream_t cuda_stream);
|
|
|
|
|