@@ -75,9 +75,9 @@ void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, con
 }
 
 template <typename T, typename S>
-__global__ void FusedMomentumWeightDecayScaleMomentum(const size_t element_num, T *weight_decay, T *scale, T *variable,
-                                                      T *accumulation, const T *learning_rate, const S *gradient,
-                                                      const T *momentum) {
+__global__ void FusedMomentumWeightDecayScaleKernel(const size_t element_num, T *weight_decay, T *scale, T *variable,
+                                                    T *accumulation, const T *learning_rate, const S *gradient,
+                                                    const T *momentum) {
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (element_num); i += blockDim.x * gridDim.x) {
     T grad = (variable[i] * weight_decay[0] + static_cast<T>(gradient[i])) * scale[0];
     accumulation[i] = momentum[0] * accumulation[i] + grad;
@@ -91,13 +91,13 @@ void FusedWeightDecayScaleMomentum(const size_t element_num, T *weight_decay, T
                                    cudaStream_t cuda_stream) {
   size_t thread_per_block = 256;
   size_t block_per_grid = (element_num + thread_per_block - 1) / thread_per_block;
-  FusedMomentumWeightDecayScaleMomentum<<<block_per_grid, thread_per_block, 0, cuda_stream>>>(
+  FusedMomentumWeightDecayScaleKernel<<<block_per_grid, thread_per_block, 0, cuda_stream>>>(
     element_num, weight_decay, scale, variable, accumulation, learning_rate, gradient, momentum);
 }
 
 template <typename T, typename S>
-__global__ void FusedMomentumScaleMomentum(const size_t element_num, T *scale, T *variable, T *accumulation,
-                                           const T *learning_rate, const S *gradient, const T *momentum) {
+__global__ void FusedMomentumScaleKernel(const size_t element_num, T *scale, T *variable, T *accumulation,
+                                         const T *learning_rate, const S *gradient, const T *momentum) {
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (element_num); i += blockDim.x * gridDim.x) {
     accumulation[i] = momentum[0] * accumulation[i] + static_cast<T>(gradient[i]) * scale[0];
     variable[i] -= learning_rate[0] * accumulation[i];
@@ -109,15 +109,33 @@ void FusedScaleMomentum(const size_t element_num, T *scale, T *variable, T *accu
                         const S *gradient, const T *momentum, cudaStream_t cuda_stream) {
   size_t thread_per_block = 256;
   size_t block_per_grid = (element_num + thread_per_block - 1) / thread_per_block;
-  FusedMomentumScaleMomentum<<<block_per_grid, thread_per_block, 0, cuda_stream>>>(
+  FusedMomentumScaleKernel<<<block_per_grid, thread_per_block, 0, cuda_stream>>>(
     element_num, scale, variable, accumulation, learning_rate, gradient, momentum);
 }
 
+template <typename T, typename S>
+__global__ void FusedWeightDecayMomentumKernel(const size_t element_num, T *weight_decay, T *variable, T *accumulation,
+                                               const T *learning_rate, const S *gradient, const T *momentum) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (element_num); i += blockDim.x * gridDim.x) {
+    T grad = variable[i] * weight_decay[0] + static_cast<T>(gradient[i]);
+    accumulation[i] = momentum[0] * accumulation[i] + grad;
+    variable[i] -= learning_rate[0] * accumulation[i];
+  }
+}
+
+template <typename T, typename S>
+void FusedWeightDecayMomentum(const size_t element_num, T *weight_decay, T *variable, T *accumulation,
+                              const T *learning_rate, const S *gradient, const T *momentum, cudaStream_t cuda_stream) {
+  size_t thread_per_block = 256;
+  size_t block_per_grid = (element_num + thread_per_block - 1) / thread_per_block;
+  FusedWeightDecayMomentumKernel<<<block_per_grid, thread_per_block, 0, cuda_stream>>>(
+    element_num, weight_decay, variable, accumulation, learning_rate, gradient, momentum);
+}
+
 // CombineFusedScaleMomentum
 template <typename T, typename S>
-__global__ void CombineFusedMomentumScaleMomentum(const size_t num, const size_t *element_num,
-                                                  T **scale, T **variable, T **accumulation,
-                                                  T **learning_rate, S **gradient, T **momentum) {
+__global__ void CombineFusedMomentumScaleKernel(const size_t num, const size_t *element_num, T **scale, T **variable,
+                                                T **accumulation, T **learning_rate, S **gradient, T **momentum) {
   for (size_t idx = 0; idx < num; idx++) {
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (element_num[idx]); i += blockDim.x * gridDim.x) {
       accumulation[idx][i] = momentum[idx][0] * accumulation[idx][i] + static_cast<T>(gradient[idx][i]) * scale[idx][0];
@@ -127,22 +145,21 @@ __global__ void CombineFusedMomentumScaleMomentum(const size_t num, const size_t
 }
 
 template <typename T, typename S>
-void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *elements, T **scale,
-                               T **variable, T **accumulation, T **learning_rate, S **gradient,
-                               T **momentum, cudaStream_t cuda_stream) {
+void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *elements, T **scale, T **variable,
+                               T **accumulation, T **learning_rate, S **gradient, T **momentum,
+                               cudaStream_t cuda_stream) {
   size_t thread_per_block = 256;
   size_t block_per_grid = (max + thread_per_block - 1) / thread_per_block;
-  CombineFusedMomentumScaleMomentum<<<block_per_grid, thread_per_block, 0, cuda_stream>>>(
+  CombineFusedMomentumScaleKernel<<<block_per_grid, thread_per_block, 0, cuda_stream>>>(
     num, elements, scale, variable, accumulation, learning_rate, gradient, momentum);
 }
 // end CombineFusedScaleMomentum
 
 // CombineFusedWeightDecayScaleMomentum
 template <typename T, typename S>
-__global__ void CombineFusedMomentumWeightDecayScaleMomentum(const size_t num, const size_t *element_num,
-                                                             T **weight_decay, T **scale, T **variable,
-                                                             T **accumulation, T **learning_rate, S **gradient,
-                                                             T **momentum) {
+__global__ void CombineFusedMomentumWeightDecayScaleKernel(const size_t num, const size_t *element_num,
+                                                           T **weight_decay, T **scale, T **variable, T **accumulation,
+                                                           T **learning_rate, S **gradient, T **momentum) {
   for (size_t idx = 0; idx < num; idx++) {
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (element_num[idx]); i += blockDim.x * gridDim.x) {
       T grad = (variable[idx][i] * weight_decay[idx][0] + static_cast<T>(gradient[idx][i])) * scale[idx][0];
@@ -155,11 +172,10 @@ __global__ void CombineFusedMomentumWeightDecayScaleMomentum(const size_t num, c
 template <typename T, typename S>
 void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, const size_t *element_num,
                                           T **weight_decay, T **scale, T **variable, T **accumulation,
-                                          T **learning_rate, S **gradient, T **momentum,
-                                          cudaStream_t cuda_stream) {
+                                          T **learning_rate, S **gradient, T **momentum, cudaStream_t cuda_stream) {
   size_t thread_per_block = 256;
   size_t block_per_grid = (max + thread_per_block - 1) / thread_per_block;
-  CombineFusedMomentumWeightDecayScaleMomentum<<<block_per_grid, thread_per_block, 0, cuda_stream>>>(
+  CombineFusedMomentumWeightDecayScaleKernel<<<block_per_grid, thread_per_block, 0, cuda_stream>>>(
     num, element_num, weight_decay, scale, variable, accumulation, learning_rate, gradient, momentum);
 }
 // end CombineFusedWeightDecayScaleMomentum
@@ -186,6 +202,12 @@ template void FusedWeightDecayScaleMomentum(const size_t element_num, float *wei
 template void FusedWeightDecayScaleMomentum(const size_t element_num, float *weight_decay, float *scale,
                                             float *variable, float *accumulation, const float *learning_rate,
                                             const half *gradient, const float *momentum, cudaStream_t cuda_stream);
+template void FusedWeightDecayMomentum(const size_t element_num, float *weight_decay, float *variable,
+                                       float *accumulation, const float *learning_rate, const float *gradient,
+                                       const float *momentum, cudaStream_t cuda_stream);
+template void FusedWeightDecayMomentum(const size_t element_num, float *weight_decay, float *variable,
+                                       float *accumulation, const float *learning_rate, const half *gradient,
+                                       const float *momentum, cudaStream_t cuda_stream);
 template void FusedScaleMomentum(const size_t element_num, float *scale, float *variable, float *accumulation,
                                  const float *learning_rate, const float *gradient, const float *momentum,
                                  cudaStream_t cuda_stream);
@@ -193,16 +215,16 @@ template void FusedScaleMomentum(const size_t element_num, float *scale, float *
                                  const float *learning_rate, const half *gradient, const float *momentum,
                                  cudaStream_t cuda_stream);
 template void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, const size_t *elements,
-                                                   float **weight_decay, float **scale, float **variable,
-                                                   float **accumulation, float **learning_rate, float **gradient,
-                                                   float **momentum, cudaStream_t cuda_stream);
+                                                   float **weight_decay, float **scale, float **variable,
+                                                   float **accumulation, float **learning_rate, float **gradient,
+                                                   float **momentum, cudaStream_t cuda_stream);
 template void CombineFusedWeightDecayScaleMomentum(const size_t max, const size_t num, const size_t *elements,
-                                                   float **weight_decay, float **scale, float **variable,
-                                                   float **accumulation, float **learning_rate, half **gradient,
-                                                   float **momentum, cudaStream_t cuda_stream);
+                                                   float **weight_decay, float **scale, float **variable,
+                                                   float **accumulation, float **learning_rate, half **gradient,
+                                                   float **momentum, cudaStream_t cuda_stream);
 template void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *elements, float **scale,
-                                        float **variable, float **accumulation, float **learning_rate,
-                                        float **gradient, float **momentum, cudaStream_t cuda_stream);
+                                        float **variable, float **accumulation, float **learning_rate, float **gradient,
+                                        float **momentum, cudaStream_t cuda_stream);
 template void CombineFusedScaleMomentum(const size_t max, const size_t num, const size_t *elements, float **scale,
-                                        float **variable, float **accumulation, float **learning_rate,
-                                        half **gradient, float **momentum, cudaStream_t cuda_stream);
+                                        float **variable, float **accumulation, float **learning_rate, half **gradient,
+                                        float **momentum, cudaStream_t cuda_stream);
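
For reference, below is a minimal host-side driver sketch for the FusedScaleMomentum launcher that this file defines and explicitly instantiates for <float, float>. It is an illustration only, not part of the change above: it assumes this .cu file is compiled and linked into the same program, re-declares the launcher locally to match the definition shown in the diff, and uses made-up buffer sizes and hyper-parameter values.

// Illustrative driver (not part of the patch above). Relies on the explicit
// <float, float> instantiation of FusedScaleMomentum provided by this file.
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// Assumed to match the launcher defined above.
template <typename T, typename S>
void FusedScaleMomentum(const size_t element_num, T *scale, T *variable, T *accumulation, const T *learning_rate,
                        const S *gradient, const T *momentum, cudaStream_t cuda_stream);

int main() {
  const size_t n = 1024;
  std::vector<float> variable(n, 1.0f), accumulation(n, 0.0f), gradient(n, 0.5f);
  const float scale = 1.0f, lr = 0.01f, momentum = 0.9f;

  // Scalars (scale, learning rate, momentum) live in device memory, matching the launcher's pointer parameters.
  float *d_var, *d_acc, *d_grad, *d_scale, *d_lr, *d_mom;
  cudaMalloc((void **)&d_var, n * sizeof(float));
  cudaMalloc((void **)&d_acc, n * sizeof(float));
  cudaMalloc((void **)&d_grad, n * sizeof(float));
  cudaMalloc((void **)&d_scale, sizeof(float));
  cudaMalloc((void **)&d_lr, sizeof(float));
  cudaMalloc((void **)&d_mom, sizeof(float));
  cudaMemcpy(d_var, variable.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_acc, accumulation.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_grad, gradient.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_scale, &scale, sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_lr, &lr, sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_mom, &momentum, sizeof(float), cudaMemcpyHostToDevice);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // One fused step: accumulation = momentum * accumulation + gradient * scale; variable -= lr * accumulation.
  FusedScaleMomentum(n, d_scale, d_var, d_acc, d_lr, d_grad, d_mom, stream);
  cudaStreamSynchronize(stream);

  cudaMemcpy(variable.data(), d_var, n * sizeof(float), cudaMemcpyDeviceToHost);
  printf("variable[0] after one step: %f\n", variable[0]);  // expected: 1.0 - 0.01 * 0.5 = 0.995

  cudaStreamDestroy(stream);
  cudaFree(d_var);
  cudaFree(d_acc);
  cudaFree(d_grad);
  cudaFree(d_scale);
  cudaFree(d_lr);
  cudaFree(d_mom);
  return 0;
}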