@@ -101,6 +101,122 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
  output_value[frame_idx] = r_output;
}

/*
 * threads(tile_size, 1)
 * grid(frame_blocks, 1)
 */
template <class T, int Tiled_size>
__global__ void KeFastCollectiveGruGate(T *gate_value, T *prev_output_value,
                                        T *gate_weight, T *reset_output,
                                        int frame_size,
                                        ActivationType active_node) {
  T xt_0 = 0.0f;
  T a0 = 0.0f;
  T c0 = 0.0f;
  T b0[Tiled_size];

  int COL = blockIdx.x * blockDim.x + threadIdx.x;
  int Tiled_mask = ((1 << Tiled_size) - 1);
  // Tiled matrix multiply using register shift, faster than shared memory.
  if (prev_output_value) {
    for (int k = 0; k < (((frame_size - 1) / Tiled_size) + 1); ++k) {
      a0 = 0;
      if ((threadIdx.x + k * Tiled_size) < frame_size) {
        a0 = prev_output_value[threadIdx.x + (k * Tiled_size)];
      }
      for (int i = 0; i < Tiled_size; i++) {
        if (COL < frame_size * 2 && (i + k * Tiled_size) < frame_size) {
          b0[i] = gate_weight[(i + k * Tiled_size) * frame_size * 2 + COL];
        }
      }

      for (int i = 0; i < Tiled_size; ++i) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
        c0 = c0 + __shfl_sync(Tiled_mask, a0, i, Tiled_size) * b0[i];
#else
        c0 = c0 + __shfl(a0, i, Tiled_size) * b0[i];
#endif
      }
    }
  }

  __syncthreads();

  if (COL < frame_size * 2) {
    // Add the precomputed input projection, apply the gate activation and
    // write the gate back in place.
    xt_0 = gate_value[COL];
    c0 += xt_0;
    c0 = forward::activation(c0, active_node);
    gate_value[COL] = c0;
    if (frame_size <= COL && COL < frame_size * 2) {
      // Reset-gate half: also emit r_t * h_{t-1} for the candidate state.
      T htp_0 = 0.0;
      if (prev_output_value) {
        htp_0 = prev_output_value[COL - frame_size];
      }
      reset_output[COL - frame_size] = c0 * htp_0;
    } else if (COL < frame_size) {
      gate_value[COL] = c0;
    }
  }
}
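
// Illustrative only, not part of this diff: a minimal host-side launch sketch
// for KeFastCollectiveGruGate. It assumes a tile size of 16 (kept below 32 so
// that (1 << Tiled_size) - 1 stays well defined) and hypothetical argument
// names; the library's real caller lives elsewhere. One thread handles one of
// the 2 * frame_size update/reset-gate columns, matching the
// threads(tile_size, 1) / grid(frame_blocks, 1) configuration noted above.
template <class T>
void LaunchFastGruGateSketch(T *gate_value, T *prev_output_value,
                             T *gate_weight, T *reset_output, int frame_size,
                             ActivationType active_node, cudaStream_t stream) {
  constexpr int kTiledSize = 16;  // assumed width of the register tiles
  int frame_blocks = (frame_size * 2 + kTiledSize - 1) / kTiledSize;
  dim3 threads(kTiledSize, 1);
  dim3 grid(frame_blocks, 1);
  KeFastCollectiveGruGate<T, kTiledSize><<<grid, threads, 0, stream>>>(
      gate_value, prev_output_value, gate_weight, reset_output, frame_size,
      active_node);
}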

/*
 * threads(tile_size, 1)
 * grid(frame_blocks, 1)
 */
template <class T, int Tiled_size>
__global__ void KeFastCollectiveGruOut(T *gate_weight, T *prev_out_value,
                                       T *output_value, T *gate_value,
                                       T *reset_value, int frame_size,
                                       ActivationType act_node,
                                       bool origin_mode) {
  int COL = blockIdx.x * blockDim.x + threadIdx.x;

  T a0 = 0.0f;
  T b0[Tiled_size];
  T c0 = 0.0f;

  int Tiled_mask = ((1 << Tiled_size) - 1);
  // Tiled matrix multiply with register shift.
  if (prev_out_value) {
    for (int k = 0; k < (((frame_size - 1) / Tiled_size) + 1); ++k) {
      a0 = 0;
      if ((threadIdx.x + k * Tiled_size) < frame_size) {
        a0 = reset_value[threadIdx.x + (k * Tiled_size)];
      }
      for (int i = 0; i < Tiled_size; i++) {
        if (COL < frame_size && (i + k * Tiled_size) < frame_size) {
          b0[i] = gate_weight[(i + k * Tiled_size) * frame_size + COL];
        }
      }

      for (int i = 0; i < Tiled_size; ++i) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
        c0 = c0 + __shfl_sync(Tiled_mask, a0, i, Tiled_size) * b0[i];
#else
        c0 = c0 + __shfl(a0, i, Tiled_size) * b0[i];
#endif
      }
    }
  }

  __syncthreads();

  if (COL < frame_size) {
    T xt_0 = gate_value[COL + 2 * frame_size];
    T gta_0 = gate_value[COL];
    T htp_0 = 0;
    if (prev_out_value) htp_0 = prev_out_value[COL];
    c0 += xt_0;
    c0 = forward::activation(c0, act_node);
    gate_value[COL + 2 * frame_size] = c0;
    if (origin_mode) {
      // h_t = u_t * h_{t-1} + (1 - u_t) * c_t
      output_value[COL] = htp_0 * gta_0 + (1 - gta_0) * c0;
    } else {
      // h_t = u_t * c_t + (1 - u_t) * h_{t-1}
      output_value[COL] = c0 * gta_0 + (1 - gta_0) * htp_0;
    }
  }
}
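
// Illustrative only, not part of this diff: the matching launch sketch for
// KeFastCollectiveGruOut, under the same assumptions as the gate sketch above
// (tile size of 16, hypothetical argument names). Here the grid only needs to
// cover frame_size columns, and reset_value is expected to hold the
// r_t * h_{t-1} product written by KeFastCollectiveGruGate.
template <class T>
void LaunchFastGruOutSketch(T *gate_weight, T *prev_out_value, T *output_value,
                            T *gate_value, T *reset_value, int frame_size,
                            ActivationType act_node, bool origin_mode,
                            cudaStream_t stream) {
  constexpr int kTiledSize = 16;  // assumed width of the register tiles
  int frame_blocks = (frame_size + kTiledSize - 1) / kTiledSize;
  dim3 threads(kTiledSize, 1);
  dim3 grid(frame_blocks, 1);
  KeFastCollectiveGruOut<T, kTiledSize><<<grid, threads, 0, stream>>>(
      gate_weight, prev_out_value, output_value, gate_value, reset_value,
      frame_size, act_node, origin_mode);
}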

/*
 * threads(frame_per_block, batch_per_block)
 * grid(frame_blocks, batch_blocks)