|
|
|
@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
|
limitations under the License. */
|
|
|
|
|
|
|
|
|
|
#include "hl_base.h"
|
|
|
|
|
#include "ContextProjectionOp.h"
|
|
|
|
|
#include "hl_base.h"
|
|
|
|
|
|
|
|
|
|
namespace paddle {
|
|
|
|
|
|
|
|
|
@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input,
|
|
|
|
|
int block_size = blockDim.x;
|
|
|
|
|
int sequenceId = blockIdx.x;
|
|
|
|
|
int seq_start = sequence[sequenceId];
|
|
|
|
|
int seq_end = sequence[sequenceId+1];
|
|
|
|
|
int seq_end = sequence[sequenceId + 1];
|
|
|
|
|
real value = 0;
|
|
|
|
|
|
|
|
|
|
int instances = seq_end - seq_start + context_length - 1;
|
|
|
|
@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input,
|
|
|
|
|
} else if ((i + context_start) >= (seq_end - seq_start)) {
|
|
|
|
|
if (padding) {
|
|
|
|
|
value =
|
|
|
|
|
weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
|
|
|
|
|
input_dim + idx];
|
|
|
|
|
weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
|
|
|
|
|
input_dim +
|
|
|
|
|
idx];
|
|
|
|
|
} else {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input,
|
|
|
|
|
int outx = (i - context_length) < 0 ? i : (context_length - 1);
|
|
|
|
|
int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
|
|
|
|
|
real* output_r =
|
|
|
|
|
output + outy * input_dim * context_length + outx * input_dim;
|
|
|
|
|
output + outy * input_dim * context_length + outx * input_dim;
|
|
|
|
|
for (int j = outy; j < seq_end - seq_start; j++) {
|
|
|
|
|
output_r[idx] += value;
|
|
|
|
|
if (j - outy == outx) break;
|
|
|
|
@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input,
|
|
|
|
|
dim3 grid(blocks_x, blocks_y);
|
|
|
|
|
|
|
|
|
|
if (weight) {
|
|
|
|
|
KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
|
|
|
|
|
(input, sequence, weight, output, input_dim,
|
|
|
|
|
context_length, context_start, begin_pad);
|
|
|
|
|
} else {
|
|
|
|
|
KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
|
|
|
|
|
(input, sequence, weight, output, input_dim,
|
|
|
|
|
context_length, context_start, begin_pad);
|
|
|
|
|
KeContextProjectionForward<true><<<grid, threads, 0, STREAM_DEFAULT>>>(
|
|
|
|
|
input,
|
|
|
|
|
sequence,
|
|
|
|
|
weight,
|
|
|
|
|
output,
|
|
|
|
|
input_dim,
|
|
|
|
|
context_length,
|
|
|
|
|
context_start,
|
|
|
|
|
begin_pad);
|
|
|
|
|
} else {
|
|
|
|
|
KeContextProjectionForward<false><<<grid, threads, 0, STREAM_DEFAULT>>>(
|
|
|
|
|
input,
|
|
|
|
|
sequence,
|
|
|
|
|
weight,
|
|
|
|
|
output,
|
|
|
|
|
input_dim,
|
|
|
|
|
context_length,
|
|
|
|
|
context_start,
|
|
|
|
|
begin_pad);
|
|
|
|
|
}
|
|
|
|
|
CHECK_SYNC("hl_context_projection_forward failed");
|
|
|
|
|
}
|
|
|
|
@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
|
|
|
|
|
int block_size = blockDim.x;
|
|
|
|
|
int sequenceId = blockIdx.x;
|
|
|
|
|
int seq_start = sequence[sequenceId];
|
|
|
|
|
int seq_end = sequence[sequenceId+1];
|
|
|
|
|
int seq_end = sequence[sequenceId + 1];
|
|
|
|
|
real value = 0;
|
|
|
|
|
|
|
|
|
|
int instances = seq_end - seq_start + context_length - 1;
|
|
|
|
@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
|
|
|
|
|
int outx = (i - context_length) < 0 ? i : (context_length - 1);
|
|
|
|
|
int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
|
|
|
|
|
real* output_r =
|
|
|
|
|
out + outy * input_dim * context_length + outx * input_dim;
|
|
|
|
|
out + outy * input_dim * context_length + outx * input_dim;
|
|
|
|
|
for (int j = outy; j < seq_end - seq_start; j++) {
|
|
|
|
|
value += output_r[idx];
|
|
|
|
|
if (j - outy == outx) break;
|
|
|
|
@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad,
|
|
|
|
|
int blocks_y = 1;
|
|
|
|
|
dim3 threads(block_size, 1);
|
|
|
|
|
dim3 grid(blocks_x, blocks_y);
|
|
|
|
|
KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
|
|
|
|
|
(out_grad, sequence, input_grad, input_dim, context_length, context_start);
|
|
|
|
|
KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
|
|
|
|
|
out_grad, sequence, input_grad, input_dim, context_length, context_start);
|
|
|
|
|
CHECK_SYNC("hl_context_projection_backward_data failed");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -231,7 +244,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
|
|
|
|
|
context_start);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<int THREADS_X, int THREADS_Y>
|
|
|
|
|
template <int THREADS_X, int THREADS_Y>
|
|
|
|
|
__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
|
|
|
|
|
const int* sequence,
|
|
|
|
|
real* w_grad,
|
|
|
|
@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
|
|
|
|
|
if (weight_idx < w_dim) {
|
|
|
|
|
for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
|
|
|
|
|
int seq_start = sequence[seqId];
|
|
|
|
|
int seq_end = sequence[seqId+1];
|
|
|
|
|
output_r = const_cast<real*>(out_grad)
|
|
|
|
|
+ seq_start * w_dim * context_length;
|
|
|
|
|
int seq_end = sequence[seqId + 1];
|
|
|
|
|
output_r =
|
|
|
|
|
const_cast<real*>(out_grad) + seq_start * w_dim * context_length;
|
|
|
|
|
|
|
|
|
|
if (context_start < 0) {
|
|
|
|
|
if (padId + context_start < 0) {
|
|
|
|
|
instanceId = padId;
|
|
|
|
|
} else {
|
|
|
|
|
// begin_pad > 0;
|
|
|
|
|
instanceId = (padId - begin_pad) +
|
|
|
|
|
(seq_end - seq_start) - context_start;
|
|
|
|
|
instanceId =
|
|
|
|
|
(padId - begin_pad) + (seq_end - seq_start) - context_start;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
if (padId + (seq_end - seq_start) < context_start) {
|
|
|
|
@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int outx = (instanceId - context_length) < 0 ?
|
|
|
|
|
instanceId : (context_length - 1);
|
|
|
|
|
int outy = (instanceId - context_length) < 0 ?
|
|
|
|
|
0 : (instanceId - (context_length - 1));
|
|
|
|
|
int outx =
|
|
|
|
|
(instanceId - context_length) < 0 ? instanceId : (context_length - 1);
|
|
|
|
|
int outy = (instanceId - context_length) < 0
|
|
|
|
|
? 0
|
|
|
|
|
: (instanceId - (context_length - 1));
|
|
|
|
|
output_r += outy * w_dim * context_length + outx * w_dim;
|
|
|
|
|
for (int j = outy; j < seq_end - seq_start; j++) {
|
|
|
|
|
value += output_r[weight_idx];
|
|
|
|
@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
|
|
|
|
|
}
|
|
|
|
|
__syncthreads();
|
|
|
|
|
|
|
|
|
|
for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
|
|
|
|
|
for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
|
|
|
|
|
if (idy < stride) {
|
|
|
|
|
sum_s[idy][idx] += sum_s[idy + stride][idx];
|
|
|
|
|
}
|
|
|
|
@ -339,22 +353,27 @@ void hl_context_projection_backward_weight(const real* out_grad,
|
|
|
|
|
dim3 threads(threads_x, threads_y);
|
|
|
|
|
dim3 grid(blocks_x, 1);
|
|
|
|
|
|
|
|
|
|
KeContextProjectionBackwardWeight<32, 32>
|
|
|
|
|
<<< grid, threads, 0, STREAM_DEFAULT >>>
|
|
|
|
|
(out_grad, sequence, w_grad, num_sequences, w_dim,
|
|
|
|
|
context_length, context_start, begin_pad);
|
|
|
|
|
KeContextProjectionBackwardWeight<32,
|
|
|
|
|
32><<<grid, threads, 0, STREAM_DEFAULT>>>(
|
|
|
|
|
out_grad,
|
|
|
|
|
sequence,
|
|
|
|
|
w_grad,
|
|
|
|
|
num_sequences,
|
|
|
|
|
w_dim,
|
|
|
|
|
context_length,
|
|
|
|
|
context_start,
|
|
|
|
|
begin_pad);
|
|
|
|
|
CHECK_SYNC("hl_context_projection_backward_weight failed");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <>
|
|
|
|
|
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
|
|
|
|
|
const GpuMatrix& out_grad,
|
|
|
|
|
GpuMatrix& w_grad,
|
|
|
|
|
const GpuIVector& seq_vec,
|
|
|
|
|
size_t context_length,
|
|
|
|
|
int context_start,
|
|
|
|
|
size_t total_pad,
|
|
|
|
|
size_t begin_pad) {
|
|
|
|
|
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
|
|
|
|
|
GpuMatrix& w_grad,
|
|
|
|
|
const GpuIVector& seq_vec,
|
|
|
|
|
size_t context_length,
|
|
|
|
|
int context_start,
|
|
|
|
|
size_t total_pad,
|
|
|
|
|
size_t begin_pad) {
|
|
|
|
|
hl_context_projection_backward_weight(out_grad.getData(),
|
|
|
|
|
seq_vec.getData(),
|
|
|
|
|
w_grad.getData(),
|
|
|
|
@ -376,23 +395,18 @@ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
|
|
|
|
|
size_t begin_pad,
|
|
|
|
|
bool is_padding,
|
|
|
|
|
size_t total_pad) {
|
|
|
|
|
if (in_grad) {
|
|
|
|
|
ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
|
|
|
|
|
out_grad,
|
|
|
|
|
in_grad,
|
|
|
|
|
sequence,
|
|
|
|
|
context_length,
|
|
|
|
|
context_start);
|
|
|
|
|
}
|
|
|
|
|
if (is_padding && w_grad) {
|
|
|
|
|
ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
|
|
|
|
|
out_grad,
|
|
|
|
|
w_grad,
|
|
|
|
|
sequence,
|
|
|
|
|
context_length,
|
|
|
|
|
context_start,
|
|
|
|
|
total_pad,
|
|
|
|
|
begin_pad);
|
|
|
|
|
if (in_grad) {
|
|
|
|
|
ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
|
|
|
|
|
out_grad, in_grad, sequence, context_length, context_start);
|
|
|
|
|
}
|
|
|
|
|
if (is_padding && w_grad) {
|
|
|
|
|
ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
|
|
|
|
|
w_grad,
|
|
|
|
|
sequence,
|
|
|
|
|
context_length,
|
|
|
|
|
context_start,
|
|
|
|
|
total_pad,
|
|
|
|
|
begin_pad);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|