@@ -71,7 +71,8 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
 
     auto seq_dims = seq.dims();
-    PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(),
+    PADDLE_ENFORCE_EQ(seq_dims[0],
+                      static_cast<int64_t>(abs_offset_lod[level].back()),
                       "The first dimension of LoDTensor seq should be "
                       "equal to the sum of all sequences's length.");
 
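Note on the change above: seq_dims[0] comes from a framework::DDim, whose elements are int64_t, while abs_offset_lod[level].back() is a size_t. Splitting the first argument and casting keeps PADDLE_ENFORCE_EQ from comparing signed against unsigned. A minimal standalone sketch of the pitfall the cast sidesteps (plain C++, not Paddle code; the values are made up):

#include <cstdint>
#include <cstdio>

int main() {
  int64_t dim0 = -1;  // hypothetical bad value in a DDim slot
  size_t back = 10;   // LoD offsets are unsigned

  // Mixed comparison: dim0 converts to unsigned, -1 wraps to 2^64 - 1,
  // so the check is false (and -Wsign-compare warns).
  bool mixed = dim0 < back;
  // Casting back to int64_t compares like types and behaves as intended.
  bool casted = dim0 < static_cast<int64_t>(back);

  printf("mixed=%d casted=%d\n", mixed, casted);  // prints mixed=0 casted=1
  return 0;
}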
@@ -80,17 +81,17 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                       "The input padding should be a 3-D Tensor of shape "
                       "[max_sequence_length, num_sequences, sequence_width].");
 
-    size_t max_sequence_length = MaximumSequenceLength(lod, level);
+    int64_t max_sequence_length = MaximumSequenceLength(lod, level);
     PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
                       "The first dimension of Tensor padding should be the "
                       "maximum length of all sequences in LoDTensor seq.");
 
-    const size_t num_sequences = abs_offset_lod[level].size() - 1;
+    const int64_t num_sequences = abs_offset_lod[level].size() - 1;
     PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
                       "The second dimension of Tensor padding should be the "
                       "number of sequences in LoDTensor seq.");
 
-    const size_t sequence_width = seq.numel() / seq_dims[0];
+    const int64_t sequence_width = seq.numel() / seq_dims[0];
     PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
                       "The third dimension of Tensor padding should be the "
                       "width of sequence in LoDTensor seq.");
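The three size_t -> int64_t changes above serve the same purpose: padding_dims[0..2] are int64_t (DDim elements), so the locals they are checked against now carry the same type. For reference, a tiny standalone illustration of the three quantities being compared (made-up LoD values, plain C++ rather than Paddle's MaximumSequenceLength helper):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Absolute offsets for 3 sequences of lengths 4, 1, and 3 (8 rows total).
  std::vector<size_t> abs_offset = {0, 4, 5, 8};

  int64_t num_sequences = abs_offset.size() - 1;  // 3
  int64_t max_sequence_length = 0;
  for (size_t i = 1; i < abs_offset.size(); ++i) {
    int64_t len = abs_offset[i] - abs_offset[i - 1];
    if (len > max_sequence_length) max_sequence_length = len;
  }

  // If seq is [8, width], padding must be
  // [max_sequence_length, num_sequences, width] = [4, 3, width].
  printf("max_len=%lld num_seqs=%lld\n", (long long)max_sequence_length,
         (long long)num_sequences);
  return 0;
}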
@@ -101,7 +102,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
       return;
     }
 
-    const size_t kBlockSize = 512;
+    const int64_t kBlockSize = 512;
 
     /* At least use 32 threads to copy sequence_width elements,
      * and at least 8 elements for each thread.
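kBlockSize becomes int64_t here as well, so the block-shape arithmetic below it stays in one signed type. One plausible reading of the comment that follows, as a launch configuration (the rounding scheme and names are illustrative, not necessarily this file's exact arithmetic):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t kBlockSize = 512;
  int64_t sequence_width = 300;  // made-up example width

  // Cover sequence_width at >= 8 elements per thread, rounded up to a
  // multiple of 32 (one warp), capped at kBlockSize threads in x.
  int64_t threads_x = ((sequence_width + 7) / 8 + 31) / 32 * 32;
  threads_x = std::max<int64_t>(32, std::min(threads_x, kBlockSize));
  // Spend the rest of the block on the y dimension.
  int64_t threads_y = kBlockSize / threads_x;

  printf("threads = (%lld, %lld)\n", (long long)threads_x,
         (long long)threads_y);  // (64, 8) for width 300
  return 0;
}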
@@ -143,7 +144,8 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
 
     auto seq_dims = seq.dims();
-    PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(),
+    PADDLE_ENFORCE_EQ(seq_dims[0],
+                      static_cast<int64_t>(abs_offset_lod[level].back()),
                       "The first dimension of LoDTensor seq should be "
                       "equal to the sum of all sequences's length.");
 
@@ -152,17 +154,17 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                       "The input padding should be a 3-D Tensor of shape "
                       "[max_sequnece_length, num_sequences, sequence_width].");
 
-    size_t max_sequence_length = MaximumSequenceLength(lod, level);
+    int64_t max_sequence_length = MaximumSequenceLength(lod, level);
     PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
                       "The first dimension of Tensor padding should be "
                       "the maximum length of all sequences in LoDTensor seq.");
 
-    const size_t num_sequences = abs_offset_lod[level].size() - 1;
+    const int64_t num_sequences = abs_offset_lod[level].size() - 1;
     PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
                       "The second dimension of Tensor padding should be "
                       "the number of sequences in LoDTensor seq.");
 
-    const size_t sequence_width = seq.numel() / seq_dims[0];
+    const int64_t sequence_width = seq.numel() / seq_dims[0];
     PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
                       "The third dimension of Tensor padding should be the "
                       "width of sequence in LoDTensor seq.");
@@ -173,7 +175,7 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
       return;
     }
 
-    const size_t kBlockSize = 512;
+    const int64_t kBlockSize = 512;
 
     /* At least use 32 threads to copy sequence_width elements,
      * and at least 8 elements for each thread.