@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
@ -88,6 +89,49 @@ void GetOutputOffset(const framework::Vector<size_t>& x_lod,
}
}
template <typename T>
static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context,
const LoDTensor& x, LoDTensor* out,
const framework::Vector<size_t>& x_lod,
const framework::Vector<size_t>& ref_lod,
bool do_copy) {
auto out_data = out->data<T>();
auto x_data = x.data<T>();
auto& gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
int x_item_length = x.numel() / x.dims()[0];
int out_offset = 0;
int num_copys = 0;
for (size_t i = 1; i < ref_lod.size(); ++i) {
int repeat_num = ref_lod[i] - ref_lod[i - 1];
int x_start = x_lod[i - 1];
int x_end = x_lod[i];
int x_seq_len = x_end - x_start;
if (repeat_num > 0) {
if (do_copy) {
int out_start = out_offset;
if (out->lod().size() == 1) {
out_start = out->lod()[0][out_offset];
}
for (int j = 0; j < repeat_num; j++) {
for (int k = 0; k < x_seq_len; k++) {
memory::Copy(
gpu_place,
out_data + (out_start + j * x_seq_len + k) * x_item_length,
gpu_place, x_data + (x_start + k) * x_item_length,
sizeof(T) * x_item_length, context.stream());
}
}
} else {
num_copys += repeat_num * x_seq_len;
}
}
out_offset += repeat_num;
}
return num_copys;
}
template <typename T>
struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
void operator()(
@ -95,22 +139,40 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* out) {
int x_item_length = x.numel() / x.dims()[0];
framework::Vector<size_t> out_offset(x_lod.size());
GetOutputOffset(x_lod, ref_lod, &out_offset);
int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
int thread_y = 16;
int thread_z = 1024 / thread_x / thread_y;
int block_x = static_cast<int>(ref_lod.size());
dim3 block_size(thread_x, thread_y, thread_z);
dim3 grid_size(block_x, 1);
int num_copys =
ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, false);
// Sometimes direct copies will be faster, this maybe need deeply analysis.
if (num_copys < 5) {
ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, true);
} else {
int x_item_length = x.numel() / x.dims()[0];
size_t x_lod_size = x_lod.size();
framework::Vector<size_t> out_offset(x_lod_size * 2 + ref_lod.size());
GetOutputOffset(x_lod, ref_lod, &out_offset);
for (size_t i = 0; i < x_lod_size; ++i) {
out_offset[x_lod_size + i] = x_lod[i];
}
for (size_t i = 0; i < ref_lod.size(); ++i) {
out_offset[2 * x_lod_size + i] = ref_lod[i];
}
sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
x.data<T>(), x_lod.CUDAData(context.GetPlace()),
ref_lod.CUDAData(context.GetPlace()),
out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
out->mutable_data<T>(context.GetPlace()));
const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace());
const size_t* x_lod_data = out_offset_data + x_lod_size;
const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size;
int thread_x =
std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
int thread_y = 16;
int thread_z = 1024 / thread_x / thread_y;
int block_x = static_cast<int>(ref_lod.size());
dim3 block_size(thread_x, thread_y, thread_z);
dim3 grid_size(block_x, 1);
sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
x.data<T>(), x_lod_data, ref_lod_data, out_offset_data, x_lod_size,
x_item_length, out->mutable_data<T>(context.GetPlace()));
}
}
};