@@ -20,18 +20,10 @@ namespace math {
 template <typename T>
 __global__ void SequenceScaleKernel(T* seq, size_t* lod, const T* scales,
-                                    const size_t num_seq,
                                     const size_t seq_width) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-
-  if (idx < lod[num_seq] * seq_width) {
-    size_t i = 0;
-    for (i = 0; i < num_seq; ++i) {
-      if (idx < lod[i + 1] * seq_width) {
-        break;
-      }
-    }
-    seq[idx] *= scales[i];
+  if (threadIdx.x < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * seq_width) {
+    int idx = lod[blockIdx.x] * seq_width + threadIdx.x;
+    seq[idx] *= scales[blockIdx.x];
   }
 }
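
The new kernel replaces the per-thread linear search over lod with a one-block-per-sequence layout: blockIdx.x selects the sequence and threadIdx.x its element. For reference, a minimal host-side sketch of the computation both versions perform (the helper and its names are illustrative, not part of the patch):

#include <cstddef>

// Host-side reference: scale every element of sequence i by scales[i].
// lod holds absolute offsets, so sequence i occupies rows [lod[i], lod[i+1]).
template <typename T>
void SequenceScaleRef(T* seq, const size_t* lod, const T* scales,
                      size_t num_seq, size_t seq_width) {
  for (size_t i = 0; i < num_seq; ++i) {
    for (size_t e = lod[i] * seq_width; e < lod[i + 1] * seq_width; ++e) {
      seq[e] *= scales[i];
    }
  }
}

Note that the new guard, threadIdx.x < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * seq_width, lets a block touch at most blockDim.x elements, so this version assumes each sequence fits within a single thread block.
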
@@ -39,18 +31,17 @@ template <typename T>
 class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  framework::LoDTensor& seq, const T* scales,
-                  const size_t num_seq) {
-    auto lod = seq.lod();
-    const size_t seq_width = seq.dims()[1];
+                  framework::LoDTensor& seq, const T* scales) {
+    const size_t level = 0;
+    auto lod = seq.lod();
+    const size_t num_seq = lod[level].size() - 1;
+    const size_t seq_width = seq.numel() / seq.dims()[0];
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
     T* seq_data = seq.mutable_data<T>(context.GetPlace());
 
     int threads = 1024;
-    int grid = (seq.numel() * seq_width + threads - 1) / threads;
-    SequenceScaleKernel<T><<<grid, threads, 0, context.stream()>>>(
-        seq_data, abs_offset_lod[level].data(), scales, num_seq, seq_width);
+    SequenceScaleKernel<T><<<num_seq, threads, 0, context.stream()>>>(
+        seq_data, abs_offset_lod[level].data(), scales, seq_width);
   }
 };
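
After this change the functor derives num_seq from the level-0 LoD and seq_width from the tensor shape, so callers drop both arguments. The launch configuration changes accordingly, from a flat grid over elements (whose removed formula appears to have multiplied numel by seq_width a second time) to one block per sequence. A hypothetical call site under these assumptions (identifiers illustrative, not from the patch):

// Hypothetical caller: scale each sequence of a LoDTensor in place.
// dev_ctx, seq_tensor, and scales_data are assumed to exist at the call site.
math::ScaleLoDTensorFunctor<platform::CUDADeviceContext, float> scale;
scale(dev_ctx, seq_tensor, scales_data);  // sizes now read from seq_tensor's LoD
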