|
|
|
@ -67,22 +67,6 @@ __global__ void SumSelectedRowsCUDAKernel(T **sr_in_out, int64_t N,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Adds two buffers element-wise, four floats at a time.
 *
 * The inputs and the output are viewed as arrays of float4. Each thread
 * starts at its global thread index and advances by the total number of
 * launched threads (blockDim.x * gridDim.x) until N / 4 vectors have been
 * processed, so any 1-D launch configuration covers the whole range.
 *
 * @param in_0 First input buffer.
 * @param in_1 Second input buffer.
 * @param out  Output buffer; out[k] = in_0[k] + in_1[k] for
 *             k in [0, 4 * (N / 4)).
 * @param N    Number of scalar elements. Only the first 4 * (N / 4) elements
 *             are written; a remainder of N % 4 elements is left untouched.
 *
 * NOTE(review): the float4 view is only meaningful when T is float and all
 * three pointers are 16-byte aligned -- presumably guaranteed by the caller;
 * confirm at the launch site.
 */
template <class T>
__global__ void SumAlign4CUDAKernel(const T *in_0, const T *in_1, T *out,
                                    int64_t N) {
  // reinterpret_cast must not drop const, so the input views stay
  // const-qualified. The casts are loop-invariant and done once per thread.
  const float4 *in0_4 = reinterpret_cast<const float4 *>(in_0);
  const float4 *in1_4 = reinterpret_cast<const float4 *>(in_1);
  float4 *out_4 = reinterpret_cast<float4 *>(out);

  // Index and stride are computed in 64 bits so that they cannot wrap when
  // N / 4 or the launched thread count exceeds the 32-bit range.
  const int64_t vec_count = N / 4;
  const int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
  for (int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       i < vec_count; i += stride) {
    // Load both operands before storing so in-place use (out aliasing an
    // input) behaves the same as a component-by-component read.
    const float4 a = in0_4[i];
    const float4 b = in1_4[i];
    float4 tmp;
    tmp.x = a.x + b.x;
    tmp.y = a.y + b.y;
    tmp.z = a.z + b.z;
    tmp.w = a.w + b.w;
    out_4[i] = tmp;
  }
}
|
|
|
|
|
|
|
|
|
|
template <class T>
|
|
|
|
|
void SumToLoDTensor(const framework::ExecutionContext &context) {
|
|
|
|
|
auto in_vars = context.MultiInputVar("X");
|
|
|
|
|