|
|
@ -19,7 +19,7 @@
|
|
|
|
#include <cuda_runtime.h>
|
|
|
|
#include <cuda_runtime.h>
|
|
|
|
#include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
|
|
|
|
#include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
|
|
|
|
template <typename T>
|
|
|
|
template <typename T>
|
|
|
|
__global__ void Concat(const int size, const int input_num,
|
|
|
|
__global__ void Concat(const size_t size, const int input_num,
|
|
|
|
const int all_size_before_axis, const int all_size_axis,
|
|
|
|
const int all_size_before_axis, const int all_size_axis,
|
|
|
|
int* len_axis, T** inputs, T* output) {
|
|
|
|
int* len_axis, T** inputs, T* output) {
|
|
|
|
for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
|
|
|
|
for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
|
|
|
@ -45,7 +45,7 @@ __global__ void Concat(const int size, const int input_num,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
template <typename T>
|
|
|
|
void ConcatKernel(const int size, const int input_num,
|
|
|
|
void ConcatKernel(const size_t size, const int input_num,
|
|
|
|
const int all_size_before_axis, const int all_size_axis,
|
|
|
|
const int all_size_before_axis, const int all_size_axis,
|
|
|
|
int* len_axis, T** inputs, T* output,
|
|
|
|
int* len_axis, T** inputs, T* output,
|
|
|
|
cudaStream_t cuda_stream) {
|
|
|
|
cudaStream_t cuda_stream) {
|
|
|
@ -55,15 +55,15 @@ void ConcatKernel(const int size, const int input_num,
|
|
|
|
return;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
template void ConcatKernel(const int size, const int input_num,
|
|
|
|
template void ConcatKernel(const size_t size, const int input_num,
|
|
|
|
const int all_size_before_axis, const int all_size_axis,
|
|
|
|
const int all_size_before_axis, const int all_size_axis,
|
|
|
|
int* len_axis, float** inputs, float* output,
|
|
|
|
int* len_axis, float** inputs, float* output,
|
|
|
|
cudaStream_t cuda_stream);
|
|
|
|
cudaStream_t cuda_stream);
|
|
|
|
template void ConcatKernel(const int size, const int input_num,
|
|
|
|
template void ConcatKernel(const size_t size, const int input_num,
|
|
|
|
const int all_size_before_axis, const int all_size_axis,
|
|
|
|
const int all_size_before_axis, const int all_size_axis,
|
|
|
|
int* len_axis, int** inputs, int* output,
|
|
|
|
int* len_axis, int** inputs, int* output,
|
|
|
|
cudaStream_t cuda_stream);
|
|
|
|
cudaStream_t cuda_stream);
|
|
|
|
template void ConcatKernel(const int size, const int input_num,
|
|
|
|
template void ConcatKernel(const size_t size, const int input_num,
|
|
|
|
const int all_size_before_axis, const int all_size_axis,
|
|
|
|
const int all_size_before_axis, const int all_size_axis,
|
|
|
|
int* len_axis, half** inputs, half* output,
|
|
|
|
int* len_axis, half** inputs, half* output,
|
|
|
|
cudaStream_t cuda_stream);
|
|
|
|
cudaStream_t cuda_stream);
|
|
|
|