@@ -15,7 +15,7 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
 #include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -24,7 +24,7 @@ namespace operators {
 namespace math {
 
 template <typename T>
-__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
+__global__ void ConcatKernel(T** inputs, const int* input_cols, int col_size,
                              const int output_rows, const int output_cols,
                              T* output) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -50,7 +50,7 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
 }
 
 template <typename T>
-__global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
+__global__ void ConcatKernel(T** inputs_data, const int fixed_in_col,
                              const int out_rows, const int out_cols,
                              T* output_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -67,7 +67,7 @@ __global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
 }
 
 template <typename T>
-__global__ void KernelConcatGrad(const T* input_data, const int in_row,
-                                 const int in_col, const int* out_cols,
-                                 int out_cols_size, T** outputs_data) {
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int* out_cols,
+                            int out_cols_size, T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -94,7 +94,7 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
 }
 
 template <typename T>
-__global__ void KernelConcatGrad(const T* input_data, const int in_row,
-                                 const int in_col, const int fixed_out_col,
-                                 T** outputs_data) {
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int fixed_out_col,
+                            T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -170,11 +170,11 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
     if (sameShape) {
-      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
+      ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
          dev_ins_data, in_col, out_row, out_col, output->data<T>());
     } else {
       const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
-      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
+      ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
          dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
          out_row, out_col, output->data<T>());
     }
@@ -189,7 +189,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
  * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
-class ConcatGradFunctor<platform::CUDADeviceContext, T> {
+class SplitFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
@@ -248,11 +248,11 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
     if (sameShape) {
-      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
+      SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
          input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
     } else {
       const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
-      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
+      SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
          input.data<T>(), in_row, in_col, dev_outs_col_data,
          static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
     }
@@ -264,7 +264,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 
 #define DEFINE_FUNCTOR(type) \
   template class ConcatFunctor<platform::CUDADeviceContext, type>; \
-  template class ConcatGradFunctor<platform::CUDADeviceContext, type>
+  template class SplitFunctor<platform::CUDADeviceContext, type>
 
 FOR_ALL_TYPES(DEFINE_FUNCTOR);
 
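
Note for reviewers: both concat kernels flatten every tensor to a 2-D view (rows before the concat axis, columns from the axis onward) and copy elements through a 2-D grid-stride loop. The following standalone sketch of that indexing is not code from this diff; it is a simplified illustration with a hypothetical name (ConcatKernelSketch), float elements only, and same-shape inputs as in the fixed-column ConcatKernel variant above.

#include <cuda_runtime.h>

// Sketch: with num_inputs same-shape inputs, output column x belongs to
// input x / in_col at local column x % in_col. Each thread strides over
// columns (x) and rows (y), so any launch configuration covers the output.
__global__ void ConcatKernelSketch(const float* const* inputs, int num_inputs,
                                   int rows, int in_col, float* output) {
  int out_col = num_inputs * in_col;
  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < out_col;
       x += blockDim.x * gridDim.x) {
    const float* src = inputs[x / in_col];  // input that owns column x
    int local = x % in_col;                 // column inside that input
    for (int y = blockIdx.y * blockDim.y + threadIdx.y; y < rows;
         y += blockDim.y * gridDim.y) {
      output[y * out_col + x] = src[y * in_col + local];
    }
  }
}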
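The split path (the former concat-gradient path) is the exact inverse mapping: each thread reads one input element and writes it into whichever output owns that column. A matching sketch of the fixed-column SplitKernel case, again a hypothetical simplification rather than the code in this diff; the nullptr guard is an assumption about callers that skip outputs whose gradient is not needed.

#include <cuda_runtime.h>

// Inverse of the concat sketch: input column x goes to output
// x / fixed_out_col at local column x % fixed_out_col.
__global__ void SplitKernelSketch(const float* input, int in_row, int in_col,
                                  int fixed_out_col, float* const* outputs) {
  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < in_col;
       x += blockDim.x * gridDim.x) {
    float* dst = outputs[x / fixed_out_col];  // output that owns column x
    int local = x % fixed_out_col;
    if (dst != nullptr) {  // assumed: null means this output is skipped
      for (int y = blockIdx.y * blockDim.y + threadIdx.y; y < in_row;
           y += blockDim.y * gridDim.y) {
        dst[y * fixed_out_col + local] = input[y * in_col + x];
      }
    }
  }
}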