@@ -17,6 +17,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/framework/ddim.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/strided_memcpy.h"

 namespace paddle {
 namespace operators {
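Note: the `before` / `out_after` arithmetic in the hunks below relies on the suffix-product convention of `framework::stride_numel()`. A minimal standalone illustration, assuming `stride_numel` returns `stride[i] = dims[i] * dims[i+1] * ... * dims[rank-1]` (so `stride[0]` is the total element count); the helper name here is hypothetical:

#include <cstdint>
#include <iostream>
#include <vector>

// Suffix-product strides: stride[i] = dims[i] * dims[i+1] * ... * dims[rank-1],
// so stride[0] equals the tensor's total element count (numel).
std::vector<int64_t> StrideNumelSketch(const std::vector<int64_t>& dims) {
  std::vector<int64_t> stride(dims.size());
  int64_t acc = 1;
  for (int i = static_cast<int>(dims.size()) - 1; i >= 0; --i) {
    acc *= dims[i];
    stride[i] = acc;
  }
  return stride;
}

int main() {
  auto stride = StrideNumelSketch({2, 3, 4});    // -> {24, 12, 4}
  int64_t axis = 1;
  std::cout << stride[0] / stride[axis] << "\n";  // "before" slices: 2
  std::cout << stride[axis] << "\n";              // elements per slice: 12
  return 0;
}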
@@ -32,34 +33,13 @@ class ConcatKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(place);

     auto out_stride = framework::stride_numel(out->dims());
-    int64_t before = out_stride[0] / out_stride[axis];
-    int64_t out_after = out_stride[axis];

     size_t output_offset = 0;
     for (auto* in : ins) {
       auto in_stride = framework::stride_numel(in->dims());
-      int64_t in_after = in_stride[axis];
-      for (int64_t i = 0; i < before; ++i) {
-        if (platform::is_cpu_place(place)) {
-          auto& cpu_place = boost::get<platform::CPUPlace>(place);
-          memory::Copy(
-              cpu_place, out->data<T>() + output_offset + i * out_after,
-              cpu_place, in->data<T>() + i * in_after, sizeof(T) * in_after);
-        } else {
-#ifdef PADDLE_WITH_CUDA
-          auto& gpu_place = boost::get<platform::CUDAPlace>(place);
-          auto& cuda_ctx =
-              reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
-          memory::Copy(gpu_place,
-                       out->data<T>() + output_offset + i * out_after,
-                       gpu_place, in->data<T>() + i * in_after,
-                       sizeof(T) * in_after, cuda_ctx.stream());
-#else
-          PADDLE_THROW("Paddle is not compiled with GPU");
-#endif
-        }
-      }
-      output_offset += in_after;
+      StridedNumelCopyWithAxis<T>(ctx, axis, out->data<T>() + output_offset,
+                                  out_stride, in->data<T>(), in_stride);
+      output_offset += in_stride[axis];
     }
   }
 };
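The replacement call folds the deleted per-place copy loop into one helper from strided_memcpy.h. A CPU-only sketch of what that copy presumably does for a single input, inferred from the deleted loop above (the real `StridedNumelCopyWithAxis` also dispatches on the place and takes the execution context; `StridedCopySketch` is a hypothetical name):

#include <cstdint>
#include <cstring>

// Copy one input block into the concatenated output along `axis`.
// Strides follow the stride_numel() convention, so stride[0] is the numel
// and stride[axis] is the number of contiguous elements per outer slice.
template <typename T>
void StridedCopySketch(int64_t axis, T* dst, const int64_t* dst_stride,
                       const T* src, const int64_t* src_stride) {
  int64_t before = dst_stride[0] / dst_stride[axis];  // outer slices
  for (int64_t i = 0; i < before; ++i) {
    // Each source slice lands at the matching destination slice; the wider
    // destination stride leaves room for the other inputs' slices.
    std::memcpy(dst + i * dst_stride[axis], src + i * src_stride[axis],
                sizeof(T) * src_stride[axis]);
  }
}

With this shape, the caller only advances the destination pointer by `in_stride[axis]` per input, exactly as the `output_offset` bookkeeping above does.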
@@ -73,35 +53,13 @@ class ConcatGradKernel : public framework::OpKernel<T> {
     int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
     size_t input_offset = 0;
     auto in_stride = framework::stride_numel(in->dims());
-    auto place = ctx.GetPlace();

-    // numel before the specified axis
-    int64_t before = in_stride[0] / in_stride[axis];
-    int64_t in_after = in_stride[axis];
     for (auto& out : outs) {
       out->mutable_data<T>(ctx.GetPlace());
       auto out_stride = framework::stride_numel(out->dims());
-      int64_t out_after = out_stride[axis];
-      for (int64_t i = 0; i < before; ++i) {
-        if (platform::is_cpu_place(place)) {
-          auto& cpu_place = boost::get<platform::CPUPlace>(place);
-          memory::Copy(cpu_place, out->data<T>() + i * out_after, cpu_place,
-                       in->data<T>() + input_offset + i * in_after,
-                       sizeof(T) * out_after);
-        } else {
-#ifdef PADDLE_WITH_CUDA
-          auto& gpu_place = boost::get<platform::CUDAPlace>(place);
-          auto& cuda_ctx =
-              reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
-          memory::Copy(gpu_place, out->data<T>() + i * out_after, gpu_place,
-                       in->data<T>() + input_offset + i * in_after,
-                       sizeof(T) * out_after, cuda_ctx.stream());
-#else
-          PADDLE_THROW("Paddle is not compiled with GPU");
-#endif
-        }
-      }
-      input_offset += out_after;
+      StridedNumelCopyWithAxis<T>(ctx, axis, out->data<T>(), out_stride,
+                                  in->data<T>() + input_offset, in_stride);
+      input_offset += out_stride[axis];
     }
   }
 };
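The gradient path is the same strided copy with source and destination swapped: d(Out) is split back into the per-input d(X) blocks. A CPU-only sketch under the same assumptions (hypothetical names; the real helper lives in strided_memcpy.h):

#include <cstdint>
#include <cstring>
#include <vector>

// Split the concatenated gradient `dout` back into per-input gradients.
// in_stride describes dout; out_strides[k] describes the k-th output block.
template <typename T>
void SplitGradSketch(int64_t axis, const T* dout, const int64_t* in_stride,
                     const std::vector<T*>& douts,
                     const std::vector<const int64_t*>& out_strides) {
  int64_t before = in_stride[0] / in_stride[axis];  // outer slices
  int64_t input_offset = 0;
  for (size_t k = 0; k < douts.size(); ++k) {
    int64_t out_after = out_strides[k][axis];
    for (int64_t i = 0; i < before; ++i) {
      std::memcpy(douts[k] + i * out_after,
                  dout + input_offset + i * in_stride[axis],
                  sizeof(T) * out_after);
    }
    input_offset += out_after;  // skip past this block inside each dout slice
  }
}

Either way, the refactor moves the CPU/GPU dispatch and the `before` loop out of both kernels into one shared helper, which is why each hunk shrinks from roughly 34 lines to 13.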