|
|
@ -18,6 +18,7 @@
|
|
|
|
#include "paddle/fluid/framework/operator.h"
|
|
|
|
#include "paddle/fluid/framework/operator.h"
|
|
|
|
#include "paddle/fluid/framework/var_type.h"
|
|
|
|
#include "paddle/fluid/framework/var_type.h"
|
|
|
|
#include "paddle/fluid/operators/math/math_function.h"
|
|
|
|
#include "paddle/fluid/operators/math/math_function.h"
|
|
|
|
|
|
|
|
#include "paddle/fluid/platform/device_memory_aligment.h"
|
|
|
|
|
|
|
|
|
|
|
|
namespace paddle {
|
|
|
|
namespace paddle {
|
|
|
|
namespace operators {
|
|
|
|
namespace operators {
|
|
|
@ -26,7 +27,7 @@ static framework::proto::VarType::Type kDefaultDtype =
|
|
|
|
framework::proto::VarType::Type::VarType_Type_BOOL;
|
|
|
|
framework::proto::VarType::Type::VarType_Type_BOOL;
|
|
|
|
|
|
|
|
|
|
|
|
template <typename DeviceContext, typename T>
|
|
|
|
template <typename DeviceContext, typename T>
|
|
|
|
class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
|
|
|
|
class CoalesceTensorOp : public framework::OpKernel<T> {
|
|
|
|
public:
|
|
|
|
public:
|
|
|
|
void Compute(const framework::ExecutionContext &context) const override {
|
|
|
|
void Compute(const framework::ExecutionContext &context) const override {
|
|
|
|
auto &in_var_names = context.Inputs("Input");
|
|
|
|
auto &in_var_names = context.Inputs("Input");
|
|
|
@ -86,8 +87,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
|
|
|
|
framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
|
|
|
|
framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
|
|
|
|
&sub_tensor);
|
|
|
|
&sub_tensor);
|
|
|
|
|
|
|
|
|
|
|
|
offset +=
|
|
|
|
offset += platform::Alignment(len * size_of_dtype, context.GetPlace()) /
|
|
|
|
Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
|
|
|
|
size_of_dtype;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (context.Attr<bool>("set_constant")) {
|
|
|
|
} else if (context.Attr<bool>("set_constant")) {
|
|
|
|
math::SetConstant<DeviceContext, T> set_constant;
|
|
|
|
math::SetConstant<DeviceContext, T> set_constant;
|
|
|
@ -106,7 +107,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
|
|
|
|
->ShareDataWith(fused_tensor->Slice(
|
|
|
|
->ShareDataWith(fused_tensor->Slice(
|
|
|
|
static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
|
|
|
|
static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
|
|
|
|
.Resize(dim);
|
|
|
|
.Resize(dim);
|
|
|
|
len = Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
|
|
|
|
len = platform::Alignment(len * size_of_dtype, context.GetPlace()) /
|
|
|
|
|
|
|
|
size_of_dtype;
|
|
|
|
offset += len;
|
|
|
|
offset += len;
|
|
|
|
ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
|
|
|
|
ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
|
|
|
|
<< " address: " << out_tensors[i]->data<void>() << ", ";
|
|
|
|
<< " address: " << out_tensors[i]->data<void>() << ", ";
|
|
|
@ -115,19 +117,6 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
private:
|
|
|
|
// Note(zcd): Addresses should be aligned, otherwise, the results may have
|
|
|
|
|
|
|
|
// diff.
|
|
|
|
|
|
|
|
size_t Alignment(size_t size, const platform::Place &place) const {
|
|
|
|
|
|
|
|
// Allow to allocate the minimum chunk size is 4 KB.
|
|
|
|
|
|
|
|
size_t alignment = 1 << 12;
|
|
|
|
|
|
|
|
if (platform::is_gpu_place(place)) {
|
|
|
|
|
|
|
|
// Allow to allocate the minimum chunk size is 256 B.
|
|
|
|
|
|
|
|
alignment = 1 << 8;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t remaining = size % alignment;
|
|
|
|
|
|
|
|
return remaining == 0 ? size : size + (alignment - remaining);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void GetMemSizeAndDtype(
|
|
|
|
void GetMemSizeAndDtype(
|
|
|
|
const std::vector<const framework::LoDTensor *> &lod_tensors,
|
|
|
|
const std::vector<const framework::LoDTensor *> &lod_tensors,
|
|
|
|
const std::vector<std::string> var_names, size_t *numel,
|
|
|
|
const std::vector<std::string> var_names, size_t *numel,
|
|
|
@ -156,7 +145,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
|
|
|
|
PADDLE_ENFORCE_GT(size, 0);
|
|
|
|
PADDLE_ENFORCE_GT(size, 0);
|
|
|
|
ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
|
|
|
|
ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
|
|
|
|
<< "), ";
|
|
|
|
<< "), ";
|
|
|
|
*numel += Alignment(static_cast<size_t>(size) * size_of_dtype, place) /
|
|
|
|
*numel += platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
|
|
|
|
|
|
|
|
place) /
|
|
|
|
size_of_dtype;
|
|
|
|
size_of_dtype;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -176,17 +166,17 @@ class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
|
|
|
|
void Make() override {
|
|
|
|
void Make() override {
|
|
|
|
AddInput("Input",
|
|
|
|
AddInput("Input",
|
|
|
|
"(vector<LoDTensor>) The input tensors of"
|
|
|
|
"(vector<LoDTensor>) The input tensors of"
|
|
|
|
" alloc_continuous_space operator.")
|
|
|
|
" coalesce_tensor operator.")
|
|
|
|
.AsDuplicable();
|
|
|
|
.AsDuplicable();
|
|
|
|
AddOutput("Output",
|
|
|
|
AddOutput("Output",
|
|
|
|
"(vector<LoDTensor>) The output "
|
|
|
|
"(vector<LoDTensor>) The output "
|
|
|
|
"tensors of alloc_continuous_space operator. And the address "
|
|
|
|
"tensors of coalesce_tensor operator. And the address "
|
|
|
|
"of output tensors are continuous, they are sliced from the "
|
|
|
|
"of output tensors are continuous, they are sliced from the "
|
|
|
|
"tensor of FusedOutput.")
|
|
|
|
"tensor of FusedOutput.")
|
|
|
|
.AsDuplicable();
|
|
|
|
.AsDuplicable();
|
|
|
|
AddOutput("FusedOutput",
|
|
|
|
AddOutput("FusedOutput",
|
|
|
|
"(LoDTensor) The output tensor "
|
|
|
|
"(LoDTensor) The output tensor "
|
|
|
|
"of alloc_continuous_space operator. And the tensors of"
|
|
|
|
"of coalesce_tensor operator. And the tensors of"
|
|
|
|
" Output is sliced from the tensor of FusedOutput.");
|
|
|
|
" Output is sliced from the tensor of FusedOutput.");
|
|
|
|
AddAttr<bool>("copy_data", "Whether to copy the Input value to Output.")
|
|
|
|
AddAttr<bool>("copy_data", "Whether to copy the Input value to Output.")
|
|
|
|
.SetDefault(false);
|
|
|
|
.SetDefault(false);
|
|
|
@ -204,7 +194,7 @@ class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
|
|
|
|
AddComment(R"DOC(
|
|
|
|
AddComment(R"DOC(
|
|
|
|
AllocContinuousSpace Operator.
|
|
|
|
AllocContinuousSpace Operator.
|
|
|
|
|
|
|
|
|
|
|
|
alloc_continuous_space is used to make the address of Output
|
|
|
|
coalesce_tensor is used to make the address of Output
|
|
|
|
continuous according to the Input. This Op will alloc a big tensor
|
|
|
|
continuous according to the Input. This Op will alloc a big tensor
|
|
|
|
according to the tensors of Input, the dtype is the same with those input tensors,
|
|
|
|
according to the tensors of Input, the dtype is the same with those input tensors,
|
|
|
|
the size is the sum of those input tensors' numel, and the dim of the big
|
|
|
|
the size is the sum of those input tensors' numel, and the dim of the big
|
|
|
@ -213,7 +203,7 @@ The tensors of Output are sliced from the tensor of FusedOutput.
|
|
|
|
Note that, the dtype of Input should be the same, and the dim of Input
|
|
|
|
Note that, the dtype of Input should be the same, and the dim of Input
|
|
|
|
and Output should equal.
|
|
|
|
and Output should equal.
|
|
|
|
The tensors of Input and Output could be the same or different. And
|
|
|
|
The tensors of Input and Output could be the same or different. And
|
|
|
|
alloc_continuous_space allows copying the value of Input to Output, or
|
|
|
|
coalesce_tensor allows copying the value of Input to Output, or
|
|
|
|
setting the Output with a constant value.
|
|
|
|
setting the Output with a constant value.
|
|
|
|
|
|
|
|
|
|
|
|
)DOC");
|
|
|
|
)DOC");
|
|
|
@ -223,27 +213,22 @@ setting the Output with a constant value.
|
|
|
|
} // namespace operators
|
|
|
|
} // namespace operators
|
|
|
|
} // namespace paddle
|
|
|
|
} // namespace paddle
|
|
|
|
|
|
|
|
|
|
|
|
REGISTER_OPERATOR(alloc_continuous_space,
|
|
|
|
REGISTER_OPERATOR(coalesce_tensor, paddle::operators::AllocContinuousSpaceOp,
|
|
|
|
paddle::operators::AllocContinuousSpaceOp,
|
|
|
|
|
|
|
|
paddle::operators::AllocContinuousSpaceOpMaker);
|
|
|
|
paddle::operators::AllocContinuousSpaceOpMaker);
|
|
|
|
namespace ops = paddle::operators;
|
|
|
|
namespace ops = paddle::operators;
|
|
|
|
namespace plat = paddle::platform;
|
|
|
|
namespace plat = paddle::platform;
|
|
|
|
REGISTER_OP_CPU_KERNEL(
|
|
|
|
REGISTER_OP_CPU_KERNEL(
|
|
|
|
alloc_continuous_space,
|
|
|
|
coalesce_tensor,
|
|
|
|
ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext,
|
|
|
|
ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, plat::float16>,
|
|
|
|
plat::float16>,
|
|
|
|
ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, int>,
|
|
|
|
ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, int>,
|
|
|
|
ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, float>,
|
|
|
|
ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, float>,
|
|
|
|
ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, double>);
|
|
|
|
ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext,
|
|
|
|
|
|
|
|
double>);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef PADDLE_WITH_CUDA
|
|
|
|
#ifdef PADDLE_WITH_CUDA
|
|
|
|
REGISTER_OP_CUDA_KERNEL(
|
|
|
|
REGISTER_OP_CUDA_KERNEL(
|
|
|
|
alloc_continuous_space,
|
|
|
|
coalesce_tensor,
|
|
|
|
ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext,
|
|
|
|
ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, plat::float16>,
|
|
|
|
plat::float16>,
|
|
|
|
ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, int>,
|
|
|
|
ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, int>,
|
|
|
|
ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, float>,
|
|
|
|
ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, float>,
|
|
|
|
ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, double>);
|
|
|
|
ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext,
|
|
|
|
|
|
|
|
double>);
|
|
|
|
|
|
|
|
#endif
|
|
|
|
#endif
|