@@ -28,16 +28,35 @@ namespace distributed {
 
 using VarMsg = sendrecv::VariableMessage;
 
+static std::shared_ptr<memory::Allocation> GetCommunicationAllocationFromTensor(
+    const platform::DeviceContext& ctx, const framework::Tensor& tensor) {
+  if (is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
-void* GetVarPayLoad(const std::string varname, int64_t size) {
-  platform::CUDAPinnedPlace cuda_pinned;
-  return memory::Alloc(cuda_pinned, size);
-}
-#endif
+    PADDLE_ENFORCE(is_gpu_place(tensor.place()));
+    auto& gpu_dev_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
+    auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
+    platform::CUDAPinnedPlace cuda_pinned;
+    auto result = memory::AllocShared(
+        cuda_pinned, copy_size, memory::allocation::Allocator::kCrossDevice);
 
-void GetTensorPayload(framework::Variable* var,
-                      const platform::DeviceContext& ctx, VarMsg* request,
-                      void** payload, size_t* payload_size) {
+    memory::Copy(cuda_pinned, result->ptr(),
+                 boost::get<platform::CUDAPlace>(tensor.place()),
+                 reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
+                 gpu_dev_ctx.stream());
+
+    ctx.Wait();
+    return result;
+#else
+    return nullptr;  // THIS SHOULD NOT HAPPEN.
+#endif
+  } else {
+    return tensor.Holder();
+  }
+}
+std::shared_ptr<memory::Allocation> GetTensorPayload(
+    framework::Variable* var, const platform::DeviceContext& ctx,
+    VarMsg* request) {
   auto tensor = var->Get<framework::LoDTensor>();
   // FIXME(wuyi): data types in send_recv.proto is copied from
   // framework.proto
@@ -56,31 +75,12 @@ void GetTensorPayload(framework::Variable* var,
       }
     }
   }
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
-    // platform::CUDAPinnedPlace cuda_pinned;
-    auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
-    auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
-    *payload = GetVarPayLoad(request->varname(), copy_size);
-
-    platform::CUDAPinnedPlace cuda_pinned;
-    memory::Copy(cuda_pinned, *payload,
-                 boost::get<platform::CUDAPlace>(tensor.place()),
-                 reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
-                 gpu_dev_ctx.stream());
-
-    ctx.Wait();
-#endif
-  } else {
-    *payload = tensor.data<void>();
-  }
-  *payload_size = tensor.numel() * framework::SizeOfType(tensor.type());
+  return GetCommunicationAllocationFromTensor(ctx, tensor);
 }
 
-void GetSelectedRowsPayload(framework::Variable* var,
-                            const platform::DeviceContext& ctx, VarMsg* request,
-                            void** payload, size_t* payload_size) {
+std::shared_ptr<memory::Allocation> GetSelectedRowsPayload(
+    framework::Variable* var, const platform::DeviceContext& ctx,
+    VarMsg* request) {
   auto* slr = var->GetMutable<framework::SelectedRows>();
   request->set_data_type(
       static_cast<VarMsg::Type>(framework::ToDataType(slr->value().type())));
@@ -92,23 +92,7 @@ void GetSelectedRowsPayload(framework::Variable* var,
   }
 
   auto* tensor = slr->mutable_value();
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-    auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
-    auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type());
-    *payload = GetVarPayLoad(request->varname(), copy_size);
-
-    platform::CUDAPinnedPlace cuda_pinned;
-    memory::Copy(cuda_pinned, *payload,
-                 boost::get<platform::CUDAPlace>(tensor->place()),
-                 reinterpret_cast<const void*>(tensor->data<void>()), copy_size,
-                 gpu_dev_ctx.stream());
-    ctx.Wait();
-#endif
-  } else {
-    *payload = slr->mutable_value()->data<void>();
-  }
-  *payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
+  return GetCommunicationAllocationFromTensor(ctx, *tensor);
 }
 
 } // namespace distributed
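
For context, a minimal sketch of how a caller might consume the refactored helpers after this patch; the wrapper function name and the exact call site are assumptions for illustration, not part of the diff above:

// Illustrative sketch (assumed call site, not part of this diff): the payload
// helpers now hand back an owning allocation instead of filling a raw pointer.
void SerializeVarPayload(framework::Variable* var,
                         const platform::DeviceContext& ctx, VarMsg* request) {
  std::shared_ptr<memory::Allocation> payload;
  if (var->IsType<framework::LoDTensor>()) {
    payload = GetTensorPayload(var, ctx, request);
  } else if (var->IsType<framework::SelectedRows>()) {
    payload = GetSelectedRowsPayload(var, ctx, request);
  }
  // For GPU tensors, payload->ptr() points at the CUDA-pinned host buffer that
  // GetCommunicationAllocationFromTensor just filled via memory::Copy; on CPU
  // it is the tensor's own holder. Keeping the shared_ptr alive until the RPC
  // send completes prevents the buffer from being freed mid-transfer.
}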