@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/operators/reader/buffered_reader.h"
 #include <vector>
+#include "paddle/fluid/framework/data_type.h"
 
 namespace paddle {
 namespace operators {
@@ -24,6 +25,12 @@ BufferedReader::~BufferedReader() {
     position_.front().wait();
     position_.pop();
   }
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place_)) {
+    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
+    PADDLE_ENFORCE(cudaStreamDestroy(stream));
+  }
+#endif
 }
 
 BufferedReader::BufferedReader(
@@ -33,6 +40,12 @@ BufferedReader::BufferedReader(
       thread_pool_(1),
       place_(place),
      buffer_size_(buffer_size) {
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place_)) {
+    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
+    PADDLE_ENFORCE(cudaStreamCreate(&stream));
+  }
+#endif
   cpu_buffer_.resize(buffer_size);
   gpu_buffer_.resize(buffer_size);
   ReadTillBufferFullAsync();
@@ -54,14 +67,39 @@ void BufferedReader::ReadAsync(size_t i) {
       return -1UL;
     }
 
+#ifdef PADDLE_WITH_CUDA
+    // NOTE(liangdun): using async copy instead of TensorCopySync
+    // TensorCopySync would block other stream
     if (platform::is_gpu_place(place_)) {
       TensorVec &gpu = gpu_buffer_[i];
       gpu.resize(cpu.size());
       for (size_t i = 0; i < cpu.size(); ++i) {
-        framework::TensorCopySync(cpu[i], place_, &gpu[i]);
+        gpu[i].Resize(cpu[i].dims());
+        gpu[i].set_layout(cpu[i].layout());
+        auto cpu_place = cpu[i].place();
+        auto cpu_ptr = cpu[i].data<void>();
+        auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type());
+        auto size =
+            cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
+        if (platform::is_cuda_pinned_place(cpu_place))
+          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
+                       boost::get<platform::CUDAPinnedPlace>(cpu_place),
+                       cpu_ptr, size, stream);
+        else if ((platform::is_gpu_place(cpu_place)))
+          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
+                       boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
+                       size, stream);
+        else
+          // if cpu place is not pinned, async copy is slower than sync copy,
+          // so we use sync copy instead.
+          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
+                       boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
+                       0);
         gpu[i].set_lod(cpu[i].lod());
       }
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
     }
+#endif
     return i;
   }));
 }
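Note (illustration only, not part of the patch): the change above replaces TensorCopySync with a cudaMemcpyAsync-style copy on a reader-owned stream, so the host-to-device transfer no longer serializes with work on other streams; pinned (cuda_pinned) host memory is what makes the asynchronous path worthwhile, and plain pageable CPU memory falls back to a synchronous copy. The standalone sketch below shows that same pattern with the raw CUDA runtime API; the file name, buffer sizes, and CHECK_CUDA macro are made up for the example.

// async_copy_sketch.cu (hypothetical): pinned host buffer + cudaMemcpyAsync
// on a dedicated stream + stream-local synchronization.
// Build with: nvcc -o async_copy_sketch async_copy_sketch.cu
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>

#define CHECK_CUDA(call)                                                    \
  do {                                                                      \
    cudaError_t err__ = (call);                                             \
    if (err__ != cudaSuccess) {                                             \
      std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err__));  \
      std::exit(EXIT_FAILURE);                                              \
    }                                                                       \
  } while (0)

int main() {
  const size_t n = 1 << 20;
  const size_t bytes = n * sizeof(float);

  // Pinned (page-locked) host buffer: cudaMemcpyAsync only runs truly
  // asynchronously when the host side is pinned, which is why the patch
  // keeps a synchronous copy (stream 0) for plain CPU memory.
  float *h_pinned = nullptr;
  CHECK_CUDA(cudaMallocHost(&h_pinned, bytes));
  for (size_t i = 0; i < n; ++i) h_pinned[i] = static_cast<float>(i);

  float *d_buf = nullptr;
  CHECK_CUDA(cudaMalloc(&d_buf, bytes));

  // Dedicated stream, playing the role of the `stream` member created in
  // the BufferedReader constructor with cudaStreamCreate(&stream).
  cudaStream_t stream;
  CHECK_CUDA(cudaStreamCreate(&stream));

  // Enqueue the copy on that stream; the call returns immediately and does
  // not block work issued on other streams.
  CHECK_CUDA(cudaMemcpyAsync(d_buf, h_pinned, bytes, cudaMemcpyHostToDevice,
                             stream));

  // Wait only for this stream, mirroring the patch's
  // PADDLE_ENFORCE(cudaStreamSynchronize(stream)) before tensors are used.
  CHECK_CUDA(cudaStreamSynchronize(stream));
  std::printf("copied %zu bytes asynchronously\n", bytes);

  CHECK_CUDA(cudaStreamDestroy(stream));
  CHECK_CUDA(cudaFree(d_buf));
  CHECK_CUDA(cudaFreeHost(h_pinned));
  return 0;
}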