Async memory copy (#15013)

Branch: revert-15207-remove_op_handle_lock_and_fix_var
Author: qingqing01, committed 6 years ago via GitHub
Parent: 938705745e
Commit: 51a9fca323

@@ -231,11 +231,14 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                    inputs[i].data.length());
     } else {
 #ifdef PADDLE_WITH_CUDA
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx =
+          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
       auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
       memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                    platform::CPUPlace(), inputs[i].data.data(),
-                   inputs[i].data.length(),
-                   0);  // stream 0 for sync copy
+                   inputs[i].data.length(), dev_ctx->stream());
 #else
       PADDLE_THROW("Not compile with CUDA, should not reach here.");
 #endif

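Note on this hunk: the old call passed stream 0, which (per the removed comment) this path used to force a synchronous host-to-device copy. The new code pulls the CUDADeviceContext for place_ out of the DeviceContextPool and enqueues the copy on that context's stream, so the feed no longer blocks the host and is ordered ahead of later work on the same stream. A minimal standalone sketch of the underlying CUDA pattern; all buffer and size names here are illustrative, not from this commit:

// Minimal sketch of the async-copy pattern this hunk adopts.
// Names (h_input, d_input, kBytes) are illustrative.
#include <cuda_runtime.h>

int main() {
  const size_t kBytes = 1 << 20;
  float *h_input = nullptr, *d_input = nullptr;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // Pinned host memory lets cudaMemcpyAsync overlap with host work; with
  // pageable memory the transfer is staged and partially blocks the host.
  cudaMallocHost(&h_input, kBytes);
  cudaMalloc(&d_input, kBytes);

  // Asynchronous: returns immediately; the copy is ordered on `stream`
  // ahead of any kernel enqueued on the same stream afterwards.
  cudaMemcpyAsync(d_input, h_input, kBytes, cudaMemcpyHostToDevice, stream);

  // ... kernels launched on `stream` here would see the copied data ...

  cudaStreamSynchronize(stream);  // needed only before the host reads back
  cudaFree(d_input);
  cudaFreeHost(h_input);
  cudaStreamDestroy(stream);
  return 0;
}
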
@@ -208,11 +208,14 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                    inputs[i].data.length());
     } else {
 #ifdef PADDLE_WITH_CUDA
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx =
+          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
       auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
       memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                    platform::CPUPlace(), inputs[i].data.data(),
-                   inputs[i].data.length(),
-                   0);  // stream 0 for sync copy
+                   inputs[i].data.length(), dev_ctx->stream());
 #else
       PADDLE_THROW("Not compile with CUDA, should not reach here.");
 #endif

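The hunk above makes the identical change for NativePaddlePredictor::SetFeed. One caveat the async copy introduces: it is ordered only against work on dev_ctx->stream(), so the host buffer behind inputs[i].data must stay alive and unmodified until that stream has consumed it, and any other stream that reads the tensor needs explicit ordering. A hedged sketch of cross-stream ordering via a CUDA event; stream and buffer names are illustrative:

// Sketch: after switching to async copies, consumers on other streams must
// be ordered behind the copy. cudaStreamWaitEvent does this without
// blocking the host. All names are illustrative.
#include <cuda_runtime.h>

int main() {
  const size_t kBytes = 1 << 20;
  float *h_in = nullptr, *d_in = nullptr;
  cudaMallocHost(&h_in, kBytes);
  cudaMalloc(&d_in, kBytes);

  cudaStream_t copy_stream, compute_stream;
  cudaStreamCreate(&copy_stream);
  cudaStreamCreate(&compute_stream);
  cudaEvent_t copied;
  cudaEventCreateWithFlags(&copied, cudaEventDisableTiming);

  cudaMemcpyAsync(d_in, h_in, kBytes, cudaMemcpyHostToDevice, copy_stream);
  cudaEventRecord(copied, copy_stream);
  // Work enqueued on compute_stream after this point waits for the copy
  // on the device, without blocking the host.
  cudaStreamWaitEvent(compute_stream, copied, 0);

  cudaStreamSynchronize(compute_stream);
  cudaEventDestroy(copied);
  cudaStreamDestroy(copy_stream);
  cudaStreamDestroy(compute_stream);
  cudaFreeHost(h_in);
  cudaFree(d_in);
  return 0;
}
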
@@ -142,12 +142,13 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
     vars->mutable_data<T>(ctx.GetPlace());
 
     framework::Tensor d_temp;
-    framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp);
+    framework::TensorCopy(h_temp, ctx.GetPlace(), &d_temp);
 
     // At least use 32 threads, at most 512 threads.
     // blockx is multiple of 32.
     int blockx = std::min(
-        static_cast<long>(((feature_width * num_priors + 31) >> 5) << 5), 512L);
+        static_cast<int64_t>(((feature_width * num_priors + 31) >> 5) << 5),
+        512L);
     int gridx = (feature_width * num_priors + blockx - 1) / blockx;
     dim3 threads(blockx, 1);
     dim3 grids(gridx, feature_height);

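Two things change in this hunk. First, framework::TensorCopySync blocked the host until the temporary tensor reached the device; framework::TensorCopy enqueues the copy on the destination place's device context instead, which stays correct as long as the kernel launched just below runs on that same stream, since stream ordering then guarantees the data is ready. Second, the cast is pinned to the fixed-width int64_t rather than the platform-dependent long. The expression itself rounds feature_width * num_priors up to a multiple of 32 (the warp size) and caps the block at 512 threads; a small self-contained sketch of that arithmetic, with illustrative work sizes:

// Sketch: the block-size arithmetic from the hunk above, isolated.
// Rounds `work` up to a multiple of 32, then caps it at 512 threads/block.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  for (int64_t work : {1, 100, 1000}) {
    int blockx = std::min(static_cast<int64_t>(((work + 31) >> 5) << 5),
                          static_cast<int64_t>(512));
    int gridx = (work + blockx - 1) / blockx;
    // work=1    -> blockx=32,  gridx=1
    // work=100  -> blockx=128, gridx=1
    // work=1000 -> blockx=512, gridx=2
    std::printf("work=%lld blockx=%d gridx=%d\n",
                static_cast<long long>(work), blockx, gridx);
  }
  return 0;
}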