@@ -42,14 +42,13 @@ void TensorRTEngine::Execute(int batch_size) {
     PADDLE_ENFORCE(buf.device == DeviceType::GPU);
     buffers.push_back(buf.buffer);
   }
-  PADDLE_ENFORCE_NOT_NULL(stream_);
-  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
-  cudaStreamSynchronize(*stream_);
+  infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr);
+  cudaStreamSynchronize(stream_);
   SetRuntimeBatch(batch_size);
 }
 
 TensorRTEngine::~TensorRTEngine() {
-  cudaStreamSynchronize(*stream_);
+  cudaStreamSynchronize(stream_);
   // clean buffer
   for (auto &buf : buffers_) {
     if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
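The hunk above drops the pointer indirection: Execute() and the destructor now pass the stream handle itself to enqueue() and cudaStreamSynchronize(), and the null check on stream_ goes away. The declaration side is not part of this excerpt; a minimal sketch of what it implies, with the constructor shape assumed purely for illustration, might look like:

    // Sketch only (not taken from this patch): stream_ held by value.
    // cudaStream_t is already an opaque handle, so the extra pointer level
    // bought nothing.
    #include <cuda_runtime_api.h>

    class TensorRTEngine {
     public:
      explicit TensorRTEngine(cudaStream_t stream = nullptr) : stream_(stream) {
        if (stream_ == nullptr) {
          // Create a private stream when the caller does not supply one
          // (a hypothetical policy; the real constructor is outside this patch).
          cudaStreamCreate(&stream_);
        }
      }

     private:
      cudaStream_t stream_{nullptr};  // was: cudaStream_t* stream_;
    };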
@@ -173,7 +172,7 @@ void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
   PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                    cudaMemcpyDeviceToDevice, *stream_),
+                                    cudaMemcpyDeviceToDevice, stream_),
                     0);
 }
 
@@ -194,7 +193,7 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                       cudaMemcpyDeviceToHost, *stream_));
+                                       cudaMemcpyDeviceToHost, stream_));
 }
 
 Buffer &TensorRTEngine::buffer(const std::string &name) {
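Both output getters in the two hunks above copy asynchronously on stream_ (device-to-device in GetOutputInGPU, device-to-host in GetOutputInCPU), so the destination buffer is only valid after the stream has been synchronized. A hedged caller-side fragment, assuming the caller shares the engine's stream handle and that the third parameter is the destination size in bytes:

    // Fragment, illustration only: engine, stream, batch_size, output_size and
    // the tensor name "prob" are assumptions, not part of this patch.
    std::vector<float> prob(output_size);
    engine->Execute(batch_size);
    engine->GetOutputInCPU("prob", prob.data(), prob.size() * sizeof(float));
    // The copy above was issued with cudaMemcpyAsync(..., stream_); wait for
    // the stream before reading prob on the host.
    cudaStreamSynchronize(stream);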
@@ -211,12 +210,11 @@ void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
   PADDLE_ENFORCE_NOT_NULL(data);
-  PADDLE_ENFORCE_NOT_NULL(stream_);
   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
   buf.size = size;
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                       cudaMemcpyHostToDevice, *stream_));
+                                       cudaMemcpyHostToDevice, stream_));
 }
 
 void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
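SetInputFromCPU likewise copies asynchronously on stream_. Because Execute() enqueues the TensorRT work on the same stream, CUDA's in-stream ordering guarantees the host-to-device copy completes before inference reads the buffer, so no explicit synchronization is needed between the two calls. A hedged fragment of that sequence (caller-side names are assumptions):

    // Fragment, illustration only: the H2D copy and enqueue() share one
    // stream, so they execute in issue order.
    std::vector<float> input(input_size, 0.f);
    engine->SetInputFromCPU("image", input.data(),
                            input.size() * sizeof(float));
    engine->Execute(batch_size);  // ordered after the copy on stream_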
@@ -227,7 +225,7 @@ void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                       cudaMemcpyDeviceToDevice, *stream_));
+                                       cudaMemcpyDeviceToDevice, stream_));
 }
 
 void TensorRTEngine::SetITensor(const std::string &name,
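SetInputFromGPU takes a device pointer and performs a device-to-device cudaMemcpyAsync on stream_, which fits the case where the input already lives on the GPU, e.g. as the output of a preceding op. Since Execute() ends with cudaStreamSynchronize(stream_) (see the first hunk), the source buffer can be released once Execute() returns. A hedged fragment, with allocation and names purely illustrative:

    // Fragment, illustration only: feed an input that is already resident on
    // the GPU; gpu_input, in_bytes and "image" are assumed names.
    float *gpu_input = nullptr;
    cudaMalloc(&gpu_input, in_bytes);
    // ... fill gpu_input with a kernel or an earlier copy ...
    engine->SetInputFromGPU("image", gpu_input, in_bytes);
    engine->Execute(batch_size);  // D2D copy is ordered first on stream_
    cudaFree(gpu_input);          // safe: Execute() synchronized the stream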