@@ -31,6 +31,7 @@ void TensorRTEngine::Build(const DescType& paddle_model) {
}

void TensorRTEngine::Execute(int batch_size) {
  batch_size_ = batch_size;
  std::vector<void *> buffers;
  for (auto &buf : buffers_) {
    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
@@ -38,6 +39,7 @@ void TensorRTEngine::Execute(int batch_size) {
    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
    buffers.push_back(buf.buffer);
  }
  PADDLE_ENFORCE_NOT_NULL(stream_);
  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
  cudaStreamSynchronize(*stream_);
}
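For reference, the call sequence this hunk exercises: inputs are staged with SetInputFromCPU, Execute enqueues the TensorRT context on stream_ and blocks until the stream drains, and outputs are then fetched. A minimal caller-side sketch, assuming an already-frozen engine with bindings named "x" and "y" (both names illustrative):

```cpp
float x[16], y[16];                          // host staging buffers
engine.SetInputFromCPU("x", x, sizeof(x));   // async H2D copy on stream_
engine.Execute(1);                           // enqueue + cudaStreamSynchronize
engine.GetOutputInCPU("y", y, sizeof(y));    // D2H copy of the result
```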
@@ -71,19 +73,24 @@ void TensorRTEngine::FreezeNetwork() {
  // allocate GPU buffers.
  buffers_.resize(buffer_sizes_.size());
  for (auto &item : buffer_sizes_) {
    // The output buffers are not set in the network building phase; they
    // need to be inferred from the TensorRT network.
    if (item.second == 0) {
      auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
      auto dims = infer_engine_->getBindingDimensions(slot_offset);
      item.second = kDataTypeSize[static_cast<int>(
                        infer_engine_->getBindingDataType(slot_offset))] *
                    analysis::AccuDims(dims.d, dims.nbDims);
      PADDLE_ENFORCE_GT(item.second, 0);
    }

    auto &buf = buffer(item.first);
    buf.max_size = item.second * max_batch_;
    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
    VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
            << buf.buffer;
    buf.size = buf.max_size = item.second;
    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, buf.max_size));
    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 1G
    // buf.size will be changed at runtime.
    buf.size = 0;
    buf.device = DeviceType::GPU;
  }
}
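The per-sample byte size above is kDataTypeSize[...] times analysis::AccuDims(dims.d, dims.nbDims). Judging from how it is called, AccuDims presumably accumulates the product of the binding's extents, i.e. its element count; a sketch of that assumption:

```cpp
// Hedged sketch: element count as the product of the first `size` extents.
// The real helper lives in the analysis module; this only illustrates the
// behavior implied by the call sites in this patch.
template <typename T>
int AccuDims(T dims, int size) {
  int res = 1;
  for (int i = 0; i < size; i++) {
    res *= dims[i];  // e.g. {3, 224, 224} -> 150528 elements
  }
  return res;
}
```

With that reading, buf.max_size is element count times element size times max_batch_, which is what the 1 << 30 cap guards.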
@@ -155,15 +162,22 @@ void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst,

void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
                                    size_t max_size) {
  // determine data size
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  PADDLE_ENFORCE_GT(it->second, 0);
  PADDLE_ENFORCE_GE(max_size, it->second);
  VLOG(4) << "get output in cpu";
  auto &buf = buffer(name);

  // Update needed buffer size.
  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
  auto dims = infer_engine_->getBindingDimensions(slot_offset);
  buf.size = kDataTypeSize[static_cast<int>(
                 infer_engine_->getBindingDataType(slot_offset))] *
             analysis::AccuDims(dims.d, dims.nbDims);
  PADDLE_ENFORCE_LE(buf.size, buf.max_size);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
                                       cudaMemcpyDeviceToHost, *stream_));
  // DEBUG
  memset(dst, 0, buf.size);
  PADDLE_ENFORCE_EQ(
      0, cudaMemcpy(dst, buf.buffer, buf.size, cudaMemcpyDeviceToHost));
}
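The binding-size computation (getBindingIndex -> getBindingDimensions -> kDataTypeSize * AccuDims) now appears both here and in FreezeNetwork. A hypothetical helper could deduplicate it; BindingSize below is not part of this patch, only a sketch of the extraction:

```cpp
// Hypothetical refactor: one place to turn a binding name into a byte size.
size_t TensorRTEngine::BindingSize(const std::string &name) {
  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
  auto dims = infer_engine_->getBindingDimensions(slot_offset);
  return kDataTypeSize[static_cast<int>(
             infer_engine_->getBindingDataType(slot_offset))] *
         analysis::AccuDims(dims.d, dims.nbDims);
}
```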

Buffer &TensorRTEngine::buffer(const std::string &name) {
@@ -178,8 +192,11 @@ void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data,
                                     size_t size) {
  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
  PADDLE_ENFORCE_NOT_NULL(data);
  PADDLE_ENFORCE_NOT_NULL(stream_);
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
  buf.size = size;
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                       cudaMemcpyHostToDevice, *stream_));
}
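Since the H2D copy is asynchronous on stream_, the caller must keep `data` alive and unmodified until the stream drains (Execute's cudaStreamSynchronize covers the common path). A caller-side sketch of the conservative alternative, assuming the caller holds the same cudaStream_t* it passed to the engine, with "x" as an illustrative binding name:

```cpp
engine.SetInputFromCPU("x", host_ptr, size);  // async copy begins
cudaStreamSynchronize(*stream);               // safe to reuse host_ptr now
```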
@@ -187,6 +204,7 @@ void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data,
void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
                                     size_t size) {
  auto &buf = buffer(name);
  buf.size = size;
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
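The shown hunk ends before the actual transfer. For a GPU-resident source the matching transfer kind is cudaMemcpyDeviceToDevice; a hedged sketch of such a copy (an assumption about the elided tail, not shown in this diff):

```cpp
// Assumed continuation: async D2D copy into the binding's device buffer.
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                     cudaMemcpyDeviceToDevice, *stream_));
```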