@@ -26,6 +26,8 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+int TensorRTEngine::runtime_batch_ = 1;
+
 void TensorRTEngine::Build(const DescType& paddle_model) {
   PADDLE_ENFORCE(false, "not implemented");
 }
@@ -40,6 +42,7 @@ void TensorRTEngine::Execute(int batch_size) {
   }
   infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
   cudaStreamSynchronize(*stream_);
+  SetRuntimeBatch(batch_size);
 }
 
 TensorRTEngine::~TensorRTEngine() {
@@ -76,14 +79,15 @@ void TensorRTEngine::FreezeNetwork() {
       auto dims = infer_engine_->getBindingDimensions(slot_offset);
       item.second = kDataTypeSize[static_cast<int>(
                         infer_engine_->getBindingDataType(slot_offset))] *
-                    analysis::AccuDims(dims.d, dims.nbDims);
+                    analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
     }
     auto& buf = buffer(item.first);
     CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
     VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
             << buf.buffer;
-    buf.size = buf.max_size = item.second;
+    buf.size = item.second;
+    buf.max_size = item.second * max_batch_;
     buf.device = DeviceType::GPU;
   }
 }
@@ -98,7 +102,7 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
   auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
   PADDLE_ENFORCE(input, "infer network add input %s failed", name);
   buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
-                        analysis::AccuDims(dims.d, dims.nbDims);
+                        analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
   PADDLE_ENFORCE(input->isNetworkInput());
   TensorRTEngine::SetITensor(name, input);
   return input;
@@ -139,30 +143,40 @@ void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
   return buffer(name).buffer;
 }
 
-void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst,
-                                    size_t max_size) {
+void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst) {
   // determine data size
+  auto* output = TensorRTEngine::GetITensor(name);
+  nvinfer1::Dims dims = output->getDimensions();
+  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
+  size_t dst_size = dim_size * runtime_batch_ *
+                    kDataTypeSize[static_cast<int>(output->getType())];
+
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_GE(max_size, it->second);
+  PADDLE_ENFORCE_LE(dst_size, it->second);
   auto& buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
+  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                     cudaMemcpyDeviceToDevice, *stream_),
                     0);
 }
 
-void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
-                                    size_t max_size) {
+void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst) {
   // determine data size
+
+  auto* output = TensorRTEngine::GetITensor(name);
+  nvinfer1::Dims dims = output->getDimensions();
+  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
+  size_t dst_size = dim_size * runtime_batch_ *
+                    kDataTypeSize[static_cast<int>(output->getType())];
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_GE(max_size, it->second);
+  PADDLE_ENFORCE_LE(dst_size, it->second);
   auto& buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                        cudaMemcpyDeviceToHost, *stream_));
 }
 
@@ -207,6 +221,12 @@ nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
   return itensor_map_[name];
 }
 
+void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
+  runtime_batch_ = batch_size;
+}
+
+int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
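
Side note, not part of the patch: a minimal standalone C++ sketch of the size arithmetic this change introduces. Device buffers are allocated once for max_batch_ samples (in FreezeNetwork/DeclareInput), while GetOutputInGPU/GetOutputInCPU now derive the copy size from the output dimensions and the runtime batch recorded by Execute via SetRuntimeBatch. The names Volume, max_batch, and runtime_batch below are illustrative stand-ins, not Paddle APIs.

#include <cassert>
#include <cstddef>
#include <numeric>
#include <vector>

// Product of the per-sample dimensions, playing the role of analysis::AccuDims.
static size_t Volume(const std::vector<int>& dims) {
  return std::accumulate(
      dims.begin(), dims.end(), static_cast<size_t>(1),
      [](size_t acc, int d) { return acc * static_cast<size_t>(d); });
}

int main() {
  const std::vector<int> dims = {3, 224, 224};  // per-sample output shape
  const size_t elem_size = sizeof(float);       // element width, e.g. kFLOAT
  const size_t max_batch = 8;                   // batch the buffer was sized for
  const size_t runtime_batch = 3;               // batch actually passed to Execute

  // Allocation size fixed at build time: per-sample volume * max batch.
  const size_t buffer_size = Volume(dims) * elem_size * max_batch;
  // Copy size computed at output-fetch time: per-sample volume * runtime batch.
  const size_t dst_size = Volume(dims) * elem_size * runtime_batch;

  // Mirrors PADDLE_ENFORCE_LE(dst_size, it->second) in the patch: a
  // partial-batch copy must fit inside the max-batch allocation.
  assert(dst_size <= buffer_size);
  return 0;
}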