/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use
this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/engine.h"

#include <NvInfer.h>
#include <cuda.h>
#include <glog/logging.h>
#include <string>
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace inference {
namespace tensorrt {

int TensorRTEngine::runtime_batch_ = 1;

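// Rough usage sketch of this engine (illustrative only; the constructor
// arguments and the network() accessor shown here are assumptions, see
// engine.h for the actual signatures):
//
//   TensorRTEngine engine(max_batch, max_workspace, &stream);
//   engine.InitNetwork();
//   auto *x = engine.DeclareInput("x", nvinfer1::DataType::kFLOAT, dims);
//   // ... add layers to engine.network() ...
//   engine.DeclareOutput("y");
//   engine.FreezeNetwork();
//   engine.SetInputFromCPU("x", x_data, x_bytes);
//   engine.Execute(batch_size);
//   engine.GetOutputInCPU("y", y_data, y_bytes);
//   cudaStreamSynchronize(stream);
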
void TensorRTEngine::Build(const DescType &paddle_model) {
  PADDLE_ENFORCE(false, "not implemented");
}

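// Runs inference for one batch: collects the pre-allocated device buffers in
// binding order, enqueues the execution context on stream_, and blocks until
// the stream has finished.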
void TensorRTEngine::Execute(int batch_size) {
  freshDeviceId();
  batch_size_ = batch_size;
  std::vector<void *> buffers;
  for (auto &buf : buffers_) {
    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
    PADDLE_ENFORCE_GT(buf.max_size, 0);
    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
    buffers.push_back(buf.buffer);
  }
  PADDLE_ENFORCE_NOT_NULL(stream_);
  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
  cudaStreamSynchronize(*stream_);
  SetRuntimeBatch(batch_size);
}

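// Waits for pending work on stream_ and releases every GPU buffer owned by
// the engine.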
TensorRTEngine::~TensorRTEngine() {
  cudaStreamSynchronize(*stream_);
  // clean buffer
  for (auto &buf : buffers_) {
    if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
      buf.buffer = nullptr;
      buf.max_size = 0;
    }
  }
}

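// Builds the TensorRT engine and execution context from the declared network,
// then allocates one GPU buffer per binding. Output sizes are unknown during
// network construction, so they are derived here from the binding dimensions
// and data types.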
void TensorRTEngine::FreezeNetwork() {
  VLOG(3) << "TRT to freeze network";
  freshDeviceId();
  PADDLE_ENFORCE(infer_builder_ != nullptr,
                 "Call InitNetwork first to initialize network.");
  PADDLE_ENFORCE(infer_network_ != nullptr,
                 "Call InitNetwork first to initialize network.");
  // build engine.
  infer_builder_->setMaxBatchSize(max_batch_);
  infer_builder_->setMaxWorkspaceSize(max_workspace_);

  infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_));
  PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");

  infer_context_.reset(infer_engine_->createExecutionContext());

  // allocate GPU buffers.
  buffers_.resize(buffer_sizes_.size());
  for (auto &item : buffer_sizes_) {
    // The output buffers are not set in the network building phase, so their
    // sizes need to be inferred from the TensorRT network.
    if (item.second == 0) {
      auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
      auto dims = infer_engine_->getBindingDimensions(slot_offset);
      item.second = kDataTypeSize[static_cast<int>(
                        infer_engine_->getBindingDataType(slot_offset))] *
                    analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
      PADDLE_ENFORCE_GT(item.second, 0);
    }

    auto &buf = buffer(item.first);
    buf.max_size = item.second * max_batch_;
    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.

    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
    buf.size = 0;
    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 1 GiB
    buf.device = DeviceType::GPU;
  }
}

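// Registers a network input with the given data type and dimensions, records
// its maximum buffer size, and caches the resulting ITensor under `name`.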
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
                                                nvinfer1::DataType dtype,
                                                const nvinfer1::Dims &dims) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
                    name);

  PADDLE_ENFORCE(infer_network_ != nullptr, "Call InitNetwork first.");
  auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
  PADDLE_ENFORCE(input, "infer network add input %s failed", name);
  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
                        analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
  PADDLE_ENFORCE(input->isNetworkInput());
  TensorRTEngine::SetITensor(name, input);
  return input;
}

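// Marks the `offset`-th output of `layer` as a network output named `name`.
// Its buffer size stays zero for now; FreezeNetwork() fills it in once the
// engine is built.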
void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
                                   const std::string &name) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);

  auto *output = layer->getOutput(offset);
  SetITensor(name, output);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
  PADDLE_ENFORCE(!output->isNetworkInput());
  infer_network_->markOutput(*output);
  PADDLE_ENFORCE(output->isNetworkOutput());
  // The output buffer's size can only be decided later, so set it to zero
  // here as a marker; FreezeNetwork() will reset it.
  buffer_sizes_[name] = 0;
}

bool TensorRTEngine::HasDeclared(const std::string &name) {
  return buffer_sizes_.count(name) > 0;
}

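// Same as the overload above, but for an ITensor that was already registered
// via SetITensor().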
void TensorRTEngine::DeclareOutput(const std::string &name) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);

  auto *output = TensorRTEngine::GetITensor(name);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
  PADDLE_ENFORCE(!output->isNetworkInput());
  infer_network_->markOutput(*output);
  // The output buffer's size can only be decided later, so set it to zero
  // here as a marker; FreezeNetwork() will reset it.
  buffer_sizes_[name] = 0;
}

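// Returns the raw device pointer of an output binding without copying.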
void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
  return buffer(name).buffer;
}

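// Copies an output from the engine's device buffer into another device buffer
// `dst`; the copy is asynchronous on stream_.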
void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
                                    size_t max_size) {
  // determine data size
  auto *output = TensorRTEngine::GetITensor(name);
  nvinfer1::Dims dims = output->getDimensions();
  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
  size_t dst_size = dim_size * runtime_batch_ *
                    kDataTypeSize[static_cast<int>(output->getType())];

  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  PADDLE_ENFORCE_GT(it->second, 0);
  PADDLE_ENFORCE_LE(dst_size, it->second);
  PADDLE_ENFORCE_GE(max_size, dst_size);
  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                    cudaMemcpyDeviceToDevice, *stream_),
                    0);
}

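// Copies an output from the engine's device buffer to host memory `dst`. The
// copy is asynchronous on stream_, so the data is only valid after the stream
// has been synchronized.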
void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
                                    size_t max_size) {
  // determine data size
  auto *output = TensorRTEngine::GetITensor(name);
  nvinfer1::Dims dims = output->getDimensions();
  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
  size_t dst_size = dim_size * runtime_batch_ *
                    kDataTypeSize[static_cast<int>(output->getType())];
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  PADDLE_ENFORCE_GT(it->second, 0);
  PADDLE_ENFORCE_LE(dst_size, it->second);
  PADDLE_ENFORCE_GE(max_size, dst_size);
  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                       cudaMemcpyDeviceToHost, *stream_));
}

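// Looks up the device buffer bound to `name` via its TensorRT binding index.
// Only valid after FreezeNetwork() has allocated the buffers.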
Buffer &TensorRTEngine::buffer(const std::string &name) {
  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
                 name);
  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
  return buffers_[slot_offset];
}

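// Copies host data into the named input's device buffer (asynchronously on
// stream_).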
void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
                                     size_t size) {
  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
  PADDLE_ENFORCE_NOT_NULL(data);
  PADDLE_ENFORCE_NOT_NULL(stream_);
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
  buf.size = size;
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                       cudaMemcpyHostToDevice, *stream_));
}

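// Same as SetInputFromCPU, but the source already resides on the GPU, so a
// device-to-device copy is issued instead.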
void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
                                     size_t size) {
  auto &buf = buffer(name);
  buf.size = size;
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                       cudaMemcpyDeviceToDevice, *stream_));
}

void TensorRTEngine::SetITensor(const std::string &name,
                                nvinfer1::ITensor *tensor) {
  PADDLE_ENFORCE(tensor != nullptr);
  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
                    name);
  itensor_map_[name] = tensor;
}

nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
  PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
  return itensor_map_[name];
}

void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
  runtime_batch_ = batch_size;
}

int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }

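// Switches the calling thread to this engine's device_ so that subsequent
// CUDA calls target the correct GPU.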
void TensorRTEngine::freshDeviceId() {
  int count;
  cudaGetDeviceCount(&count);
  PADDLE_ENFORCE_LT(device_, count);
  cudaSetDevice(device_);
}

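// Adds a custom plugin layer to the network and takes ownership of `plugin`
// so that it stays alive for the lifetime of the engine.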
nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
    nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) {
  owned_plugin_.emplace_back(plugin);
  return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin);
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle