[Fix BUG]: Core dump when using multiple threads + clone + paddle-trt (#22442)

* add mutex for trt engine test=develop
* add the test for copy_to_cpu test=develop
5 years ago · ceda0b9b1a
parent 30320b336e
commit ceda0b9b1a
4 changed files with 15 additions and 3 deletions
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@ -138,7 +138,8 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
    memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
                 t_data, ele_num * sizeof(T), dev_ctx->stream());
-    cudaDeviceSynchronize();
+
+    cudaStreamSynchronize(dev_ctx->stream());
 #else
    PADDLE_THROW("Not compile with CUDA, should not reach here.");
 #endif
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@ -38,13 +38,13 @@ void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
  const std::thread::id tid = std::this_thread::get_id();
  batch_size_ = batch_size;
  if (infer_context_.find(tid) == infer_context_.end()) {
    std::unique_lock<std::mutex> lock(mutex_);
    PADDLE_ENFORCE_NOT_NULL(
        infer_engine_,
        "You should build engine first and then set the context.");
    infer_context_[tid].reset(infer_engine_->createExecutionContext());
  }
  infer_context_[tid]->enqueue(batch_size, buffers->data(), stream, nullptr);
  cudaStreamSynchronize(stream);
  SetRuntimeBatch(batch_size);
 }
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@ -82,7 +82,7 @@ class TensorRTEngine {
  void Build(const DescType& paddle_model);
  void Execute(int batch_size, std::vector<void*>* buffers,
-               cudaStream_t stream);
+               cudaStream_t stream = nullptr);
  // Initialize the inference network, so that TensorRT layers can add to this
  // network.
@ -216,6 +216,7 @@ class TensorRTEngine {
      infer_context_;
  infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
  std::unordered_map<nvinfer1::ITensor*, float> quant_dynamic_range_;
  std::mutex mutex_;
 };  // class TensorRTEngine
 #define IS_TRT_VERSION_GE(version)                       \
--- a/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc
@ -15,6 +15,7 @@ limitations under the License. */
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <numeric>
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
@ -44,6 +45,15 @@ TEST(quant_int8, resnet50) {
  input_t->copy_from_cpu(input);
  ASSERT_TRUE(predictor->ZeroCopyRun());
  std::vector<float> out_data;
  auto output_names = predictor->GetOutputNames();
  auto output_t = predictor->GetOutputTensor(output_names[0]);
  std::vector<int> output_shape = output_t->shape();
  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
                                std::multiplies<int>());
  out_data.resize(out_num);
  output_t->copy_to_cpu(out_data.data());
 }
 }  // namespace inference