@@ -241,11 +241,6 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
     return false;
   }
 
-  // Collect variable shapes for memory optimization.
-  if (need_collect_var_shapes_for_memory_optim()) {
-    CollectVarShapes();
-  }
-
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
 
   // All the containers in the scope will be hold in inference, but the
@@ -390,9 +385,6 @@ void AnalysisPredictor::PrepareArgument() {
   argument_.SetGPUDeviceId(config_.gpu_device_id());
   argument_.SetEnableAnalysisOptim(config_.enable_ir_optim_);
   argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
-  argument_.SetStaticMemoryOptim(config_.static_memory_optim_);
-  argument_.SetStaticMemoryOptimForceUpdate(
-      config_.static_memory_optim_force_update_);
   argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
   argument_.SetUseAnakin(config_.anakin_engine_enabled());
@@ -818,13 +810,6 @@ AnalysisPredictor::~AnalysisPredictor() {
     mkldnn_quantizer_ = nullptr;
   }
 #endif
-
-  // TODO(Superjomn) deduce the directory path.
-  std::string out_path = inference::analysis::GetMemoryCachePath(
-      config_.model_dir(), config_.prog_file());
-  if (need_collect_var_shapes_for_memory_optim()) {
-    SerializeBatchVarShapes(out_path);
-  }
 }
 
 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
@@ -834,66 +819,6 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
   return std::unique_ptr<PaddlePredictor>(x);
 }
 
-void AnalysisPredictor::CollectVarShapes() {
-  VLOG(4) << "Collecting var shapes";
-  if (batch_var_shapes_.size() >= max_shape_collect_count_) return;
-  std::map<std::string, std::vector<int>> var_shapes;
-  for (auto var_name : inference_program_->Block(0).LocalVarNames()) {
-    auto *var = sub_scope_->FindVar(var_name);
-    PADDLE_ENFORCE_NOT_NULL(var);
-    if (var->Type() == framework::VarTypeTrait<framework::LoDTensor>::kId ||
-        var->Type() == framework::VarTypeTrait<framework::Tensor>::kId) {
-      auto &tensor = var->Get<framework::LoDTensor>();
-      auto shape = framework::vectorize(tensor.dims());
-      var_shapes[var_name].assign(shape.begin(), shape.end());
-    }
-  }
-  batch_var_shapes_.push_back(var_shapes);
-  LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size()
-                       << " batch of var shapes for analysis";
-}
-
-void AnalysisPredictor::SerializeBatchVarShapes(const std::string &path) {
-  LOG(INFO) << "serialize batch var shapes to " << path;
-  std::ofstream file(path);
-  if (!file.is_open()) {
-    LOG(ERROR) << "failed to serialize the var shapes to " << path;
-    return;
-  }
-
-  // The sirialized data format:
-  // <tensor_name>:dim0,dim1,dim2,;
-  for (auto &batch : batch_var_shapes_) {
-    for (auto &ele : batch) {
-      file << ele.first << ":";
-      for (size_t i = 0; i < ele.second.size() - 1; i++) {
-        file << ele.second[i] << ",";
-      }
-      file << ele.second.back() << ";";
-    }
-    file << "\n";
-  }
-}
-
-bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
-  if (need_collect_var_shapes_ >= 0) return need_collect_var_shapes_;
-  bool need = false;
-  // check if the cache exists
-  if (!config_.enable_memory_optim()) {
-    need = false;
-  } else if (config_.static_memory_optim_ &&
-             !inference::IsFileExists(inference::analysis::GetMemoryCachePath(
-                 config_.model_dir(), config_.prog_file()))) {
-    need = true;
-  } else if (config_.static_memory_optim_ &&
-             config_.static_memory_optim_force_update_) {
-    need = true;
-  }
-
-  need_collect_var_shapes_ = need ? 1 : 0;
-  return need;
-}
-
 std::string AnalysisPredictor::GetSerializedProgram() const {
   return inference_program_->Proto()->SerializeAsString();
 }