From 6ed2ab29769354c4674e8909d8d03d500f1617b3 Mon Sep 17 00:00:00 2001
From: louei5
Date: Thu, 4 Mar 2021 15:29:13 +0800
Subject: [PATCH] fix bugs in recording gpu memory information by RDR

---
 .../ccsrc/debug/rdr/mem_address_recorder.cc  | 19 ++++++++++++++++---
 .../ccsrc/debug/rdr/mem_address_recorder.h   |  4 +++-
 .../ccsrc/debug/rdr/running_data_recorder.cc |  4 +++-
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/mindspore/ccsrc/debug/rdr/mem_address_recorder.cc b/mindspore/ccsrc/debug/rdr/mem_address_recorder.cc
index 99d7f4cb38..67ad103cbf 100644
--- a/mindspore/ccsrc/debug/rdr/mem_address_recorder.cc
+++ b/mindspore/ccsrc/debug/rdr/mem_address_recorder.cc
@@ -25,7 +25,9 @@ std::string MemInfo2String(const std::string &label, const AddressPtrList &info)
   std::ostringstream ss;
   ss << label << " " << info.size() << std::endl;
   for (size_t i = 0; i < info.size(); i++) {
-    ss << "&" << info[i]->addr << " #" << info[i]->size << std::endl;
+    if (info[i] != nullptr) {
+      ss << "&" << info[i]->addr << " #" << info[i]->size << std::endl;
+    }
   }
   return ss.str();
 }
@@ -46,7 +48,15 @@ void MemAddressRecorder::SaveMemInfo(const std::string &op_name, const GPUMemInf
   auto outputs = mem_info.outputs_;
   mem_info_stream << MemInfo2String("kernel_outputs", *outputs);
   mem_info_stream << std::endl;
-  mem_info_str_ += mem_info_stream.str();
+  std::string mem_info_str = mem_info_stream.str();
+  size_t length = mem_info_append_str_.size() + mem_info_str.size();
+  // set maximum length of one memory info recorder is 10 percent of string::max_size
+  if (length < 0.1 * mem_info_str.max_size()) {
+    mem_info_append_str_ += mem_info_str;
+  } else {
+    mem_infos_.push_back(mem_info_append_str_);
+    mem_info_append_str_ = mem_info_str;
+  }
 }
 
 void MemAddressRecorder::Export() {
@@ -62,7 +72,10 @@ void MemAddressRecorder::Export() {
     MS_LOG(WARNING) << "Open file for saving gpu memory information failed. File path: '" << file_path << "'.";
     return;
   }
-  fout << mem_info_str_;
+  for (auto &info : mem_infos_) {
+    fout << info;
+  }
+  fout << mem_info_append_str_;
   fout.close();
   ChangeFileMode(file_path, S_IRUSR);
 }
diff --git a/mindspore/ccsrc/debug/rdr/mem_address_recorder.h b/mindspore/ccsrc/debug/rdr/mem_address_recorder.h
index f301d4edc3..1b8261307c 100644
--- a/mindspore/ccsrc/debug/rdr/mem_address_recorder.h
+++ b/mindspore/ccsrc/debug/rdr/mem_address_recorder.h
@@ -46,7 +46,9 @@ class MemAddressRecorder : public BaseRecorder {
   MemAddressRecorder &operator=(const MemAddressRecorder &recorder);
 
   mutable std::mutex mtx_;
-  std::string mem_info_str_;
+
+  std::string mem_info_append_str_;
+  std::vector<std::string> mem_infos_;
 };
 using MemAddressRecorderPtr = std::shared_ptr<MemAddressRecorder>;
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/debug/rdr/running_data_recorder.cc b/mindspore/ccsrc/debug/rdr/running_data_recorder.cc
index 4a14a2303c..45e4c32663 100644
--- a/mindspore/ccsrc/debug/rdr/running_data_recorder.cc
+++ b/mindspore/ccsrc/debug/rdr/running_data_recorder.cc
@@ -131,8 +131,10 @@ bool RecordMemAddressInfo(const SubModuleId module, const std::string &tag, cons
     return false;
   }
   std::string submodule_name = std::string(GetSubModuleName(module));
+  std::string directory = mindspore::EnvConfigParser::GetInstance().rdr_path();
   MemAddressRecorder::Instance().SetModule(submodule_name);
-  MemAddressRecorder::Instance().SetTag(tag);
+  MemAddressRecorder::Instance().SetFilename(tag);  // set filename using tag
+  MemAddressRecorder::Instance().SetDirectory(directory);
   MemAddressRecorder::Instance().SaveMemInfo(op_name, mem_info);
   return true;
 }