diff --git a/ge/CMakeLists.txt b/ge/CMakeLists.txt index 7079e432..e36b45d9 100755 --- a/ge/CMakeLists.txt +++ b/ge/CMakeLists.txt @@ -648,9 +648,11 @@ target_include_directories(ge_runner PRIVATE target_link_libraries(ge_runner $ + -Wl,--whole-archive + msprofiler_fwk + -Wl,--no-whole-archive ge_memory adump_server - msprofiler_fwk static_mmpa -Wl,--no-as-needed graph diff --git a/ge/common/profiling/ge_profiling.cc b/ge/common/profiling/ge_profiling.cc index 8da6b12a..bab699cc 100644 --- a/ge/common/profiling/ge_profiling.cc +++ b/ge/common/profiling/ge_profiling.cc @@ -125,8 +125,9 @@ ge::Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) { ge::Status RegProfReporterCallback(MsprofReporterCallback func) { if (ge::ProfilingManager::Instance().GetMsprofCallback().msprofCtrlCallback != nullptr) { - GELOGW("Msprof ctrl callback is exist, just ignore it."); + GELOGW("Msprof reporter callback is exist, just ignore it."); } else { + GELOGI("GE register Msprof reporter callback."); ge::ProfilingManager::Instance().SetMsprofReporterCallback(func); } // Pass MsprofReporterCallback to runtime diff --git a/ge/common/profiling/profiling_manager.cc b/ge/common/profiling/profiling_manager.cc index bcf6d366..fd56f15d 100644 --- a/ge/common/profiling/profiling_manager.cc +++ b/ge/common/profiling/profiling_manager.cc @@ -24,16 +24,9 @@ #include "graph/load/new_model_manager/davinci_model.h" namespace { -const char *const kJobID = "jobID"; -const char *const kDeviceID = "deviceID"; -const char *const kStartCfg = "startCfg"; -const char *const kFeatures = "features"; -const char *const kConf = "conf"; -const char *const kEvents = "events"; -const char *const kAiCoreEvents = "ai_core_events"; -const char *const kName = "name"; -const char *const kTraceID = "traceId"; -const char *const kProfDir = "resultPath"; +const char *const kTrainingTrace = "training_trace"; +const char *const kFpPoint = "fp_point"; +const char *const kBpPoint = "bp_point"; const size_t 
kReportMaxLen = 2048; const int32_t kMaxDeviceNum = 256; const std::string kConfigNumsdev = "devNums"; @@ -70,7 +63,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In return ret; } - if (is_load_profiling_) { + if (is_execute_profiling_) { int32_t cb_ret = prof_cb_.msprofCtrlCallback( static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), static_cast(&prof_conf), sizeof(MsprofGeOptions)); @@ -91,37 +84,42 @@ ge::Status ProfilingManager::InitFromOptions(const Options &options, MsprofGeOpt #ifdef DAVINCI_SUPPORT_PROFILING // enable profiling by env char env_profiling_mode[MMPA_MAX_PATH] = { 0x00 }; - is_load_profiling_ = false; + is_load_profiling_ = false; // Change in ProfInit is_execute_profiling_ = false; - (void)mmGetEnv("PROFILING_MODE", env_profiling_mode, MMPA_MAX_PATH); - (void)mmGetEnv("PROFILING_OPTIONS", prof_conf.options, sizeof(MsprofGeOptions)); - - if ((env_profiling_mode != nullptr) && (strcmp("true", env_profiling_mode) == 0) - && (strcmp(prof_conf.options, "\0") != 0)) { - // enable profiling by env - is_load_profiling_ = true; - is_execute_profiling_ = true; - GELOGI("The profiling in env is %s, %s", env_profiling_mode, prof_conf.options); - } else { - if (options.profiling_mode != "1" || options.profiling_options.empty()) { - return SUCCESS; - } + if (options.profiling_mode == "1" && !options.profiling_options.empty()) { // enable profiling by ge option - if (memcpy_s(prof_conf.options, sizeof(prof_conf.options), options.profiling_options.c_str(), + if (memcpy_s(prof_conf.options, MSPROF_OPTIONS_DEF_LEN_MAX, options.profiling_options.c_str(), sizeof(options.profiling_options.c_str())) != EOK) { GELOGE(INTERNAL_ERROR, "copy profiling_options failed."); return INTERNAL_ERROR; } - is_load_profiling_ = true; is_execute_profiling_ = true; GELOGI("The profiling in options is %s, %s", options.profiling_mode.c_str(), prof_conf.options); + } else { + (void)mmGetEnv("PROFILING_MODE", env_profiling_mode, 
MMPA_MAX_PATH); + (void)mmGetEnv("PROFILING_OPTIONS", prof_conf.options, MSPROF_OPTIONS_DEF_LEN_MAX); + // The env is invalid + if ((env_profiling_mode == nullptr) || (strcmp("true", env_profiling_mode) != 0) + || (strcmp(prof_conf.options, "\0") == 0)) { + return SUCCESS; + } + // enable profiling by env + is_execute_profiling_ = true; + GELOGI("The profiling in env is %s, %s", env_profiling_mode, prof_conf.options); } - if (!is_load_profiling_) { + if (!is_execute_profiling_) { return SUCCESS; } + // Parse json str for bp fp + Status ret = ParseOptions(prof_conf.options); + if (ret != ge::SUCCESS) { + GELOGE(ge::PARAM_INVALID, "Parse taining trace param failed."); + return ge::PARAM_INVALID; + } + if (memcpy_s(prof_conf.jobId, sizeof(prof_conf.jobId), options.job_id.c_str(), sizeof(options.job_id.c_str())) != EOK) { GELOGE(INTERNAL_ERROR, "copy job_id failed."); @@ -134,23 +132,55 @@ ge::Status ProfilingManager::InitFromOptions(const Options &options, MsprofGeOpt return ge::SUCCESS; } +ge::Status ProfilingManager::ParseOptions(const std::string &options) { // Parses the profiling options JSON string and caches fp_point_ and bp_point_ when training_trace is "on"; returns PARAM_INVALID for empty input, for any other non-empty training_trace value, or when parsing throws. + if (options.empty()) { + GELOGE(ge::PARAM_INVALID, "Profiling options is empty."); + return ge::PARAM_INVALID; + } + try { + Json prof_options = Json::parse(options); + const std::string training_trace = prof_options[kTrainingTrace]; // NOTE(review): presumably throws when the key is absent or not a string, which the catch below turns into PARAM_INVALID; confirm against the Json library + if (training_trace.empty()) { + GELOGI("Training trace will not take effect."); + return ge::SUCCESS; + } + GELOGI("GE profiling training trace:%s", training_trace.c_str()); + if (training_trace != "on") { + GELOGE(ge::PARAM_INVALID, "Training trace param:%s is invalid.", training_trace.c_str()); + return ge::PARAM_INVALID; + } + fp_point_ = prof_options[kFpPoint]; // store into the members that the check below and GetFpBpPoint read + bp_point_ = prof_options[kBpPoint]; + if (!fp_point_.empty() && !bp_point_.empty()) { + GELOGI("Training trace bp fp is set, bp_point:%s, fp_point:%s.", bp_point_.c_str(), fp_point_.c_str()); + } + } catch (...) 
{ + GELOGE(FAILED, "Json prof_conf options is invalid."); + return ge::PARAM_INVALID; + } + return ge::SUCCESS; +} + FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::StopProfiling() { #ifdef DAVINCI_SUPPORT_PROFILING uint64_t module = GetProfilingModule(); + // The following if case will not be executed in normal case, inc case of ProfStopProfiling is abnormal int32_t device_num = static_cast(device_id_.size()); - auto device_id_ptr = std::unique_ptr(new (std::nothrow) uint32_t[device_num]); - if (device_id_ptr == nullptr) { - GELOGE(FAILED, "Stop profiling: device id ptr is null."); - return; - } - for (int32_t i = 0; i < device_num; i++) { - device_id_ptr[i] = static_cast(device_id_[i]); - } - rtError_t rt_ret = rtProfilerStop(module, device_num, device_id_ptr.get()); - if (rt_ret != RT_ERROR_NONE) { - GELOGW("Call rtProfilerStop failed, ret:%d", rt_ret); + if (device_num != 0) { + auto device_id_ptr = std::unique_ptr(new (std::nothrow) uint32_t[device_num]); + if (device_id_ptr == nullptr) { + GELOGE(FAILED, "Stop profiling: device id ptr is null."); + return; + } + for (int32_t i = 0; i < device_num; i++) { + device_id_ptr[i] = static_cast(device_id_[i]); + } + rtError_t rt_ret = rtProfilerStop(module, device_num, device_id_ptr.get()); + if (rt_ret != RT_ERROR_NONE) { + GELOGW("Call rtProfilerStop failed, ret:%d", rt_ret); + } } - + // stop profiling int32_t cb_ret = prof_cb_.msprofCtrlCallback(static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_FINALIZE), nullptr, 0); @@ -475,6 +505,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfFi std::lock_guard lock(mutex_); is_load_profiling_ = false; is_training_trace_ = false; + is_execute_profiling_ = false; // profiling plugin uninit PluginUnInit(); @@ -714,7 +745,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ProfilingManager::Profilin execute_model_prof_on = true; } GELOGI("Flag is_execute_profiling: %d, execute_model_prof_on: %d", 
is_execute_profiling_, execute_model_prof_on); - return is_execute_profiling_ || execute_model_prof_on; + return execute_model_prof_on; } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::PluginInit() const { @@ -744,5 +775,40 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::CallMs static_cast(&reporter_data), sizeof(ReporterData)); } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::GetFpBpPoint( + std::string &fp_point, std::string &bp_point) { // Outputs the cached fp_point_ and bp_point_ when both are set; otherwise parses them from the PROFILING_OPTIONS env. The outputs are left unchanged when the env is not readable or when parsing throws. + // Env or options mode, fp_point_/bp_point_ have been initialized on profiling init + if (!fp_point_.empty() && !bp_point_.empty()) { + GELOGI("Bp Fp have been initialized in env or options"); + fp_point = fp_point_; + bp_point = bp_point_; + GELOGI("Bp Fp have been initailized in env or options, bp_point: %s, fp_point: %s", bp_point.c_str(), fp_point.c_str()); + return; + } + // ProfApi mode and training trace is set + try { + char env_profiling_options[MSPROF_OPTIONS_DEF_LEN_MAX] = { 0x00 }; + INT32 ret = mmGetEnv("PROFILING_OPTIONS", env_profiling_options, MSPROF_OPTIONS_DEF_LEN_MAX); + if (ret != EN_OK) { + GELOGI("PROFILING_OPTIONS env is not exist."); + return; + } + GELOGI("Parse env PROFILING_OPTIONS:%s.", env_profiling_options); + Json prof_options = Json::parse(env_profiling_options); + + fp_point_ = prof_options[kFpPoint]; // NOTE(review): presumably throws when the key is absent or not a string, landing in the catch below; confirm against the Json library + bp_point_ = prof_options[kBpPoint]; + + fp_point = fp_point_; + bp_point = bp_point_; + if (!fp_point_.empty() && !bp_point_.empty()) { + GELOGI("Training trace bp fp is set, bp_point:%s, fp_point:%s.", bp_point_.c_str(), fp_point_.c_str()); + } + } catch (...) 
{ + GELOGE(FAILED, "Json prof options is invalid."); + return; // void function, so no status can be returned; the failure is only logged + } +} + } // namespace ge diff --git a/ge/common/profiling/profiling_manager.h b/ge/common/profiling/profiling_manager.h index c9434a10..c9ceed92 100755 --- a/ge/common/profiling/profiling_manager.h +++ b/ge/common/profiling/profiling_manager.h @@ -63,7 +63,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager { bool ProfilingTrainingTraceOn() const { return is_training_trace_; } bool ProfilingModelLoadOn() const { return is_load_profiling_; } bool ProfilingModelExecuteOn() const; - bool ProfilingOn() const { return is_load_profiling_ && is_execute_profiling_; } // only used by command pattern + bool ProfilingOn() const { return is_load_profiling_ && is_execute_profiling_; } // is_execute_profiling_ only used by ge option and env void ReportProfilingData(uint32_t model_id, const std::vector &task_desc_info, const std::vector &compute_graph_desc_info); void ProfilingTaskDescInfo(uint32_t model_id, const std::vector &task_desc_info, @@ -76,8 +76,10 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager { struct MsprofCallback &GetMsprofCallback() { return prof_cb_; } void SetMsprofCtrlCallback(MsprofCtrlCallback func) { prof_cb_.msprofCtrlCallback = func; } void SetMsprofReporterCallback(MsprofReporterCallback func) { prof_cb_.msprofReporterCallback = func; } + void GetFpBpPoint(std::string &fp_point, std::string &bp_point); private: Status InitFromOptions(const Options &options, MsprofGeOptions &prof_conf); + Status ParseOptions(const std::string &options); Status ProfParseParam(const std::map &config_para, int32_t &device_num, vector &device_list); Status ProfParseDeviceId(const std::map &config_para, @@ -96,6 +98,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager { uint32_t subscribe_count_; std::mutex mutex_; MsprofCallback prof_cb_; + std::string fp_point_; + std::string bp_point_; }; } // namespace ge #endif // 
GE_COMMON_PROFILING_PROFILING_MANAGER_H_ diff --git a/ge/graph/build/task_generator.cc b/ge/graph/build/task_generator.cc index 41607f1f..b506f945 100755 --- a/ge/graph/build/task_generator.cc +++ b/ge/graph/build/task_generator.cc @@ -49,8 +49,6 @@ const char *const kIsLastNode = "is_last_node"; const char *const kIsInputVar = "INPUT_IS_VAR"; const char *const kIsOutputVar = "OUTPUT_IS_VAR"; const char *const kProfilingMode = "PROFILING_MODE"; -const char *const kProfilingFpPoint = "FP_POINT"; -const char *const kProfilingBpPoint = "BP_POINT"; const uint32_t kProfilingArStep = 2; const uint64_t kProfilingFpStartLogid = 1; const uint64_t kProfilingBpEndLogid = 2; @@ -810,35 +808,23 @@ Status TaskGenerator::GetFpBpIndex(const ComputeGraphPtr &graph, ProfilingPoint vector &all_reduce_nodes, std::string &fp_point_str, std::string &bp_point_str) const { - if (ge::GetContext().GetOption(OPTION_EXEC_PROFILING_FPPONIT_OPTIONS, fp_point_str) == SUCCESS && - ge::GetContext().GetOption(OPTION_EXEC_PROFILING_BPPONIT_OPTIONS, bp_point_str) == SUCCESS && - !fp_point_str.empty() && !bp_point_str.empty()) { - return SUCCESS; - } + ProfilingManager::Instance().GetFpBpPoint(fp_point_str, bp_point_str); Status ret = SUCCESS; - const char *fp_point = std::getenv(kProfilingFpPoint); - if (fp_point == nullptr) { + if (fp_point_str.empty()) { ret = AutoFindFpOpIndex(graph, profiling_point); if (ret != SUCCESS) { GELOGW("First forward profiling op_index not set and FindFpOpIndex failed."); return FAILED; } - } else { - fp_point_str = string(fp_point); - GELOGI("Get fp_point_str from env %s", fp_point_str.c_str()); } - const char *bp_point = std::getenv(kProfilingBpPoint); - if (bp_point == nullptr) { + if (bp_point_str.empty()) { ret = AutoFindBpOpIndex(graph, profiling_point, all_reduce_nodes); if (ret != SUCCESS) { GELOGW("Last backward profiling op_index not set and FindBpOpIndex failed."); return FAILED; } - } else { - bp_point_str = string(bp_point); - GELOGI("Get bp_point_str from 
env %s", bp_point_str.c_str()); } return SUCCESS;