diff --git a/mindspore/ccsrc/profiler/device/gpu/cupti_interface.cc b/mindspore/ccsrc/profiler/device/gpu/cupti_interface.cc index 2958ba243e..0eb55a6db2 100644 --- a/mindspore/ccsrc/profiler/device/gpu/cupti_interface.cc +++ b/mindspore/ccsrc/profiler/device/gpu/cupti_interface.cc @@ -21,7 +21,6 @@ namespace mindspore { namespace profiler { namespace gpu { - inline void *LoadLibrary(const char *name) { auto handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); if (handle == nullptr) { diff --git a/mindspore/ccsrc/profiler/device/gpu/cupti_interface.h b/mindspore/ccsrc/profiler/device/gpu/cupti_interface.h index 9c3fa6ab2d..0f43c7c240 100644 --- a/mindspore/ccsrc/profiler/device/gpu/cupti_interface.h +++ b/mindspore/ccsrc/profiler/device/gpu/cupti_interface.h @@ -36,7 +36,6 @@ CUptiResult CuptiActivityGetNextRecord(uint8_t *buffer, size_t validBufferSizeBy CUptiResult CuptiActivityGetNumDroppedRecords(CUcontext context, uint32_t streamId, size_t *dropped); CUptiResult CuptiGetTimestamp(uint64_t *timestamp); CUptiResult CuptiGetResultString(CUptiResult result, const char **str); - } // namespace gpu } // namespace profiler } // namespace mindspore diff --git a/mindspore/ccsrc/profiler/device/gpu/data_saver.cc b/mindspore/ccsrc/profiler/device/gpu/data_saver.cc index 57e922deae..30e9e75196 100644 --- a/mindspore/ccsrc/profiler/device/gpu/data_saver.cc +++ b/mindspore/ccsrc/profiler/device/gpu/data_saver.cc @@ -22,7 +22,6 @@ namespace mindspore { namespace profiler { namespace gpu { - OpDetailInfo::OpDetailInfo(std::shared_ptr op_info, float proportion) : op_info_(op_info), proportion_(proportion) { // op_full_name is like 'xxx/xxx/{op_type}-op{node_id}' @@ -256,7 +255,6 @@ void DataSaver::WriteOpTimestamp(const std::string &saver_base_dir) { } ofs.close(); } - } // namespace gpu } // namespace profiler } // namespace mindspore diff --git a/mindspore/ccsrc/profiler/device/gpu/data_saver.h b/mindspore/ccsrc/profiler/device/gpu/data_saver.h index 76a703ed63..b215238abc 100644 
--- a/mindspore/ccsrc/profiler/device/gpu/data_saver.h +++ b/mindspore/ccsrc/profiler/device/gpu/data_saver.h @@ -25,7 +25,6 @@ namespace mindspore { namespace profiler { namespace gpu { - struct OpDetailInfo { std::string op_type_; std::string op_name_; diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc index 0e10e522f9..a0f3bb83ea 100644 --- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc +++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc @@ -60,8 +60,7 @@ namespace gpu { std::shared_ptr<GPUProfiler> GPUProfiler::profiler_inst_ = nullptr; int32_t GetThreadID() { - uint32_t thread_id = 0; - thread_id = static_cast<uint32_t>(pthread_self()); + uint32_t thread_id = static_cast<uint32_t>(pthread_self()); return thread_id; } @@ -95,6 +94,59 @@ std::string GetKernelFunc(const char *name) { } } +void CUPTIApiExit(const std::shared_ptr<GPUProfiler> &gpu_profiler_inst, CUpti_CallbackId cb_id, + const CUpti_CallbackData *cb_data) { + uint64_t start_timestamp = *cb_data->correlationData; + uint64_t end_timestamp = GetCUPTITimeStamp(); + switch (cb_id) { + case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: + gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuLaunchKernel", start_timestamp, end_timestamp); + break; + case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2: + case 
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: + gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemcpy", start_timestamp, end_timestamp); + break; + case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc: + case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2: + gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemAlloc", start_timestamp, end_timestamp); + break; + case CUPTI_DRIVER_TRACE_CBID_cuEventCreate: + case CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2: + case CUPTI_DRIVER_TRACE_CBID_cuEventRecord: + case CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize: + case CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime: + // In some cases, the callback of cuctxsetcurrent is only exist + // without entry, so this callback is ignored + case CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent: + break; + default: + gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "others_api", start_timestamp, end_timestamp); + break; + } +} + void CUPTICallBackFunc(void *user_data, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id, const CUpti_CallbackData *cb_data) { if (domain != CUPTI_CB_DOMAIN_DRIVER_API) { @@ -113,63 +165,10 @@ void CUPTICallBackFunc(void *user_data, CUpti_CallbackDomain domain, CUpti_Callb return; } - uint64_t start_timestamp; - uint64_t end_timestamp; - if (cb_data->callbackSite == CUPTI_API_ENTER) { *cb_data->correlationData = GetCUPTITimeStamp(); - } else if (cb_data->callbackSite == CUPTI_API_EXIT) { - start_timestamp = *cb_data->correlationData; - end_timestamp = GetCUPTITimeStamp(); - - switch (cb_id) { - case 
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: - case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: - case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: - gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuLaunchKernel", start_timestamp, end_timestamp); - break; - case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: - case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: - gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemcpy", start_timestamp, end_timestamp); - break; - case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc: - case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2: - gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemAlloc", start_timestamp, end_timestamp); - break; - case CUPTI_DRIVER_TRACE_CBID_cuEventCreate: - case CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2: - case CUPTI_DRIVER_TRACE_CBID_cuEventRecord: - case CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize: - case CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime: - // In some cases, the callback of cuctxsetcurrent is only exist - // without 
entry, so this callback is ignored - case CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent: - break; - default: - gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "others_api", start_timestamp, end_timestamp); - break; - } + CUPTIApiExit(gpu_profiler_inst, cb_id, cb_data); } } @@ -240,21 +239,7 @@ void GPUProfiler::EventLog(const Event &event) { << ",stream_id:" << event.stream_id << ",cb_id:" << event.cb_id; } -void GPUProfiler::OpsParser() { - MS_LOG(INFO) << "Count the number of events size:" << events_.size() - << " callback api:" << cupti_callback_events_count_ << " activity:" << cupti_activity_events_count_; - - if (cupti_activity_events_drop_count_ > 0 || cupti_callback_events_drop_count_ > 0) { - MS_LOG(WARNING) - << "The total number of events exceeded the profiler's processing capacity, Some events were discarded." - << " callback api events:" << cupti_activity_events_drop_count_ - << " activity api events:" << cupti_callback_events_drop_count_; - } - - if (events_.size() == 0) { - return; - } - +void GPUProfiler::ProcessEvents() { for (Event &event : events_) { if (event.op_name.empty()) { FixOpNameByCorrelationId(&event); @@ -286,7 +271,24 @@ void GPUProfiler::OpsParser() { } } } +} + +void GPUProfiler::OpsParser() { + MS_LOG(INFO) << "Count the number of events size:" << events_.size() + << " callback api:" << cupti_callback_events_count_ << " activity:" << cupti_activity_events_count_; + + if (cupti_activity_events_drop_count_ > 0 || cupti_callback_events_drop_count_ > 0) { + MS_LOG(WARNING) + << "The total number of events exceeded the profiler's processing capacity, some events were discarded." 
+ << " activity api events:" << cupti_activity_events_drop_count_ + << " callback api events:" << cupti_callback_events_drop_count_; + } + if (events_.size() == 0) { + return; + } + + ProcessEvents(); MS_LOG(DEBUG) << "GPU_profiler, op_name, op_count , kernel_count, kernel_api_count,|" ",cupti_activity_total_time, cupti_api_call_total_time, op_host_cost_total_time,|" ",cupti_activity_average_time,cupti_api_call_average_time, op_host_cost_average_time" @@ -490,8 +492,7 @@ void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *b GPUProfiler::GetInstance()->ProcessBuffer(ctx, streamId, buffer, size, validSize); } -void HandleActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record) { - CUpti_ActivityMemcpy *memcpy = reinterpret_cast<CUpti_ActivityMemcpy *>(record); +void ProcessActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record, CUpti_ActivityMemcpy *memcpy) { switch (memcpy->copyKind) { case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD: profilingData->activity_type = ActivityType::kMemcpyH2D; @@ -534,6 +535,12 @@ void HandleActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record) { profilingData->kernel_name = "MemcpyUnknown"; break; } +} + +void HandleActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record) { + CUpti_ActivityMemcpy *memcpy = reinterpret_cast<CUpti_ActivityMemcpy *>(record); + ProcessActivityMemcpyRecord(profilingData, record, memcpy); + profilingData->kernel_type = "cuMemcpy"; profilingData->api_type = CUPTIApiType::kActivity; profilingData->start_time_stamp = memcpy->start; @@ -687,7 +694,6 @@ REGISTER_PYBIND_DEFINE(GPUProfiler_, ([](const py::module *m) { .def("sync_enable", &GPUProfiler::SyncEnable, py::arg("enable_flag"), "enable or disable synchronization profiling"); })); - } // namespace gpu } // namespace profiler } // namespace mindspore diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h index 133bf3a2e1..f510a63bd7 100644 --- 
a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h +++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h @@ -129,6 +129,7 @@ class GPUProfiler { void CUPTIAPI ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize); void OpDataProducerBegin(const std::string op_name, void *stream); void OpDataProducerEnd(); + void ProcessEvents(); private: GPUProfiler() = default;