From 25cae1a2e7c662055d6eba62aeb6f2e7ab0497a5 Mon Sep 17 00:00:00 2001 From: askmiao Date: Sun, 9 Aug 2020 09:12:30 +0800 Subject: [PATCH] add profiler feature --- mindspore/ccsrc/CMakeLists.txt | 10 +- mindspore/ccsrc/profiler/CMakeLists.txt | 5 + .../profiler/device/gpu/cupti_interface.cc | 134 ++++ .../profiler/device/gpu/cupti_interface.h | 44 ++ .../profiler/device/gpu/gpu_profiling.cc | 700 ++++++++++++++++++ .../ccsrc/profiler/device/gpu/gpu_profiling.h | 174 +++++ .../runtime/device/gpu/gpu_kernel_runtime.cc | 15 + mindspore/core/utils/log_adapter.cc | 1 + mindspore/core/utils/log_adapter.h | 1 + 9 files changed, 1081 insertions(+), 3 deletions(-) create mode 100644 mindspore/ccsrc/profiler/CMakeLists.txt create mode 100644 mindspore/ccsrc/profiler/device/gpu/cupti_interface.cc create mode 100644 mindspore/ccsrc/profiler/device/gpu/cupti_interface.h create mode 100644 mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc create mode 100644 mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index cd18ff2169..11e830d122 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -30,10 +30,14 @@ if(ENABLE_GPU) if(NOT CUDNN_PATH OR CUDNN_PATH STREQUAL "") set(CUDNN_PATH ${CUDA_PATH}) endif() + if(NOT CUPTI_INCLUDE_DIRS OR CUPTI_INCLUDE_DIRS STREQUAL "") + set(CUPTI_INCLUDE_DIRS ${CUDA_PATH}/extras/CUPTI/include) + endif() message("CUDA_PATH: ${CUDA_PATH}") message("CUDA_INCLUDE_DIRS: ${CUDA_INCLUDE_DIRS}") message("CUDNN_PATH: ${CUDNN_PATH}") - include_directories(${CUDNN_PATH} ${CUDA_PATH} ${CUDA_INCLUDE_DIRS}) + message("CUPTI_INCLUDE_DIRS: ${CUPTI_INCLUDE_DIRS}") + include_directories(${CUDNN_PATH} ${CUDA_PATH} ${CUDA_INCLUDE_DIRS} ${CUPTI_INCLUDE_DIRS}) file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "runtime/device/gpu/*.cc" @@ -130,7 +134,7 @@ set(SUB_COMP frontend/operator pipeline/jit pipeline/pynative - common debug pybind_api utils 
vm + common debug pybind_api utils vm profiler ) foreach (_comp ${SUB_COMP}) @@ -259,7 +263,7 @@ if (ENABLE_GPU) ${CUDNN_PATH}/lib64/libcudnn.so ${CUDA_PATH}/lib64/libcudart.so ${CUDA_PATH}/lib64/stubs/libcuda.so - ${CUDA_PATH}/lib64/libcusolver.so) + ${CUDA_PATH}/lib64/libcusolver.so) if (ENABLE_MPI) set_target_properties(_ms_mpi PROPERTIES INSTALL_RPATH ${ORIGIN_PATH}) endif() diff --git a/mindspore/ccsrc/profiler/CMakeLists.txt b/mindspore/ccsrc/profiler/CMakeLists.txt new file mode 100644 index 0000000000..6e9e1b7423 --- /dev/null +++ b/mindspore/ccsrc/profiler/CMakeLists.txt @@ -0,0 +1,5 @@ +if (ENABLE_GPU) + file(GLOB_RECURSE PROFILER_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "device/gpu/*.cc") + set_property(SOURCE ${PROFILER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PROFILER) + add_library(_mindspore_profiler_obj OBJECT ${PROFILER_SRC_LIST}) +endif () \ No newline at end of file diff --git a/mindspore/ccsrc/profiler/device/gpu/cupti_interface.cc b/mindspore/ccsrc/profiler/device/gpu/cupti_interface.cc new file mode 100644 index 0000000000..b7d60fd958 --- /dev/null +++ b/mindspore/ccsrc/profiler/device/gpu/cupti_interface.cc @@ -0,0 +1,134 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include <dlfcn.h>
+#include <cupti.h>
+#include "utils/log_adapter.h"
+#include "profiler/device/gpu/cupti_interface.h"
+
+namespace mindspore {
+namespace profiler {
+namespace gpu {
+
+// Opens the shared library `name` with dlopen(RTLD_LAZY | RTLD_LOCAL).
+// Reports through MS_LOG(EXCEPTION), including the dlerror() text, when the library cannot be opened.
+inline void *LoadLibrary(const char *name) {
+  auto handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
+  if (handle == nullptr) {
+    const char *reason = dlerror();
+    MS_LOG(EXCEPTION) << "Load lib " << name << " failed: " << (reason != nullptr ? reason : "unknown error")
+                      << ". Please check whether the path of CUPTI is configured in LD_LIBRARY_PATH";
+  }
+  return handle;
+}
+
+// Returns the handle of libcupti.so; the library is opened once, on first use.
+inline void *GetCUPTIHandle() {
+  static void *handle = LoadLibrary("libcupti.so");
+  return handle;
+}
+
+// Looks up the symbol `name` in libcupti.so.
+// Reports through MS_LOG(EXCEPTION) when the symbol is not exported by the loaded library.
+inline void *GetCUPTIFunc(const char *name) {
+  static void *handle = GetCUPTIHandle();
+  void *func = dlsym(handle, name);
+  if (func == nullptr) {
+    const char *reason = dlerror();
+    MS_LOG(EXCEPTION) << "Load func " << name << " failed: " << (reason != nullptr ? reason : "unknown error")
+                      << ". Make sure the installed libcupti.so exports it";
+  }
+  return func;
+}
+
+// Signatures of the CUPTI entry points that are resolved at runtime.
+typedef CUptiResult (*CuptiSubscribeFunc)(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback,
+                                          void *userdata);
+typedef CUptiResult (*CuptiEnableDomainFunc)(uint32_t enable, CUpti_SubscriberHandle subscriber,
+                                             CUpti_CallbackDomain domain);
+typedef CUptiResult (*CuptiActivityEnableFunc)(CUpti_ActivityKind kind);
+typedef CUptiResult (*CuptiActivityRegisterCallbacksFunc)(CUpti_BuffersCallbackRequestFunc funcBufferRequested,
+                                                          CUpti_BuffersCallbackCompleteFunc funcBufferCompleted);
+typedef CUptiResult (*CuptiUnsubscribeFunc)(CUpti_SubscriberHandle subscriber);
+typedef CUptiResult (*CuptiActivityFlushAllFunc)(uint32_t flag);
+typedef CUptiResult (*CuptiActivityDisableFunc)(CUpti_ActivityKind kind);
+typedef CUptiResult (*CuptiActivityGetNextRecordFunc)(uint8_t *buffer, size_t validBufferSizeBytes,
+                                                      CUpti_Activity **record);
+typedef CUptiResult (*CuptiActivityGetNumDroppedRecordsFunc)(CUcontext context, uint32_t streamId, size_t *dropped);
+typedef CUptiResult (*CuptiGetTimestampFunc)(uint64_t *timestamp);
+typedef CUptiResult (*CuptiGetResultStringFunc)(CUptiResult result, const char **str);
+typedef CUptiResult (*CuptiGetStreamIdFunc)(CUcontext context, CUstream stream, uint32_t *streamId);
+typedef CUptiResult (*CuptiGetDeviceIdFunc)(CUcontext context, uint32_t *deviceId);
+
+// Each wrapper below resolves its CUPTI symbol once (function-local static) and forwards the call unchanged.
+
+// Forwards to cuptiSubscribe.
+CUptiResult CuptiSubscribe(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback, void *userdata) {
+  static auto func_ptr = reinterpret_cast<CuptiSubscribeFunc>(GetCUPTIFunc("cuptiSubscribe"));
+  return func_ptr(subscriber, callback, userdata);
+}
+
+// Forwards to cuptiEnableDomain.
+CUptiResult CuptiEnableDomain(uint32_t enable, CUpti_SubscriberHandle subscriber, CUpti_CallbackDomain domain) {
+  static auto func_ptr = reinterpret_cast<CuptiEnableDomainFunc>(GetCUPTIFunc("cuptiEnableDomain"));
+  return func_ptr(enable, subscriber, domain);
+}
+
+// Forwards to cuptiActivityEnable.
+CUptiResult CuptiActivityEnable(CUpti_ActivityKind kind) {
+  static auto func_ptr = reinterpret_cast<CuptiActivityEnableFunc>(GetCUPTIFunc("cuptiActivityEnable"));
+  return func_ptr(kind);
+}
+
+// Forwards to cuptiActivityRegisterCallbacks.
+CUptiResult CuptiActivityRegisterCallbacks(CUpti_BuffersCallbackRequestFunc funcBufferRequested,
+                                           CUpti_BuffersCallbackCompleteFunc funcBufferCompleted) {
+  static auto func_ptr =
+    reinterpret_cast<CuptiActivityRegisterCallbacksFunc>(GetCUPTIFunc("cuptiActivityRegisterCallbacks"));
+  return func_ptr(funcBufferRequested, funcBufferCompleted);
+}
+
+// Forwards to cuptiUnsubscribe.
+CUptiResult CuptiUnsubscribe(CUpti_SubscriberHandle subscriber) {
+  static auto func_ptr = reinterpret_cast<CuptiUnsubscribeFunc>(GetCUPTIFunc("cuptiUnsubscribe"));
+  return func_ptr(subscriber);
+}
+
+// Forwards to cuptiActivityFlushAll.
+CUptiResult CuptiActivityFlushAll(uint32_t flag) {
+  static auto func_ptr = reinterpret_cast<CuptiActivityFlushAllFunc>(GetCUPTIFunc("cuptiActivityFlushAll"));
+  return func_ptr(flag);
+}
+
+// Forwards to cuptiActivityDisable.
+CUptiResult CuptiActivityDisable(CUpti_ActivityKind kind) {
+  static auto func_ptr = reinterpret_cast<CuptiActivityDisableFunc>(GetCUPTIFunc("cuptiActivityDisable"));
+  return func_ptr(kind);
+}
+
+// Forwards to cuptiActivityGetNextRecord.
+CUptiResult CuptiActivityGetNextRecord(uint8_t *buffer, size_t validBufferSizeBytes, CUpti_Activity **record) {
+  static auto func_ptr =
+    reinterpret_cast<CuptiActivityGetNextRecordFunc>(GetCUPTIFunc("cuptiActivityGetNextRecord"));
+  return func_ptr(buffer, validBufferSizeBytes, record);
+}
+
+// Forwards to cuptiActivityGetNumDroppedRecords.
+CUptiResult CuptiActivityGetNumDroppedRecords(CUcontext context, uint32_t streamId, size_t *dropped) {
+  static auto func_ptr =
+    reinterpret_cast<CuptiActivityGetNumDroppedRecordsFunc>(GetCUPTIFunc("cuptiActivityGetNumDroppedRecords"));
+  return func_ptr(context, streamId, dropped);
+}
+
+// Forwards to cuptiGetTimestamp.
+CUptiResult CuptiGetTimestamp(uint64_t *timestamp) {
+  static auto func_ptr = reinterpret_cast<CuptiGetTimestampFunc>(GetCUPTIFunc("cuptiGetTimestamp"));
+  return func_ptr(timestamp);
+}
+
+// Forwards to cuptiGetResultString.
+CUptiResult CuptiGetResultString(CUptiResult result, const char **str) {
+  static auto func_ptr = reinterpret_cast<CuptiGetResultStringFunc>(GetCUPTIFunc("cuptiGetResultString"));
+  return func_ptr(result, str);
+}
+
+// Forwards to cuptiGetStreamId.
+CUptiResult CuptiGetStreamId(CUcontext context, CUstream stream, uint32_t *streamId) {
+  static auto func_ptr = reinterpret_cast<CuptiGetStreamIdFunc>(GetCUPTIFunc("cuptiGetStreamId"));
+  return func_ptr(context, stream, streamId);
+}
+
+// Forwards to cuptiGetDeviceId.
+// The symbol name must match the function-pointer type: resolving "cuptiSubscribe" here would invoke a
+// different CUPTI function through a mismatched signature.
+CUptiResult CuptiGetDeviceId(CUcontext context, uint32_t *deviceId) {
+  static auto func_ptr = reinterpret_cast<CuptiGetDeviceIdFunc>(GetCUPTIFunc("cuptiGetDeviceId"));
+  return func_ptr(context, deviceId);
+}
+}  // namespace gpu
+}  // namespace profiler
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/profiler/device/gpu/cupti_interface.h b/mindspore/ccsrc/profiler/device/gpu/cupti_interface.h
new file mode 100644
index 0000000000..9c3fa6ab2d
--- /dev/null
+++ b/mindspore/ccsrc/profiler/device/gpu/cupti_interface.h
@@ -0,0 +1,44 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CUPTI_INTERFACE_H
+#define MINDSPORE_CUPTI_INTERFACE_H
+// The declarations below use CUDA driver and CUPTI types, so pull in their definitions here instead of
+// relying on every includer to have included them first.
+#include <cuda.h>
+#include <cupti.h>
+#include <cstddef>
+#include <cstdint>
+
+#ifndef FUNC_EXPORT
+#define FUNC_EXPORT __attribute__((visibility("default")))
+#endif
+namespace mindspore {
+namespace profiler {
+namespace gpu {
+// Wrappers around the CUPTI C API. Each one has the same parameters and return value as the CUPTI function
+// of the same name (with a lower-case "cupti" prefix).
+
+// Callback-API subscription and id queries.
+CUptiResult CuptiSubscribe(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback, void *userdata);
+CUptiResult CuptiEnableDomain(uint32_t enable, CUpti_SubscriberHandle subscriber, CUpti_CallbackDomain domain);
+CUptiResult CuptiGetStreamId(CUcontext context, CUstream stream, uint32_t *streamId);
+CUptiResult CuptiGetDeviceId(CUcontext context, uint32_t *deviceId);
+
+// Activity-API control, record iteration and miscellaneous helpers.
+CUptiResult CuptiActivityEnable(CUpti_ActivityKind kind);
+CUptiResult CuptiActivityRegisterCallbacks(CUpti_BuffersCallbackRequestFunc funcBufferRequested,
+                                           CUpti_BuffersCallbackCompleteFunc funcBufferCompleted);
+CUptiResult CuptiUnsubscribe(CUpti_SubscriberHandle subscriber);
+CUptiResult CuptiActivityFlushAll(uint32_t flag);
+CUptiResult CuptiActivityDisable(CUpti_ActivityKind kind);
+CUptiResult CuptiActivityGetNextRecord(uint8_t *buffer, size_t validBufferSizeBytes, CUpti_Activity **record);
+CUptiResult CuptiActivityGetNumDroppedRecords(CUcontext context, uint32_t streamId, size_t *dropped);
+CUptiResult CuptiGetTimestamp(uint64_t *timestamp);
+CUptiResult CuptiGetResultString(CUptiResult result, const char **str);
+
+}  // namespace gpu
+}  // namespace profiler
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CUPTI_INTERFACE_H
diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc
new file mode 100644
index 0000000000..7719522cb2
--- /dev/null
+++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc
@@ -0,0 +1,700 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cxxabi.h>
+#include <chrono>
+#include <cmath>
+#include "profiler/device/gpu/gpu_profiling.h"
+#include "profiler/device/gpu/cupti_interface.h"
+#include "utils/log_adapter.h"
+#include "pybind_api/api_register.h"
+
+namespace mindspore {
+namespace profiler {
+namespace gpu {
+#define BUF_SIZE (32 * 1024)
+#define ALIGN_SIZE (8)
+// Evaluates `expression` exactly once and logs an error when it does not return CUPTI_SUCCESS.
+// (Evaluating it a second time inside the failure branch would re-issue the failing CUPTI call.)
+#define CHECK_CUPTI_RET_WITH_ERROR(expression, message)                                         \
+  {                                                                                             \
+    CUptiResult cupti_ret_ = (expression);                                                      \
+    if (cupti_ret_ != CUPTI_SUCCESS) {                                                          \
+      const char *errstr = nullptr;                                                             \
+      CuptiGetResultString(cupti_ret_, &errstr);                                                \
+      MS_LOG(ERROR) << "CUPTI Error:" << (errstr != nullptr ? errstr : "unknown")              \
+                    << " function:" << message;                                                 \
+    }                                                                                           \
+  }
+
+// Same as CHECK_CUPTI_RET_WITH_ERROR but reports through MS_LOG(EXCEPTION).
+#define CHECK_CUPTI_RET_WITH_EXCEPT(expression, message)                                        \
+  {                                                                                             \
+    CUptiResult cupti_ret_ = (expression);                                                      \
+    if (cupti_ret_ != CUPTI_SUCCESS) {                                                          \
+      const char *errstr = nullptr;                                                             \
+      CuptiGetResultString(cupti_ret_, &errstr);                                                \
+      MS_LOG(EXCEPTION) << "CUPTI Error:" << (errstr != nullptr ? errstr : "unknown")          \
+                        << " function:" << message;                                             \
+    }                                                                                           \
+  }
+// Evaluates a CUDA runtime call once and logs an error when it does not return cudaSuccess.
+#define CHECK_CUDA_RET_WITH_ERROR(expression, message)                                   \
+  {                                                                                      \
+    cudaError_t status = (expression);                                                   \
+    if (status != cudaSuccess) {                                                         \
+      MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " \
+                    << cudaGetErrorString(status);                                       \
+    }                                                                                    \
+  }
+// Logs an error and returns from the enclosing void function when `ptr` is null.
+#define PROFILER_ERROR_IF_NULLPTR(ptr)                           \
+  do {                                                           \
+    if ((ptr) == nullptr) {                                      \
+      MS_LOG(ERROR) << ": The pointer[" << #ptr << "] is null."; \
+      return;                                                    \
+    }                                                            \
+  } while (0)
+
+std::shared_ptr<GPUProfiler> GPUProfiler::profiler_inst_ = nullptr;
+
+// Returns the calling thread's pthread id converted to int32_t.
+int32_t GetThreadID() {
+  return static_cast<int32_t>(pthread_self());
+}
+
+// Returns the CUPTI stream id of `stream` in `context`; 0 when `stream` is null or the query fails.
+uint32_t GetStreamID(const CUcontext context, const void *stream) {
+  uint32_t stream_id = 0;
+  if (stream != nullptr) {
+    CHECK_CUPTI_RET_WITH_ERROR(CuptiGetStreamId(context, (CUstream)stream, &stream_id), "CuptiGetStreamId");
+  }
+  return stream_id;
+}
+
+// Returns the current CUPTI timestamp; 0 when the query fails.
+uint64_t GetCUPTITimeStamp() {
+  uint64_t time_stamp = 0;
+  CHECK_CUPTI_RET_WITH_ERROR(CuptiGetTimestamp(&time_stamp), "CuptiGetTimestamp");
+  return time_stamp;
+}
+
+// Returns the host system-clock time since its epoch, in nanoseconds.
+uint64_t GetHostTimeStamp() {
+  auto cur_sys_clock = std::chrono::system_clock::now();
+  uint64_t cur_time_stamp =
+    std::chrono::duration_cast<std::chrono::nanoseconds>(cur_sys_clock.time_since_epoch()).count();
+  return cur_time_stamp;
+}
+
+// Returns the demangled form of `name`, or `name` itself when it cannot be demangled.
+// A null `name` yields an empty string.
+std::string GetKernelFunc(const char *name) {
+  if (name == nullptr) {
+    return "";
+  }
+  char *demangledName = abi::__cxa_demangle(name, nullptr, nullptr, nullptr);
+  if (demangledName == nullptr) {
+    return name;
+  }
+  // __cxa_demangle returns a malloc()-allocated buffer owned by the caller: copy it, then release it.
+  std::string result(demangledName);
+  free(demangledName);
+  return result;
+}
+
+// CUPTI callback for driver-API calls. On API entry the current CUPTI timestamp is stored in the
+// per-call correlation data; on API exit it is read back and the call is reported to the profiler
+// under a coarse category name.
+void CUPTICallBackFunc(void *user_data, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id,
+                       const CUpti_CallbackData *cb_data) {
+  if (domain != CUPTI_CB_DOMAIN_DRIVER_API) {
+    return;
+  }
+  auto gpu_profiler_inst = GPUProfiler::GetInstance();
+  PROFILER_ERROR_IF_NULLPTR(gpu_profiler_inst);
+  if (!gpu_profiler_inst->GetEnableFlag()) {
+    return;
+  }
+
+  PROFILER_ERROR_IF_NULLPTR(cb_data);
+  if (cb_data->context == nullptr) {
+    MS_LOG(DEBUG) << "callback data context is null , correlation Id:" << cb_data->correlationId
+                  << " callback id:" << cb_id;
+    return;
+  }
+
+  if (cb_data->callbackSite == CUPTI_API_ENTER) {
+    *cb_data->correlationData = GetCUPTITimeStamp();
+    return;
+  }
+  if (cb_data->callbackSite != CUPTI_API_EXIT) {
+    return;
+  }
+
+  uint64_t start_timestamp = *cb_data->correlationData;
+  uint64_t end_timestamp = GetCUPTITimeStamp();
+
+  switch (cb_id) {
+    case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
+    case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
+    case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice:
+      gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuLaunchKernel", start_timestamp, end_timestamp);
+      break;
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync:
+      gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemcpy", start_timestamp, end_timestamp);
+      break;
+    case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc:
+    case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2:
+      gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemAlloc", start_timestamp, end_timestamp);
+      break;
+    // Event APIs are deliberately not recorded.
+    case CUPTI_DRIVER_TRACE_CBID_cuEventCreate:
+    case CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2:
+    case CUPTI_DRIVER_TRACE_CBID_cuEventRecord:
+    case CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize:
+    case CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime:
+    // cuCtxSetCurrent sometimes produces an exit callback without a matching entry callback,
+    // so it has no valid start timestamp and is ignored as well.
+    case CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent:
+      break;
+    default:
+      gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "others_api", start_timestamp, end_timestamp);
+      break;
+  }
+}
+
+// Returns the process-wide profiler, creating it on first use. Returns nullptr if allocation fails.
+// NOTE(review): creation is not synchronized; confirm the first call happens before CUPTI callbacks start.
+std::shared_ptr<GPUProfiler> GPUProfiler::GetInstance() {
+  if (profiler_inst_ == nullptr) {
+    profiler_inst_ = std::shared_ptr<GPUProfiler>(new (std::nothrow) GPUProfiler());
+  }
+  return profiler_inst_;
+}
+
+// Selects how op time is measured: CUDA events (true) or host timestamps (false).
+void GPUProfiler::SyncEnable(const bool enable_flag) {
+  MS_LOG(INFO) << "GPU Profiler synchronous enable flag:" << enable_flag;
+  sync_enable_flag_ = enable_flag;
+}
+
+// Flushes pending CUPTI activity buffers, then switches event collection on or off.
+void GPUProfiler::StepProfilingEnable(const bool enable_flag) {
+  MS_LOG(INFO) << "GPU Profiler enable flag:" << enable_flag;
+  CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityFlushAll(0), "CuptiActivityFlushAll");
+  enable_flag_ = enable_flag;
+}
+
+// Fills in the op name of an activity event from the callback event that shares its correlation id.
+void GPUProfiler::FixOpNameByCorrelationId(Event *event) {
+  PROFILER_ERROR_IF_NULLPTR(event);
+  if (event->api_type != CUPTIApiType::kActivity) {
+    return;
+  }
+  auto iter = op_name_map_.find(event->correlation_id);
+  if (iter != op_name_map_.end()) {
+    // Copy rather than move: moving would leave an empty name in the map for any later activity
+    // record that carries the same correlation id.
+    event->op_name = iter->second;
+  }
+}
+
+// Appends `event` to the event list unless the per-kind limit is reached, in which case the
+// matching drop counter is incremented instead.
+void GPUProfiler::AddEvent(Event &&event) {
+  // Driver-API callbacks and activity-buffer processing may add events concurrently.
+  std::unique_lock<decltype(event_mutex_)> lock(event_mutex_);
+  switch (event.api_type) {
+    case CUPTIApiType::kCallback: {
+      if (cupti_callback_events_count_ < max_cupti_callback_events_) {
+        events_.emplace_back(std::move(event));
+        cupti_callback_events_count_++;
+      } else {
+        cupti_callback_events_drop_count_++;
+      }
+      break;
+    }
+    case CUPTIApiType::kActivity: {
+      if (cupti_activity_events_count_ < max_cupti_activity_events_) {
+        events_.emplace_back(std::move(event));
+        cupti_activity_events_count_++;
+      } else {
+        cupti_activity_events_drop_count_++;
+      }
+      break;
+    }
+    default:
+      break;
+  }
+}
+
+// Writes one event as a comma-separated DEBUG log line.
+void GPUProfiler::EventLog(const Event &event) {
+  MS_LOG(DEBUG) << "GPUProfiler"
+                << ",\"kernel_name:" << event.kernel_name << "\",kernel_type:" << event.kernel_type
+                << ",api_type:" << static_cast<int>(event.api_type) << ",start_time_stamp:" << event.start_time_stamp
+                << ",end_time_stamp:" << event.end_time_stamp << ",cost:,"
+                << (event.end_time_stamp - event.start_time_stamp) / kTimeUnit << ",op_name:" << event.op_name
+                << ",device_id:" << event.device_id << ",correlation_id:" << event.correlation_id
+                << ",thread_id:" << event.thread_id << ",context_id:" << event.context_id
+                << ",stream_id:" << event.stream_id << ",cb_id:" << event.cb_id;
+}
+
+// Copies the kernel launch configuration or the transferred byte count of an activity event into
+// `opInfo`. Non-activity events are ignored.
+void fillActivityInfo(OpInfo *opInfo, const Event &event) {
+  if (opInfo == nullptr || event.api_type != CUPTIApiType::kActivity) {
+    return;
+  }
+  switch (event.activity_type) {
+    case ActivityType::kKernel:
+      opInfo->kernel_info.registers_per_thread = event.kernel_info.registers_per_thread;
+      opInfo->kernel_info.static_shared_memory = event.kernel_info.static_shared_memory;
+      opInfo->kernel_info.dynamic_shared_memory = event.kernel_info.dynamic_shared_memory;
+      opInfo->kernel_info.block_x = event.kernel_info.block_x;
+      opInfo->kernel_info.block_y = event.kernel_info.block_y;
+      opInfo->kernel_info.block_z = event.kernel_info.block_z;
+      opInfo->kernel_info.grid_x = event.kernel_info.grid_x;
+      opInfo->kernel_info.grid_y = event.kernel_info.grid_y;
+      opInfo->kernel_info.grid_z = event.kernel_info.grid_z;
+      break;
+    case ActivityType::kMemcpyH2D:
+    case ActivityType::kMemcpyD2H:
+    case ActivityType::kMemcpyH2A:
+    case ActivityType::kMemcpyA2H:
+    case ActivityType::kMemcpyA2D:
+    case ActivityType::kMemcpyD2A:
+    // Device-to-device copies carry a byte count like every other memcpy kind.
+    case ActivityType::kMemcpyD2D:
+    case ActivityType::kMemcpyP2P:
+    case ActivityType::kMemcpyH2H:
+    case ActivityType::kMemset:
+    case ActivityType::kMemcpyUnknown:
+      opInfo->memcpy_info.bytes = event.memcpy_info.bytes;
+      break;
+    default:
+      break;
+  }
+}
+
+// Aggregates the collected events per op and logs one summary line per op, ordered by total
+// activity time (largest first).
+void GPUProfiler::OpsParser() {
+  MS_LOG(INFO) << "Count the number of events size:" << events_.size()
+               << " callback api:" << cupti_callback_events_count_ << " activity:" << cupti_activity_events_count_;
+
+  if (cupti_activity_events_drop_count_ > 0 || cupti_callback_events_drop_count_ > 0) {
+    // Each counter is printed next to its own label.
+    MS_LOG(WARNING)
+      << "The total number of events exceeded the profiler's processing capacity, Some events were discarded."
+      << " callback api events:" << cupti_callback_events_drop_count_
+      << " activity api events:" << cupti_activity_events_drop_count_;
+  }
+
+  if (events_.empty()) {
+    return;
+  }
+
+  for (Event &event : events_) {
+    if (event.op_name.empty()) {
+      FixOpNameByCorrelationId(&event);
+    }
+
+    EventLog(event);
+
+    // Events that cannot be attributed to an op, and stream synchronizations, are not aggregated.
+    if (event.op_name.empty() || event.cb_id == CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize) {
+      continue;
+    }
+
+    auto iter = op_info_map_.find(event.op_name);
+    if (iter == op_info_map_.end()) {
+      continue;
+    }
+    switch (event.api_type) {
+      case CUPTIApiType::kCallback: {
+        iter->second.op_kernel_api_count += 1;
+        // The time unit from ns to us
+        iter->second.cupti_api_call_time += (event.end_time_stamp - event.start_time_stamp) / kTimeUnit;
+        break;
+      }
+      case CUPTIApiType::kActivity: {
+        iter->second.op_kernel_count += 1;
+        // The time unit from ns to us
+        iter->second.cupti_activity_time += (event.end_time_stamp - event.start_time_stamp) / kTimeUnit;
+        fillActivityInfo(&iter->second, event);
+        break;
+      }
+      default:
+        break;
+    }
+  }
+
+  MS_LOG(INFO) << "GPU_profiler, op_name, op_count , kernel_count, kernel_api_count,|"
+                  ",cupti_activity_total_time, cupti_api_call_total_time, op_host_cost_total_time,|"
+                  ",cupti_activity_average_time,cupti_api_call_average_time, op_host_cost_average_time,|"
+                  ",mem_bytes,registers_per_thread,static_shared_memory,dynamic_shared_memory"
+                  ",block_x,block_y,block_z,grid_x,grid_y,grid_z"
+               << std::endl;
+
+  std::vector<std::pair<std::string, OpInfo>> order_vec(op_info_map_.begin(), op_info_map_.end());
+
+  auto cmp_func = [](const std::pair<std::string, OpInfo> &a, const std::pair<std::string, OpInfo> &b) {
+    return a.second.cupti_activity_time > b.second.cupti_activity_time;
+  };
+  std::sort(order_vec.begin(), order_vec.end(), cmp_func);
+
+  for (const auto &item : order_vec) {
+    const OpInfo &info = item.second;
+    MS_LOG(INFO) << "GPU_profiler"
+                 << "," << item.first << "," << info.op_count << "," << info.op_kernel_count << ","
+                 << info.op_kernel_api_count << ","
+                 << "|," << info.cupti_activity_time << "," << info.cupti_api_call_time << ","
+                 << round(info.op_host_cost_time) << ","
+                 << "|," << round(info.cupti_activity_time / info.op_count) << ","
+                 << round(info.cupti_api_call_time / info.op_count) << ","
+                 << round(info.op_host_cost_time / info.op_count) << ","
+                 << "|," << info.memcpy_info.bytes << "," << info.kernel_info.registers_per_thread << ","
+                 << info.kernel_info.static_shared_memory << "," << info.kernel_info.dynamic_shared_memory << ","
+                 << info.kernel_info.block_x << "," << info.kernel_info.block_y << "," << info.kernel_info.block_z
+                 << "," << info.kernel_info.grid_x << "," << info.kernel_info.grid_y << ","
+                 << info.kernel_info.grid_z << std::endl;
+  }
+}
+
+// Builds a callback event for one completed driver-API call, remembers which op issued it (keyed by
+// correlation id, so activity records can be attributed later) and stores the event.
+void GPUProfiler::EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata,
+                                     const std::string &typestring, uint64_t startTimestamp, uint64_t endTimestamp) {
+  PROFILER_ERROR_IF_NULLPTR(cbdata);
+  Event event;
+  // Stays at the all-ones sentinel when the device id query fails.
+  uint32_t device_id = static_cast<uint32_t>(-1);
+  CuptiGetDeviceId(cbdata->context, &device_id);
+  if (cbdata->symbolName != nullptr) {
+    event.kernel_name = GetKernelFunc(cbdata->symbolName);
+  } else if (cbdata->functionName != nullptr) {
+    // Constructing std::string from a null pointer is undefined, so only assign a real name.
+    event.kernel_name = cbdata->functionName;
+  }
+  event.kernel_type = typestring;
+  event.api_type = CUPTIApiType::kCallback;
+  event.start_time_stamp = startTimestamp;
+  event.end_time_stamp = endTimestamp;
+  event.op_name = op_name_;
+  event.device_id = device_id;
+  event.correlation_id = cbdata->correlationId;
+  event.thread_id = GetThreadID();
+  event.context_id = cbdata->contextUid;
+  event.stream_id = GetStreamID(cbdata->context, stream_);
+  event.cb_id = cbid;
+  op_name_map_[event.correlation_id] = event.op_name;
+  AddEvent(std::move(event));
+}
+
+void CUPTIAPI ActivityAllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords);
+
+void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize);
+
+// Subscribes to driver-API callbacks, enables the memcpy/kernel activity kinds, registers the activity
+// buffer callbacks and records the GPU and host start times. `profileDataPath` is stored for later use.
+void GPUProfiler::Init(const std::string &profileDataPath = "") {
+  MS_LOG(INFO) << "Initialize GPU Profiling";
+  CHECK_CUPTI_RET_WITH_EXCEPT(CuptiSubscribe(&subscriber_, (CUpti_CallbackFunc)CUPTICallBackFunc, this),
+                              "CuptiSubscribe");
+  CHECK_CUPTI_RET_WITH_EXCEPT(CuptiEnableDomain(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API), "CuptiEnableDomain");
+
+  activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_MEMCPY);
+  activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_MEMCPY2);
+  activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_KERNEL);
+
+  for (const auto &activity_kind : activities_enable_) {
+    CHECK_CUPTI_RET_WITH_EXCEPT(CuptiActivityEnable(activity_kind), "CuptiActivityEnable");
+  }
+
+  CHECK_CUPTI_RET_WITH_EXCEPT(CuptiActivityRegisterCallbacks(ActivityAllocBuffer, ActivityProcessBuffer),
+                              "CuptiActivityRegisterCallbacks");
+
+  base_time_.gpu_start_time = GetCUPTITimeStamp();
+  base_time_.host_start_time = GetHostTimeStamp();
+
+  profile_data_path_ = profileDataPath;
+  MS_LOG(INFO) << "GPU start time(ns):" << base_time_.gpu_start_time
+               << " Host start time(ns):" << base_time_.host_start_time << " profile data path: " << profile_data_path_;
+}
+
+// Registers the op that is about to run: bumps its launch count (creating its OpInfo on first sight)
+// and remembers its name and stream as the "current" op for subsequently captured driver-API calls.
+void GPUProfiler::SetRunTimeData(const std::string &op_name, void *stream) {
+  auto iter = op_info_map_.find(op_name);
+  if (iter != op_info_map_.end()) {
+    iter->second.op_count += 1;
+  } else {
+    OpInfo op_info;
+    op_info.op_name = op_name;
+    op_info.stream = stream;
+    op_info.op_count = 1;
+    op_info_map_[op_name] = op_info;
+  }
+  op_name_ = op_name;
+  stream_ = stream;
+}
+
+// Adds `time_elapsed` to the accumulated host cost of `op_name`; unknown ops are ignored.
+// The value is added as given; OpDataProducerEnd scales it before calling this overload.
+void GPUProfiler::SetRunTimeData(const std::string &op_name, const float time_elapsed) {
+  auto iter = op_info_map_.find(op_name);
+  if (iter != op_info_map_.end()) {
+    iter->second.op_host_cost_time += time_elapsed;
+  }
+}
+
+// Marks the start of an op. In synchronous mode a CUDA start event is recorded on the op's stream,
+// otherwise the host timestamp is taken. The op is then registered as the current op.
+void GPUProfiler::OpDataProducerBegin(const std::string op_name, void *stream) {
+  if (sync_enable_flag_) {
+    CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&op_event_start_), "cudaEventCreate op event start failed");
+    CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&op_event_stop_), "cudaEventCreate op event stop failed");
+    // Record on the stream passed in: the member stream_ still refers to the previous op here because
+    // SetRunTimeData() only updates it below.
+    CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(op_event_start_, (CUstream)stream),
+                              "cudaEventRecord op event start failed");
+  } else {
+    op_host_time_start_ = GetHostTimeStamp();
+  }
+  SetRunTimeData(op_name, stream);
+}
+
+// Marks the end of the current op and adds its elapsed time to the op's host cost.
+// Synchronous mode: the cudaEventElapsedTime result is multiplied by kTimeUnit.
+// Otherwise: the host timestamp difference (ns) is divided by kTimeUnit.
+void GPUProfiler::OpDataProducerEnd() {
+  float op_time_elapsed = 0;
+  if (sync_enable_flag_) {
+    CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(op_event_stop_, (CUstream)stream_),
+                              "cudaEventRecord op event stop failed");
+    CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(op_event_start_), "cudaEventSynchronize op event start failed");
+    CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(op_event_stop_), "cudaEventSynchronize op event stop failed");
+    CHECK_CUDA_RET_WITH_ERROR(cudaEventElapsedTime(&op_time_elapsed, op_event_start_, op_event_stop_),
+                              "cudaEventElapsedTime failed");
+    CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_start_), "cudaEventDestroy op event start failed");
+    CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy op event stop failed");
+    op_time_elapsed = op_time_elapsed * kTimeUnit;
+  } else {
+    op_host_time_stop_ = GetHostTimeStamp();
+    op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit;
+  }
+  SetRunTimeData(op_name_, op_time_elapsed);
+}
+
+// Unsubscribes from CUPTI callbacks, flushes outstanding activity buffers and disables every activity
+// kind enabled in Init(). Does nothing when there is no active subscription.
+void GPUProfiler::StopCUPTI() {
+  if (subscriber_ == nullptr) {
+    return;
+  }
+  CHECK_CUPTI_RET_WITH_ERROR(CuptiUnsubscribe(subscriber_), "CuptiUnsubscribe");
+  CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityFlushAll(0), "CuptiActivityFlushAll");
+  for (const auto &activity_kind : activities_enable_) {
+    CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityDisable(activity_kind), "CuptiActivityDisable");
+  }
+  subscriber_ = nullptr;
+}
+
+// Stops collection, logs the per-op summary and then runs SaveProfileData().
+void GPUProfiler::Stop() {
+  MS_LOG(INFO) << "Stop GPU Profiling";
+  StopCUPTI();
+  OpsParser();
+  SaveProfileData();
+}
+
+// Clears the collected op and event data when a profile data path is configured.
+// NOTE(review): nothing is written to profile_data_path_ here, and the collected data is kept when
+// the path is empty - confirm both are intended.
+void GPUProfiler::SaveProfileData() {
+  if (profile_data_path_.empty()) {
+    MS_LOG(WARNING) << "profile_data_path is empty, skip save profile data.";
+    return;
+  }
+  op_info_map_.clear();
+  op_name_map_.clear();
+  events_.clear();
+}
+
+// CUPTI "buffer requested" callback: delegates to the profiler instance.
+void CUPTIAPI ActivityAllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
+  auto gpu_profiler_inst = GPUProfiler::GetInstance();
+  if (gpu_profiler_inst == nullptr) {
+    MS_LOG(ERROR) << "GPU profiler instance is nullptr";
+    // Hand CUPTI well-defined "no buffer" values instead of leaving the outputs uninitialized.
+    if (buffer != nullptr) {
+      *buffer = nullptr;
+    }
+    if (size != nullptr) {
+      *size = 0;
+    }
+    if (maxNumRecords != nullptr) {
+      *maxNumRecords = 0;
+    }
+    return;
+  }
+  gpu_profiler_inst->AllocBuffer(buffer, size, maxNumRecords);
+}
+
+// CUPTI "buffer completed" callback: delegates to the profiler instance.
+void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) {
+  PROFILER_ERROR_IF_NULLPTR(buffer);
+  auto gpu_profiler_inst = GPUProfiler::GetInstance();
+  if (gpu_profiler_inst == nullptr) {
+    MS_LOG(ERROR) << "GPU profiler instance is nullptr";
+    // The buffer was allocated by AllocBuffer(); release it since nobody will process it.
+    free(buffer);
+    return;
+  }
+  gpu_profiler_inst->ProcessBuffer(ctx, streamId, buffer, size, validSize);
+}
+
+// Fills `profillingData` from a CUPTI memcpy activity record; the copy kind selects the activity
+// type and the display name.
+void HandleActivityMemcpyRecord(Event *profillingData, CUpti_Activity *record) {
+  CUpti_ActivityMemcpy *memcpy = reinterpret_cast<CUpti_ActivityMemcpy *>(record);
+  switch (memcpy->copyKind) {
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
+      profillingData->activity_type = ActivityType::kMemcpyH2D;
+      profillingData->kernel_name = "MemcpyH2D";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
+      profillingData->activity_type = ActivityType::kMemcpyD2H;
+      profillingData->kernel_name = "MemcpyD2H";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
+      profillingData->activity_type = ActivityType::kMemcpyH2A;
+      profillingData->kernel_name = "MemcpyH2A";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
+      profillingData->activity_type = ActivityType::kMemcpyA2H;
+      profillingData->kernel_name = "MemcpyA2H";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
+      profillingData->activity_type = ActivityType::kMemcpyA2D;
+      profillingData->kernel_name = "MemcpyA2D";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
+      profillingData->activity_type = ActivityType::kMemcpyD2A;
+      profillingData->kernel_name = "MemcpyD2A";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
+      profillingData->activity_type = ActivityType::kMemcpyD2D;
+      profillingData->kernel_name = "MemcpyD2D";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
+      profillingData->activity_type = ActivityType::kMemcpyH2H;
+      profillingData->kernel_name = "MemcpyH2H";
+      break;
+    case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
+      profillingData->activity_type = ActivityType::kMemcpyP2P;
+      profillingData->kernel_name = "MemcpyP2P";
+      break;
+    default:
+      profillingData->activity_type = ActivityType::kMemcpyUnknown;
+      profillingData->kernel_name = "MemcpyUnknown";
+      break;
+  }
+  profillingData->kernel_type = "cuMemcpy";
+  profillingData->api_type = CUPTIApiType::kActivity;
+  profillingData->start_time_stamp = memcpy->start;
+  profillingData->end_time_stamp = memcpy->end;
+  profillingData->device_id = memcpy->deviceId;
+  profillingData->context_id = memcpy->contextId;
+  profillingData->stream_id = memcpy->streamId;
+  profillingData->correlation_id = memcpy->correlationId;
+  profillingData->memcpy_info.bytes = memcpy->bytes;
+  profillingData->memcpy_info.src_kind = memcpy->srcKind;
+  profillingData->memcpy_info.dst_kind = memcpy->dstKind;
+}
+
+// Fills `profillingData` from a CUPTI MEMCPY2 activity record; it is always reported as peer-to-peer.
+void HandleActivityMemcpy2Record(Event *profillingData, CUpti_Activity *record) {
+  CUpti_ActivityMemcpy2 *memcpyP2P = reinterpret_cast<CUpti_ActivityMemcpy2 *>(record);
+  profillingData->activity_type = ActivityType::kMemcpyP2P;
+  profillingData->kernel_name = "MemcpyP2P";
+  profillingData->kernel_type = "cuMemcpy";
+  profillingData->api_type = CUPTIApiType::kActivity;
+  profillingData->start_time_stamp = memcpyP2P->start;
+  profillingData->end_time_stamp = memcpyP2P->end;
+  profillingData->device_id = memcpyP2P->deviceId;
+  profillingData->context_id = memcpyP2P->contextId;
+  profillingData->stream_id = memcpyP2P->streamId;
+  profillingData->correlation_id = memcpyP2P->correlationId;
+  profillingData->memcpy_info.bytes = memcpyP2P->bytes;
+  profillingData->memcpy_info.src_kind = memcpyP2P->srcKind;
+  profillingData->memcpy_info.dst_kind = memcpyP2P->dstKind;
+}
+
+// Fills `profillingData` from a CUPTI memset activity record; the byte count is stored in memcpy_info.
+void HandleActivityMemsetRecord(Event *profillingData, CUpti_Activity *record) {
+  CUpti_ActivityMemset *memset = reinterpret_cast<CUpti_ActivityMemset *>(record);
+  profillingData->activity_type = ActivityType::kMemset;
+  profillingData->kernel_name = "MemorySet";
+  profillingData->api_type = CUPTIApiType::kActivity;
+  profillingData->start_time_stamp = memset->start;
+  profillingData->end_time_stamp = memset->end;
+  profillingData->device_id = memset->deviceId;
+  profillingData->context_id = memset->contextId;
+  profillingData->stream_id = memset->streamId;
+  profillingData->correlation_id = memset->correlationId;
+  profillingData->memcpy_info.bytes = memset->bytes;
+}
+
+// Fills `profillingData` from a CUPTI kernel activity record, including the launch configuration.
+void HandleActivityKernelRecord(Event *profillingData, CUpti_Activity *record) {
+  CUpti_ActivityKernel4 *kernel = reinterpret_cast<CUpti_ActivityKernel4 *>(record);
+  profillingData->activity_type = ActivityType::kKernel;
+  profillingData->api_type = CUPTIApiType::kActivity;
+  profillingData->kernel_name = GetKernelFunc(kernel->name);
+  profillingData->kernel_type = "cuLaunchKernel";
+  profillingData->start_time_stamp = kernel->start;
+  profillingData->end_time_stamp = kernel->end;
+  profillingData->device_id = kernel->deviceId;
+  profillingData->context_id = kernel->contextId;
+  profillingData->stream_id = kernel->streamId;
+  profillingData->correlation_id = kernel->correlationId;
+  profillingData->kernel_info.registers_per_thread = kernel->registersPerThread;
+  profillingData->kernel_info.static_shared_memory = kernel->staticSharedMemory;
+  profillingData->kernel_info.dynamic_shared_memory = kernel->dynamicSharedMemory;
+  profillingData->kernel_info.block_x = kernel->blockX;
+  profillingData->kernel_info.block_y = kernel->blockY;
+  profillingData->kernel_info.block_z = kernel->blockZ;
+  profillingData->kernel_info.grid_x = kernel->gridX;
+  profillingData->kernel_info.grid_y = kernel->gridY;
+  profillingData->kernel_info.grid_z = kernel->gridZ;
+}
+
+// Converts one CUPTI activity record into an Event and stores it. Record kinds other than
+// memcpy, memcpy2, memset and (concurrent) kernel are logged and skipped.
+void GPUProfiler::HandleActivityRecord(CUpti_Activity *record) {
+  PROFILER_ERROR_IF_NULLPTR(record);
+  Event profillingData;
+  profillingData.cb_id = 0;
+  switch (record->kind) {
+    case CUPTI_ACTIVITY_KIND_MEMCPY: {
+      HandleActivityMemcpyRecord(&profillingData, record);
+      break;
+    }
+    case CUPTI_ACTIVITY_KIND_MEMCPY2: {
+      HandleActivityMemcpy2Record(&profillingData, record);
+      break;
+    }
+    case CUPTI_ACTIVITY_KIND_MEMSET: {
+      HandleActivityMemsetRecord(&profillingData, record);
+      break;
+    }
+    case CUPTI_ACTIVITY_KIND_KERNEL:
+    case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
+      HandleActivityKernelRecord(&profillingData, record);
+      break;
+    }
+    default:
+      MS_LOG(WARNING) << "unknown activity type!";
+      return;
+  }
+
+  AddEvent(std::move(profillingData));
+}
+
+// Allocates a BUF_SIZE-byte, ALIGN_SIZE-aligned activity buffer for CUPTI. On allocation failure the
+// outputs are set to a null buffer of size 0.
+void CUPTIAPI GPUProfiler::AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
+  PROFILER_ERROR_IF_NULLPTR(buffer);
+  PROFILER_ERROR_IF_NULLPTR(size);
+  PROFILER_ERROR_IF_NULLPTR(maxNumRecords);
+  // A maxNumRecords of 0 is passed on both paths, as in the original success path.
+  *maxNumRecords = 0;
+  int stat = posix_memalign(reinterpret_cast<void **>(buffer), ALIGN_SIZE, BUF_SIZE);
+  if (stat) {
+    MS_LOG(ERROR) << "Out of memory, activity buffer alloc failed.";
+    // posix_memalign leaves *buffer unspecified on failure; report "no buffer" explicitly.
+    *buffer = nullptr;
+    *size = 0;
+    return;
+  }
+
+  *size = BUF_SIZE;
+}
+
+void CUPTIAPI GPUProfiler::ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size,
+                                         size_t validSize) {
+  if (!enable_flag_) {
+    free(buffer);
+    return;
+  }
+  CUptiResult status;
+  CUpti_Activity *record = NULL;
+
+  if (validSize > 0) 
{ + do { + status = CuptiActivityGetNextRecord(buffer, validSize, &record); + if (status == CUPTI_SUCCESS) { + HandleActivityRecord(record); + } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { + break; + } else { + CHECK_CUPTI_RET_WITH_ERROR(status, "CuptiActivityGetNextRecord"); + } + } while (1); + + // report any records dropped from the queue + size_t dropped; + CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped), + "CuptiActivityGetNumDroppedRecords"); + if (dropped != 0) { + MS_LOG(INFO) << "Dropped " << (unsigned int)dropped << " activity records\n"; + } + } + + free(buffer); +} + +REGISTER_PYBIND_DEFINE(GPUProfiler_, ([](const py::module *m) { + (void)py::class_>(*m, "GPUProfiler") + .def_static("get_instance", &GPUProfiler::GetInstance, "GPUProfiler get_instance.") + .def("init", &GPUProfiler::Init, py::arg("profile_data_path"), "init") + .def("stop", &GPUProfiler::Stop, "stop") + .def("step_profiling_enable", &GPUProfiler::StepProfilingEnable, py::arg("enable_flag"), + "enable or disable step profiling") + .def("sync_enable", &GPUProfiler::SyncEnable, py::arg("enable_flag"), + "enable or disable synchronization profiling"); + })); + +} // namespace gpu +} // namespace profiler +} // namespace mindspore diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h new file mode 100644 index 0000000000..ad21774d82 --- /dev/null +++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h @@ -0,0 +1,174 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_GPU_PROFILING_H +#define MINDSPORE_GPU_PROFILING_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace mindspore { +namespace profiler { +namespace gpu { +enum class CUPTIApiType { kCallback = 0, kActivity = 1 }; +enum class ActivityType { + kKernel = 0, + kMemcpyH2D = 1, + kMemcpyD2H = 2, + kMemcpyH2A = 3, + kMemcpyA2H = 4, + kMemcpyA2D = 5, + kMemcpyD2A = 6, + kMemcpyD2D = 7, + kMemcpyP2P = 8, + kMemcpyH2H = 9, + kMemset = 10, + kMemcpyUnknown = 11 +}; + +struct MemcpyInfo { + size_t bytes; + unsigned char src_kind; + unsigned char dst_kind; +}; + +struct KernelInfo { + uint64_t registers_per_thread; + uint64_t static_shared_memory; + uint64_t dynamic_shared_memory; + uint64_t block_x; + uint64_t block_y; + uint64_t block_z; + uint64_t grid_x; + uint64_t grid_y; + uint64_t grid_z; +}; + +struct Event { + std::string kernel_name; + std::string kernel_type; + CUPTIApiType api_type; + ActivityType activity_type; + uint64_t start_time_stamp; + uint64_t end_time_stamp; + std::string op_name; + uint32_t device_id; + uint32_t correlation_id; + uint32_t thread_id; + int64_t context_id; + uint32_t stream_id; + CUpti_CallbackId cb_id; + union { + MemcpyInfo memcpy_info; + KernelInfo kernel_info; + }; +}; + +struct OpInfo { + std::string op_name; + float cupti_api_call_time = 0l; + float cupti_activity_time = 0l; + float op_host_cost_time = 0; + int op_kernel_api_count = 0; + int op_kernel_count = 0; + int op_count = 0; + void *stream; + + MemcpyInfo memcpy_info = {0}; + 
KernelInfo kernel_info = {0}; +}; + +struct BaseTime { + // nanosecond + uint64_t host_start_time = 0l; + uint64_t gpu_start_time = 0l; +}; + +const float kTimeUnit = 1000; + +class GPUProfiler { + public: + static std::shared_ptr GetInstance(); + ~GPUProfiler() { StopCUPTI(); } + GPUProfiler(const GPUProfiler &) = delete; + GPUProfiler &operator=(const GPUProfiler &) = delete; + + void Init(const std::string &profileDataPath); + void Stop(); + void StopCUPTI(); + void StepProfilingEnable(const bool enable_flag); + void SyncEnable(const bool enable_flag); + bool GetEnableFlag() const { return enable_flag_; } + bool GetSyncEnableFlag() const { return sync_enable_flag_; } + void EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, const std::string &typestring, + uint64_t startTimestamp, uint64_t endTimestamp); + void CUPTIAPI AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords); + void CUPTIAPI ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize); + void OpDataProducerBegin(const std::string op_name, void *stream); + void OpDataProducerEnd(); + + private: + GPUProfiler() = default; + void OpsParser(); + void EventLog(const Event &event); + + void HandleActivityRecord(CUpti_Activity *record); + void AddEvent(Event &&event); + void SetRunTimeData(const std::string &op_name, void *stream); + void SetRunTimeData(const std::string &op_name, const float time_elapsed); + void FixOpNameByCorrelationId(Event *event); + + static std::shared_ptr profiler_inst_; + bool enable_flag_ = false; + bool sync_enable_flag_ = true; + std::unordered_map op_info_map_; + std::unordered_map op_name_map_; + std::vector events_; + BaseTime base_time_; + std::string op_name_; + void *stream_; + void SaveProfileData(); + std::mutex event_mutex_; + + std::vector activities_enable_; + + uint64_t cupti_callback_events_count_ = 0l; + uint64_t cupti_callback_events_drop_count_ = 0l; + uint64_t max_cupti_callback_events_ 
= 2 * 1024 * 10000; + + uint64_t cupti_activity_events_count_ = 0l; + uint64_t cupti_activity_events_drop_count_ = 0l; + uint64_t max_cupti_activity_events_ = 2 * 1024 * 10000; + + CUpti_SubscriberHandle subscriber_ = nullptr; + cudaEvent_t op_event_start_; + cudaEvent_t op_event_stop_; + uint64_t op_host_time_start_; + uint64_t op_host_time_stop_; + std::string profile_data_path_; +}; +} // namespace gpu +} // namespace profiler +} // namespace mindspore + +#endif // MINDSPORE_GPU_PROFILING_H diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index 676df13aed..d917679eda 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -31,6 +31,7 @@ #include "runtime/device/gpu/gpu_memory_copy_manager.h" #include "common/trans.h" #include "ir/dtype.h" +#include "profiler/device/gpu/gpu_profiling.h" #ifdef ENABLE_DEBUGGER #include "debug/debug_services.h" #endif @@ -672,6 +673,11 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De auto &kernels = graph->execution_order(); int exec_order = 1; + auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); + if (profiler_inst == nullptr) { + MS_LOG(ERROR) << "gpu profiler instance is nullptr"; + } + for (const auto &kernel : kernels) { auto kernel_mod = AnfAlgo::GetKernelMod(kernel); MS_EXCEPTION_IF_NULL(kernel_mod); @@ -690,8 +696,17 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De } if (!mock) { if (!profiling) { + if (profiler_inst->GetEnableFlag()) { + profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), stream_); + } CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_), "Launch kernel failed."); + if (profiler_inst->GetEnableFlag()) { + profiler_inst->OpDataProducerEnd(); + if (profiler_inst->GetSyncEnableFlag()) { + 
CHECK_OP_RET_WITH_ERROR(SyncStream(), "Profiler SyncStream failed."); + } + } } else { LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); } diff --git a/mindspore/core/utils/log_adapter.cc b/mindspore/core/utils/log_adapter.cc index 0c5528353b..070de0351d 100644 --- a/mindspore/core/utils/log_adapter.cc +++ b/mindspore/core/utils/log_adapter.cc @@ -180,6 +180,7 @@ static const char *GetSubModuleName(SubModuleId module_id) { "SESSION", // SM_SESSION "UTILS", // SM_UTILS "VM", // SM_VM + "PROFILER" // SM_PROFILER }; return sub_module_names[module_id % NUM_SUBMODUES]; diff --git a/mindspore/core/utils/log_adapter.h b/mindspore/core/utils/log_adapter.h index 3b760836fa..8e320e108f 100644 --- a/mindspore/core/utils/log_adapter.h +++ b/mindspore/core/utils/log_adapter.h @@ -123,6 +123,7 @@ enum SubModuleId : int { SM_SESSION, // session SM_UTILS, // utils SM_VM, // VM + SM_PROFILER, // profiler NUM_SUBMODUES // number of submodules };