!4235 add gpu profiler feature

Merge pull request !4235 from 治愈系潇洒哥/master
pull/4235/MERGE
mindspore-ci-bot 5 years ago committed by Gitee
commit 21014fd624

@ -30,10 +30,14 @@ if(ENABLE_GPU)
if(NOT CUDNN_PATH OR CUDNN_PATH STREQUAL "")
set(CUDNN_PATH ${CUDA_PATH})
endif()
if(NOT CUPTI_INCLUDE_DIRS OR CUPTI_INCLUDE_DIRS STREQUAL "")
set(CUPTI_INCLUDE_DIRS ${CUDA_PATH}/extras/CUPTI/include)
endif()
message("CUDA_PATH: ${CUDA_PATH}")
message("CUDA_INCLUDE_DIRS: ${CUDA_INCLUDE_DIRS}")
message("CUDNN_PATH: ${CUDNN_PATH}")
include_directories(${CUDNN_PATH} ${CUDA_PATH} ${CUDA_INCLUDE_DIRS})
message("CUPTI_INCLUDE_DIRS: ${CUPTI_INCLUDE_DIRS}")
include_directories(${CUDNN_PATH} ${CUDA_PATH} ${CUDA_INCLUDE_DIRS} ${CUPTI_INCLUDE_DIRS})
file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"runtime/device/gpu/*.cc"
@ -130,7 +134,7 @@ set(SUB_COMP
frontend/operator
pipeline/jit
pipeline/pynative
common debug pybind_api utils vm
common debug pybind_api utils vm profiler
)
foreach (_comp ${SUB_COMP})
@ -261,7 +265,7 @@ if (ENABLE_GPU)
${CUDNN_PATH}/lib64/libcudnn.so
${CUDA_PATH}/lib64/libcudart.so
${CUDA_PATH}/lib64/stubs/libcuda.so
${CUDA_PATH}/lib64/libcusolver.so)
${CUDA_PATH}/lib64/libcusolver.so)
if (ENABLE_MPI)
set_target_properties(_ms_mpi PROPERTIES INSTALL_RPATH ${ORIGIN_PATH})
endif()

@ -0,0 +1,5 @@
# Build the profiler component only for GPU-enabled builds.
if (ENABLE_GPU)
# Collect every .cc file under device/gpu/, with paths relative to this directory.
file(GLOB_RECURSE PROFILER_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "device/gpu/*.cc")
# Define SUBMODULE_ID for these sources so that they are attributed to the SM_PROFILER sub-module.
set_property(SOURCE ${PROFILER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PROFILER)
# Compile the sources into an object library named _mindspore_profiler_obj.
add_library(_mindspore_profiler_obj OBJECT ${PROFILER_SRC_LIST})
endif ()

@ -0,0 +1,134 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cupti.h>
#include <dlfcn.h>
#include "utils/log_adapter.h"
#include "profiler/device/gpu/cupti_interface.h"
namespace mindspore {
namespace profiler {
namespace gpu {
inline void *LoadLibrary(const char *name) {
auto handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
if (handle == nullptr) {
MS_LOG(EXCEPTION) << "Load lib " << name << " Please check whether configured the path of CUPTI to LD_LIBRARY_PATH";
}
return handle;
}
// Returns the handle of "libcupti.so". The library is opened by the first call; later calls reuse the
// handle stored in the function-local static.
inline void *GetCUPTIHandle() {
  static void *const cupti_lib = LoadLibrary("libcupti.so");
  return cupti_lib;
}
// Resolves the symbol `name` in the CUPTI library via dlsym.
//
// @param name  Exported symbol name, e.g. "cuptiSubscribe".
// @return      The non-null symbol address; callers cast it to the matching function-pointer type.
//
// A missing symbol is reported through MS_LOG(EXCEPTION) together with the dlerror() reason.
// NOTE(review): MS_LOG(EXCEPTION) is presumed to throw, so the trailing return is only reached on success.
inline void *GetCUPTIFunc(const char *name) {
  static void *handle = GetCUPTIHandle();
  (void)dlerror();  // Clear any stale error so the message below belongs to this lookup.
  void *func = dlsym(handle, name);
  if (func == nullptr) {
    // dlerror() may return null (the symbol itself resolved to null); never stream a null char pointer.
    const char *reason = dlerror();
    MS_LOG(EXCEPTION) << "Load func " << name << " failed: " << (reason != nullptr ? reason : "symbol is null")
                      << ", make sure the installed CUPTI library implements it!";
  }
  return func;
}
typedef CUptiResult (*CuptiSubscribeFunc)(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback,
void *userdata);
typedef CUptiResult (*CuptiEnableDomainFunc)(uint32_t enable, CUpti_SubscriberHandle subscriber,
CUpti_CallbackDomain domain);
typedef CUptiResult (*CuptiActivityEnableFunc)(CUpti_ActivityKind kind);
typedef CUptiResult (*CuptiActivityRegisterCallbacksFunc)(CUpti_BuffersCallbackRequestFunc funcBufferRequested,
CUpti_BuffersCallbackCompleteFunc funcBufferCompleted);
typedef CUptiResult (*CuptiUnsubscribeFunc)(CUpti_SubscriberHandle subscriber);
typedef CUptiResult (*CuptiActivityFlushAllFunc)(uint32_t flag);
typedef CUptiResult (*CuptiActivityDisableFunc)(CUpti_ActivityKind kind);
typedef CUptiResult (*CuptiActivityGetNextRecordFunc)(uint8_t *buffer, size_t validBufferSizeBytes,
CUpti_Activity **record);
typedef CUptiResult (*CuptiActivityGetNumDroppedRecordsFunc)(CUcontext context, uint32_t streamId, size_t *dropped);
typedef CUptiResult (*CuptiGetTimestampFunc)(uint64_t *timestamp);
typedef CUptiResult (*CuptiGetResultStringFunc)(CUptiResult result, const char **str);
typedef CUptiResult (*CuptiGetStreamIdFunc)(CUcontext context, CUstream stream, uint32_t *streamId);
typedef CUptiResult (*CuptiGetDeviceIdFunc)(CUcontext context, uint32_t *deviceId);
// Forwards to the CUPTI symbol "cuptiSubscribe"; the symbol is resolved on the first call and then reused.
CUptiResult CuptiSubscribe(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback, void *userdata) {
  static const CuptiSubscribeFunc subscribe_fn =
    reinterpret_cast<CuptiSubscribeFunc>(GetCUPTIFunc("cuptiSubscribe"));
  return subscribe_fn(subscriber, callback, userdata);
}
// Forwards to the CUPTI symbol "cuptiEnableDomain"; the symbol is resolved on the first call and then reused.
CUptiResult CuptiEnableDomain(uint32_t enable, CUpti_SubscriberHandle subscriber, CUpti_CallbackDomain domain) {
  static const CuptiEnableDomainFunc enable_domain_fn =
    reinterpret_cast<CuptiEnableDomainFunc>(GetCUPTIFunc("cuptiEnableDomain"));
  return enable_domain_fn(enable, subscriber, domain);
}
// Forwards to the CUPTI symbol "cuptiActivityEnable"; the symbol is resolved on the first call and then reused.
CUptiResult CuptiActivityEnable(CUpti_ActivityKind kind) {
  static const CuptiActivityEnableFunc activity_enable_fn =
    reinterpret_cast<CuptiActivityEnableFunc>(GetCUPTIFunc("cuptiActivityEnable"));
  return activity_enable_fn(kind);
}
// Forwards to the CUPTI symbol "cuptiActivityRegisterCallbacks"; the symbol is resolved on the first call
// and then reused.
CUptiResult CuptiActivityRegisterCallbacks(CUpti_BuffersCallbackRequestFunc funcBufferRequested,
                                           CUpti_BuffersCallbackCompleteFunc funcBufferCompleted) {
  static const CuptiActivityRegisterCallbacksFunc register_callbacks_fn =
    reinterpret_cast<CuptiActivityRegisterCallbacksFunc>(GetCUPTIFunc("cuptiActivityRegisterCallbacks"));
  return register_callbacks_fn(funcBufferRequested, funcBufferCompleted);
}
// Forwards to the CUPTI symbol "cuptiUnsubscribe"; the symbol is resolved on the first call and then reused.
CUptiResult CuptiUnsubscribe(CUpti_SubscriberHandle subscriber) {
  static const CuptiUnsubscribeFunc unsubscribe_fn =
    reinterpret_cast<CuptiUnsubscribeFunc>(GetCUPTIFunc("cuptiUnsubscribe"));
  return unsubscribe_fn(subscriber);
}
// Forwards to the CUPTI symbol "cuptiActivityFlushAll"; the symbol is resolved on the first call and then reused.
CUptiResult CuptiActivityFlushAll(uint32_t flag) {
  static const CuptiActivityFlushAllFunc flush_all_fn =
    reinterpret_cast<CuptiActivityFlushAllFunc>(GetCUPTIFunc("cuptiActivityFlushAll"));
  return flush_all_fn(flag);
}
// Forwards to the CUPTI symbol "cuptiActivityDisable"; the symbol is resolved on the first call and then reused.
CUptiResult CuptiActivityDisable(CUpti_ActivityKind kind) {
  static const CuptiActivityDisableFunc activity_disable_fn =
    reinterpret_cast<CuptiActivityDisableFunc>(GetCUPTIFunc("cuptiActivityDisable"));
  return activity_disable_fn(kind);
}
// Forwards to the CUPTI symbol "cuptiActivityGetNextRecord"; the symbol is resolved on the first call and
// then reused.
CUptiResult CuptiActivityGetNextRecord(uint8_t *buffer, size_t validBufferSizeBytes, CUpti_Activity **record) {
  static const CuptiActivityGetNextRecordFunc next_record_fn =
    reinterpret_cast<CuptiActivityGetNextRecordFunc>(GetCUPTIFunc("cuptiActivityGetNextRecord"));
  return next_record_fn(buffer, validBufferSizeBytes, record);
}
// Forwards to the CUPTI symbol "cuptiActivityGetNumDroppedRecords"; the symbol is resolved on the first call
// and then reused.
CUptiResult CuptiActivityGetNumDroppedRecords(CUcontext context, uint32_t streamId, size_t *dropped) {
  static const CuptiActivityGetNumDroppedRecordsFunc dropped_records_fn =
    reinterpret_cast<CuptiActivityGetNumDroppedRecordsFunc>(GetCUPTIFunc("cuptiActivityGetNumDroppedRecords"));
  return dropped_records_fn(context, streamId, dropped);
}
// Forwards to the CUPTI symbol "cuptiGetTimestamp"; the symbol is resolved on the first call and then reused.
CUptiResult CuptiGetTimestamp(uint64_t *timestamp) {
  static const CuptiGetTimestampFunc get_timestamp_fn =
    reinterpret_cast<CuptiGetTimestampFunc>(GetCUPTIFunc("cuptiGetTimestamp"));
  return get_timestamp_fn(timestamp);
}
// Forwards to the CUPTI symbol "cuptiGetResultString"; the symbol is resolved on the first call and then reused.
CUptiResult CuptiGetResultString(CUptiResult result, const char **str) {
  static const CuptiGetResultStringFunc result_string_fn =
    reinterpret_cast<CuptiGetResultStringFunc>(GetCUPTIFunc("cuptiGetResultString"));
  return result_string_fn(result, str);
}
// Forwards to the CUPTI symbol "cuptiGetStreamId"; the symbol is resolved on the first call and then reused.
CUptiResult CuptiGetStreamId(CUcontext context, CUstream stream, uint32_t *streamId) {
  static const CuptiGetStreamIdFunc get_stream_id_fn =
    reinterpret_cast<CuptiGetStreamIdFunc>(GetCUPTIFunc("cuptiGetStreamId"));
  return get_stream_id_fn(context, stream, streamId);
}
// Forwards to the CUPTI symbol "cuptiGetDeviceId"; the symbol is resolved on the first call and then reused.
//
// @param context   CUDA context to query.
// @param deviceId  Output pointer handed through to CUPTI.
// @return          The CUptiResult returned by CUPTI.
//
// Fix: the lookup previously requested "cuptiSubscribe", so this wrapper invoked the subscribe entry point
// through a pointer of the wrong signature instead of querying the device id.
CUptiResult CuptiGetDeviceId(CUcontext context, uint32_t *deviceId) {
  static auto func_ptr = reinterpret_cast<CuptiGetDeviceIdFunc>(GetCUPTIFunc("cuptiGetDeviceId"));
  return func_ptr(context, deviceId);
}
} // namespace gpu
} // namespace profiler
} // namespace mindspore

@ -0,0 +1,44 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CUPTI_INTERFACE_H
#define MINDSPORE_CUPTI_INTERFACE_H
#ifndef FUNC_EXPORT
#define FUNC_EXPORT __attribute__((visibility("default")))
#endif
namespace mindspore {
namespace profiler {
namespace gpu {
// Wrappers around the CUPTI C API. Each wrapper takes the same parameters as the declaration below and
// returns the CUptiResult produced by the underlying CUPTI call.
// NOTE(review): this header uses CUPTI/CUDA types (CUptiResult, CUcontext, ...) but includes no header that
// declares them; includers apparently must include <cupti.h> first — confirm or add the include.

// Callback API: subscribe a callback and enable a callback domain for the subscriber.
CUptiResult CuptiSubscribe(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback, void *userdata);
CUptiResult CuptiEnableDomain(uint32_t enable, CUpti_SubscriberHandle subscriber, CUpti_CallbackDomain domain);
// Queries for the stream id and device id that belong to a CUDA context.
CUptiResult CuptiGetStreamId(CUcontext context, CUstream stream, uint32_t *streamId);
CUptiResult CuptiGetDeviceId(CUcontext context, uint32_t *deviceId);
// Activity API: enable/disable activity kinds, register buffer callbacks, flush and read records.
CUptiResult CuptiActivityEnable(CUpti_ActivityKind kind);
CUptiResult CuptiActivityRegisterCallbacks(CUpti_BuffersCallbackRequestFunc funcBufferRequested,
CUpti_BuffersCallbackCompleteFunc funcBufferCompleted);
CUptiResult CuptiUnsubscribe(CUpti_SubscriberHandle subscriber);
CUptiResult CuptiActivityFlushAll(uint32_t flag);
CUptiResult CuptiActivityDisable(CUpti_ActivityKind kind);
CUptiResult CuptiActivityGetNextRecord(uint8_t *buffer, size_t validBufferSizeBytes, CUpti_Activity **record);
CUptiResult CuptiActivityGetNumDroppedRecords(CUcontext context, uint32_t streamId, size_t *dropped);
// Miscellaneous: current CUPTI timestamp and a printable string for a CUptiResult.
CUptiResult CuptiGetTimestamp(uint64_t *timestamp);
CUptiResult CuptiGetResultString(CUptiResult result, const char **str);
} // namespace gpu
} // namespace profiler
} // namespace mindspore
#endif // MINDSPORE_CUPTI_INTERFACE_H

File diff suppressed because it is too large Load Diff

@ -0,0 +1,174 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_GPU_PROFILING_H
#define MINDSPORE_GPU_PROFILING_H
#include <cuda.h>
#include <cupti.h>
#include <cstdio>
#include <unordered_map>
#include <string>
#include <vector>
#include <mutex>
#include <memory>
#include <algorithm>
#include <utility>
namespace mindspore {
namespace profiler {
namespace gpu {
// Tags an Event with the CUPTI API family it is associated with.
enum class CUPTIApiType {
  kCallback,  // value 0
  kActivity   // value 1
};
// Category of a profiled activity. Enumerators are numbered consecutively from 0, so the numeric values are
// identical to an explicitly numbered list (kKernel == 0 ... kMemcpyUnknown == 11).
enum class ActivityType {
  kKernel,        // 0
  kMemcpyH2D,     // 1
  kMemcpyD2H,     // 2
  kMemcpyH2A,     // 3
  kMemcpyA2H,     // 4
  kMemcpyA2D,     // 5
  kMemcpyD2A,     // 6
  kMemcpyD2D,     // 7
  kMemcpyP2P,     // 8
  kMemcpyH2H,     // 9
  kMemset,        // 10
  kMemcpyUnknown  // 11
};
// Plain data describing a memory copy. Kept free of member initializers: the type is used as a member of
// the anonymous union inside Event.
struct MemcpyInfo {
// Size of the copy in bytes.
size_t bytes;
// Source / destination kind codes.
// NOTE(review): presumably CUPTI memcpy-kind values — confirm against gpu_profiling.cc.
unsigned char src_kind;
unsigned char dst_kind;
};
// Plain data describing a kernel launch. Kept free of member initializers: the type is used as a member of
// the anonymous union inside Event.
struct KernelInfo {
uint64_t registers_per_thread;
// NOTE(review): shared-memory figures are presumably in bytes — confirm against the producer.
uint64_t static_shared_memory;
uint64_t dynamic_shared_memory;
// Block dimensions.
uint64_t block_x;
uint64_t block_y;
uint64_t block_z;
// Grid dimensions.
uint64_t grid_x;
uint64_t grid_y;
uint64_t grid_z;
};
// One profiling record. Carries identification (names, ids), a start/end timestamp pair and a
// kind-specific payload.
struct Event {
std::string kernel_name;
std::string kernel_type;
// API family and activity category of this record.
CUPTIApiType api_type;
ActivityType activity_type;
// NOTE(review): timestamps are presumably in nanoseconds (BaseTime is documented as nanosecond) — confirm.
uint64_t start_time_stamp;
uint64_t end_time_stamp;
std::string op_name;
uint32_t device_id;
uint32_t correlation_id;
uint32_t thread_id;
int64_t context_id;
uint32_t stream_id;
CUpti_CallbackId cb_id;
// Payload: the two members share storage, so only one of them is meaningful for a given record.
// NOTE(review): which one is valid presumably follows activity_type — confirm against the producer.
union {
MemcpyInfo memcpy_info;
KernelInfo kernel_info;
};
};
// Per-operator profiling record. Every member has an initializer, so a default-constructed OpInfo holds no
// indeterminate values.
// NOTE(review): time units and the exact meaning of the counters are defined by the code that fills this
// struct (gpu_profiling.cc, not visible here) — confirm there.
struct OpInfo {
  std::string op_name;
  // Accumulated times; float literals replace the former `0l` (long) initializers.
  float cupti_api_call_time = 0.0f;
  float cupti_activity_time = 0.0f;
  float op_host_cost_time = 0.0f;
  // Counters.
  int op_kernel_api_count = 0;
  int op_kernel_count = 0;
  int op_count = 0;
  // Stream handle associated with the operator; previously left uninitialized.
  void *stream = nullptr;
  // Zero-initialized payloads.
  MemcpyInfo memcpy_info = {};
  KernelInfo kernel_info = {};
};
// Pair of reference start times, one for the host and one for the GPU. Both default to zero.
struct BaseTime {
  // Unit: nanosecond.
  uint64_t host_start_time{0};
  uint64_t gpu_start_time{0};
};
// Scale factor of 1000 used for time-unit conversion.
constexpr float kTimeUnit = 1000.0f;
// GPU profiler built on CUPTI. The constructor is private and copying is deleted; instances are obtained
// through GetInstance(). The destructor calls StopCUPTI().
//
// All data members now carry an initializer: stream_, op_event_start_, op_event_stop_, op_host_time_start_
// and op_host_time_stop_ were previously left indeterminate after construction.
class GPUProfiler {
 public:
  // Returns the shared profiler instance.
  static std::shared_ptr<GPUProfiler> GetInstance();
  ~GPUProfiler() { StopCUPTI(); }
  GPUProfiler(const GPUProfiler &) = delete;
  GPUProfiler &operator=(const GPUProfiler &) = delete;

  // Lifecycle control.
  void Init(const std::string &profileDataPath);
  void Stop();
  void StopCUPTI();

  // Runtime switches and their accessors.
  void StepProfilingEnable(const bool enable_flag);
  void SyncEnable(const bool enable_flag);
  bool GetEnableFlag() const { return enable_flag_; }
  bool GetSyncEnableFlag() const { return sync_enable_flag_; }

  // Entry points fed with CUPTI callback data and activity buffers.
  void EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, const std::string &typestring,
                          uint64_t startTimestamp, uint64_t endTimestamp);
  void CUPTIAPI AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords);
  void CUPTIAPI ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize);

  // Bracket the launch of one operator: Begin is called before the launch, End right after it.
  void OpDataProducerBegin(const std::string op_name, void *stream);
  void OpDataProducerEnd();

 private:
  GPUProfiler() = default;
  void OpsParser();
  void EventLog(const Event &event);
  void HandleActivityRecord(CUpti_Activity *record);
  void AddEvent(Event &&event);
  void SetRunTimeData(const std::string &op_name, void *stream);
  void SetRunTimeData(const std::string &op_name, const float time_elapsed);
  void FixOpNameByCorrelationId(Event *event);

  static std::shared_ptr<GPUProfiler> profiler_inst_;
  bool enable_flag_ = false;
  bool sync_enable_flag_ = true;
  std::unordered_map<std::string, OpInfo> op_info_map_;
  std::unordered_map<uint32_t, std::string> op_name_map_;
  std::vector<Event> events_;
  BaseTime base_time_;
  std::string op_name_;
  void *stream_ = nullptr;
  void SaveProfileData();
  std::mutex event_mutex_;
  std::vector<CUpti_ActivityKind> activities_enable_;

  // Event counters and their upper limits (2 * 1024 * 10000 = 20,480,000 events each).
  uint64_t cupti_callback_events_count_ = 0;
  uint64_t cupti_callback_events_drop_count_ = 0;
  uint64_t max_cupti_callback_events_ = 2 * 1024 * 10000;
  uint64_t cupti_activity_events_count_ = 0;
  uint64_t cupti_activity_events_drop_count_ = 0;
  uint64_t max_cupti_activity_events_ = 2 * 1024 * 10000;

  CUpti_SubscriberHandle subscriber_ = nullptr;
  // CUDA event handles and host timestamps of the operator currently being measured.
  cudaEvent_t op_event_start_ = nullptr;
  cudaEvent_t op_event_stop_ = nullptr;
  uint64_t op_host_time_start_ = 0;
  uint64_t op_host_time_stop_ = 0;
  std::string profile_data_path_;
};
} // namespace gpu
} // namespace profiler
} // namespace mindspore
#endif // MINDSPORE_GPU_PROFILING_H

@ -31,6 +31,7 @@
#include "runtime/device/gpu/gpu_memory_copy_manager.h"
#include "common/trans.h"
#include "ir/dtype.h"
#include "profiler/device/gpu/gpu_profiling.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#endif
@ -670,6 +671,11 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
auto &kernels = graph->execution_order();
int exec_order = 1;
auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
if (profiler_inst == nullptr) {
MS_LOG(ERROR) << "gpu profiler instance is nullptr";
}
for (const auto &kernel : kernels) {
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
MS_EXCEPTION_IF_NULL(kernel_mod);
@ -688,8 +694,17 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
}
if (!mock) {
if (!profiling) {
if (profiler_inst->GetEnableFlag()) {
profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), stream_);
}
CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_),
"Launch kernel failed.");
if (profiler_inst->GetEnableFlag()) {
profiler_inst->OpDataProducerEnd();
if (profiler_inst->GetSyncEnableFlag()) {
CHECK_OP_RET_WITH_ERROR(SyncStream(), "Profiler SyncStream failed.");
}
}
} else {
LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs);
}

@ -180,6 +180,7 @@ static const char *GetSubModuleName(SubModuleId module_id) {
"SESSION", // SM_SESSION
"UTILS", // SM_UTILS
"VM", // SM_VM
"PROFILER" // SM_PROFILER
};
return sub_module_names[module_id % NUM_SUBMODUES];

@ -123,6 +123,7 @@ enum SubModuleId : int {
SM_SESSION, // session
SM_UTILS, // utils
SM_VM, // VM
SM_PROFILER, // profiler
NUM_SUBMODUES // number of submodules
};

Loading…
Cancel
Save