|
|
|
@ -55,6 +55,36 @@ uint64_t kAlignSize = 8;
|
|
|
|
|
} \
|
|
|
|
|
} while (0)
|
|
|
|
|
|
|
|
|
|
// Maps a CUPTI memcpy kind to the event name stored for a memcpy record.
// Every kind listed below gets its own "MEMCPY_<kind>" label; any other
// value yields the generic "MEMCPY".
std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
  // Start from the generic label so unlisted kinds need no special handling.
  const char *label = "MEMCPY";
  switch (kind) {
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
      label = "MEMCPY_HtoD";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
      label = "MEMCPY_DtoH";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
      label = "MEMCPY_HtoA";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
      label = "MEMCPY_AtoH";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
      label = "MEMCPY_AtoA";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
      label = "MEMCPY_AtoD";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
      label = "MEMCPY_DtoA";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
      label = "MEMCPY_DtoD";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
      label = "MEMCPY_HtoH";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
      label = "MEMCPY_PtoP";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
      label = "MEMCPY_FORCE_INT";
      break;
    default:
      // Keep the generic label chosen above.
      break;
  }
  return label;
}
|
|
|
|
|
|
|
|
|
|
void EnableActivity() {
|
|
|
|
|
// Device activity record is created when CUDA initializes, so we
|
|
|
|
|
// want to enable it before cuInit() or any CUDA runtime call.
|
|
|
|
@ -111,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
|
|
|
|
|
kernel->correlationId);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case CUPTI_ACTIVITY_KIND_MEMCPY: {
|
|
|
|
|
auto *memcpy =
|
|
|
|
|
reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
|
|
|
|
|
tracer->AddMemRecords(
|
|
|
|
|
MemcpyKind(
|
|
|
|
|
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
|
|
|
|
|
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
|
|
|
|
|
memcpy->correlationId, memcpy->bytes);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case CUPTI_ACTIVITY_KIND_MEMCPY2: {
|
|
|
|
|
auto *memcpy =
|
|
|
|
|
reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
|
|
|
|
|
tracer->AddMemRecords(
|
|
|
|
|
MemcpyKind(
|
|
|
|
|
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
|
|
|
|
|
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
|
|
|
|
|
memcpy->correlationId, memcpy->bytes);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
default: { break; }
|
|
|
|
|
}
|
|
|
|
|
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
|
|
|
|
@ -148,6 +198,13 @@ class DeviceTracerImpl : public DeviceTracer {
|
|
|
|
|
std::hash<std::thread::id>{}(std::this_thread::get_id())});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void AddMemRecords(const std::string &name, uint64_t start_ns,
|
|
|
|
|
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
|
|
|
|
|
uint32_t correlation_id, uint64_t bytes) {
|
|
|
|
|
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
|
|
|
|
|
stream_id, correlation_id, bytes});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
|
|
|
|
|
uint32_t stream_id, uint32_t correlation_id) {
|
|
|
|
|
std::lock_guard<std::mutex> l(trace_mu_);
|
|
|
|
@ -183,7 +240,6 @@ class DeviceTracerImpl : public DeviceTracer {
|
|
|
|
|
CUPTI_CALL(
|
|
|
|
|
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
|
|
|
|
|
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
|
|
|
|
|
|
|
|
|
|
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
|
|
|
|
|
enabled_ = true;
|
|
|
|
|
}
|
|
|
|
@ -214,6 +270,15 @@ class DeviceTracerImpl : public DeviceTracer {
|
|
|
|
|
event->set_stream_id(r.thread_id);
|
|
|
|
|
event->set_device_id(-1);
|
|
|
|
|
}
|
|
|
|
|
for (const MemRecord &r : mem_records_) {
|
|
|
|
|
auto *event = profile_pb.add_events();
|
|
|
|
|
event->set_name(r.name);
|
|
|
|
|
event->set_start_ns(r.start_ns);
|
|
|
|
|
event->set_end_ns(r.end_ns);
|
|
|
|
|
event->set_stream_id(r.stream_id);
|
|
|
|
|
event->set_device_id(r.device_id);
|
|
|
|
|
event->mutable_memcopy()->set_bytes(r.bytes);
|
|
|
|
|
}
|
|
|
|
|
std::string profile_str;
|
|
|
|
|
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
|
|
|
|
|
std::ofstream profile_f;
|
|
|
|
@ -257,6 +322,7 @@ class DeviceTracerImpl : public DeviceTracer {
|
|
|
|
|
uint64_t start_ns_;
|
|
|
|
|
uint64_t end_ns_;
|
|
|
|
|
std::vector<KernelRecord> kernel_records_;
|
|
|
|
|
std::vector<MemRecord> mem_records_;
|
|
|
|
|
std::vector<CPURecord> cpu_records_;
|
|
|
|
|
std::unordered_map<uint32_t, std::string> correlations_;
|
|
|
|
|
CUpti_SubscriberHandle subscriber_;
|
|
|
|
@ -272,6 +338,10 @@ class DeviceTracerDummy : public DeviceTracer {
|
|
|
|
|
|
|
|
|
|
// Dummy tracer: CPU annotations are accepted and ignored.
void AddCPURecords(const char *anno,
                   uint64_t start_ns,
                   uint64_t end_ns) {
  // Intentionally empty.
}
|
|
|
|
|
|
|
|
|
|
// Dummy tracer: memory-copy records are accepted and ignored.
void AddMemRecords(const std::string &name,
                   uint64_t start_ns,
                   uint64_t end_ns,
                   uint32_t device_id,
                   uint32_t stream_id,
                   uint32_t correlation_id,
                   uint64_t bytes) {
  // Intentionally empty.
}
|
|
|
|
|
|
|
|
|
|
// Dummy tracer: kernel records are accepted and ignored.
void AddKernelRecords(uint64_t start,
                      uint64_t end,
                      uint32_t device_id,
                      uint32_t stream_id,
                      uint32_t correlation_id) {
  // Intentionally empty.
}
|
|
|
|
|
|
|
|
|
|