|
|
|
@ -26,8 +26,14 @@ limitations under the License. */
|
|
|
|
|
namespace paddle {
|
|
|
|
|
namespace platform {
|
|
|
|
|
namespace {
|
|
|
|
|
// Current thread's id. Note, we don't distinguish nested threads
|
|
|
|
|
// for now.
|
|
|
|
|
thread_local int cur_thread_id = 0;
|
|
|
|
|
// Tracking the nested block stacks of each thread.
|
|
|
|
|
thread_local std::deque<int> block_id_stack;
|
|
|
|
|
// Tracking the nested event stacks.
|
|
|
|
|
thread_local std::deque<std::string> annotation_stack;
|
|
|
|
|
|
|
|
|
|
thread_local const char *cur_annotation = nullptr;
|
|
|
|
|
std::once_flag tracer_once_flag;
|
|
|
|
|
DeviceTracer *tracer = nullptr;
|
|
|
|
|
} // namespace
|
|
|
|
@ -191,19 +197,19 @@ class DeviceTracerImpl : public DeviceTracer {
|
|
|
|
|
correlations_[id] = anno;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
|
|
|
|
|
if (!anno) {
|
|
|
|
|
// TODO(panyx0718): Currently, it doesn't support nested situation
|
|
|
|
|
// Up-level can be cleared by low-level and therefore get nullptr
|
|
|
|
|
// here.
|
|
|
|
|
void AddCPURecords(const std::string &anno, uint64_t start_ns,
|
|
|
|
|
uint64_t end_ns, int64_t device_id, int64_t thread_id) {
|
|
|
|
|
if (anno.empty()) {
|
|
|
|
|
VLOG(1) << "Empty timeline annotation.";
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
std::lock_guard<std::mutex> l(trace_mu_);
|
|
|
|
|
cpu_records_.push_back(CPURecord{anno, start_ns, end_ns, 0});
|
|
|
|
|
cpu_records_.push_back(
|
|
|
|
|
CPURecord{anno, start_ns, end_ns, device_id, thread_id});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void AddMemRecords(const std::string &name, uint64_t start_ns,
|
|
|
|
|
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
|
|
|
|
|
uint64_t end_ns, int64_t device_id, int64_t stream_id,
|
|
|
|
|
uint32_t correlation_id, uint64_t bytes) {
|
|
|
|
|
// 0 means timestamp information could not be collected for the kernel.
|
|
|
|
|
if (start_ns == 0 || end_ns == 0) {
|
|
|
|
@ -215,8 +221,8 @@ class DeviceTracerImpl : public DeviceTracer {
|
|
|
|
|
stream_id, correlation_id, bytes});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
|
|
|
|
|
uint32_t stream_id, uint32_t correlation_id) {
|
|
|
|
|
void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
|
|
|
|
|
int64_t stream_id, uint32_t correlation_id) {
|
|
|
|
|
// 0 means timestamp information could not be collected for the kernel.
|
|
|
|
|
if (start == 0 || end == 0) {
|
|
|
|
|
VLOG(3) << correlation_id << " cannot be traced";
|
|
|
|
@ -270,27 +276,30 @@ class DeviceTracerImpl : public DeviceTracer {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
auto *event = profile_pb.add_events();
|
|
|
|
|
event->set_type(proto::Event::GPUKernel);
|
|
|
|
|
event->set_name(correlations_.at(r.correlation_id));
|
|
|
|
|
event->set_start_ns(r.start_ns);
|
|
|
|
|
event->set_end_ns(r.end_ns);
|
|
|
|
|
event->set_stream_id(r.stream_id);
|
|
|
|
|
event->set_sub_device_id(r.stream_id);
|
|
|
|
|
event->set_device_id(r.device_id);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (const CPURecord &r : cpu_records_) {
|
|
|
|
|
auto *event = profile_pb.add_events();
|
|
|
|
|
event->set_type(proto::Event::CPU);
|
|
|
|
|
event->set_name(r.name);
|
|
|
|
|
event->set_start_ns(r.start_ns);
|
|
|
|
|
event->set_end_ns(r.end_ns);
|
|
|
|
|
event->set_stream_id(r.thread_id);
|
|
|
|
|
event->set_device_id(-1);
|
|
|
|
|
event->set_sub_device_id(r.thread_id);
|
|
|
|
|
event->set_device_id(r.device_id);
|
|
|
|
|
}
|
|
|
|
|
for (const MemRecord &r : mem_records_) {
|
|
|
|
|
auto *event = profile_pb.add_events();
|
|
|
|
|
event->set_type(proto::Event::GPUKernel);
|
|
|
|
|
event->set_name(r.name);
|
|
|
|
|
event->set_start_ns(r.start_ns);
|
|
|
|
|
event->set_end_ns(r.end_ns);
|
|
|
|
|
event->set_stream_id(r.stream_id);
|
|
|
|
|
event->set_sub_device_id(r.stream_id);
|
|
|
|
|
event->set_device_id(r.device_id);
|
|
|
|
|
event->mutable_memcopy()->set_bytes(r.bytes);
|
|
|
|
|
}
|
|
|
|
@ -323,8 +332,9 @@ class DeviceTracerImpl : public DeviceTracer {
|
|
|
|
|
if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
|
|
|
|
|
(cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
|
|
|
|
|
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
|
|
|
|
|
const std::string anno =
|
|
|
|
|
cur_annotation ? cur_annotation : cbInfo->symbolName;
|
|
|
|
|
const std::string anno = !annotation_stack.empty()
|
|
|
|
|
? annotation_stack.back()
|
|
|
|
|
: cbInfo->symbolName;
|
|
|
|
|
tracer->AddAnnotation(cbInfo->correlationId, anno);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
@ -351,14 +361,15 @@ class DeviceTracerDummy : public DeviceTracer {
|
|
|
|
|
|
|
|
|
|
void AddAnnotation(uint64_t id, const std::string &anno) {}
|
|
|
|
|
|
|
|
|
|
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
|
|
|
|
|
void AddCPURecords(const std::string &anno, uint64_t start_ns,
|
|
|
|
|
uint64_t end_ns, int64_t device_id, int64_t thread_id) {}
|
|
|
|
|
|
|
|
|
|
void AddMemRecords(const std::string &name, uint64_t start_ns,
|
|
|
|
|
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
|
|
|
|
|
uint64_t end_ns, int64_t device_id, int64_t stream_id,
|
|
|
|
|
uint32_t correlation_id, uint64_t bytes) {}
|
|
|
|
|
|
|
|
|
|
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
|
|
|
|
|
uint32_t stream_id, uint32_t correlation_id) {}
|
|
|
|
|
void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
|
|
|
|
|
int64_t stream_id, uint32_t correlation_id) {}
|
|
|
|
|
|
|
|
|
|
bool IsEnabled() { return false; }
|
|
|
|
|
|
|
|
|
@ -384,11 +395,28 @@ DeviceTracer *GetDeviceTracer() {
|
|
|
|
|
return tracer;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
|
|
|
|
|
void SetCurAnnotation(const std::string &anno) {
|
|
|
|
|
annotation_stack.push_back(anno);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ClearCurAnnotation() { annotation_stack.pop_back(); }
|
|
|
|
|
|
|
|
|
|
std::string CurAnnotation() {
|
|
|
|
|
if (annotation_stack.empty()) return "";
|
|
|
|
|
return annotation_stack.back();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
|
|
|
|
|
|
|
|
|
|
void ClearCurBlock() { block_id_stack.pop_back(); }
|
|
|
|
|
|
|
|
|
|
int BlockDepth() { return block_id_stack.size(); }
|
|
|
|
|
|
|
|
|
|
void SetCurThread(int thread_id) { cur_thread_id = thread_id; }
|
|
|
|
|
|
|
|
|
|
void ClearCurAnnotation() { cur_annotation = nullptr; }
|
|
|
|
|
void ClearCurThread() { cur_thread_id = 0; }
|
|
|
|
|
|
|
|
|
|
const char *CurAnnotation() { return cur_annotation; }
|
|
|
|
|
int CurThread() { return cur_thread_id; }
|
|
|
|
|
|
|
|
|
|
} // namespace platform
|
|
|
|
|
} // namespace paddle
|
|
|
|
|