You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
724 lines
27 KiB
724 lines
27 KiB
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License. */
|
|
|
|
#include <deque>
|
|
#include <forward_list>
|
|
#include <fstream>
|
|
#include <list>
|
|
#include <map>
|
|
#include <mutex> // NOLINT
|
|
#include <numeric>
|
|
#include <sstream>
|
|
#include <string>
|
|
#include <thread> // NOLINT
|
|
#include <unordered_map>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "glog/logging.h"
|
|
#include "google/protobuf/text_format.h"
|
|
#include "paddle/fluid/framework/block_desc.h"
|
|
#include "paddle/fluid/platform/device_tracer.h"
|
|
#include "paddle/fluid/platform/profiler.h"
|
|
#include "paddle/fluid/string/printf.h"
|
|
|
|
namespace paddle {
|
|
namespace platform {
|
|
namespace {
|
|
// Tracking the nested block stacks of each thread.
|
|
thread_local std::deque<int> block_id_stack;
|
|
// Tracking the nested event stacks.
|
|
thread_local std::deque<Event *> annotation_stack;
|
|
|
|
std::map<uint32_t, int32_t> system_thread_id_map;
|
|
|
|
std::once_flag tracer_once_flag;
|
|
DeviceTracer *tracer = nullptr;
|
|
|
|
// Logs, on the first call only, a warning that suggests raising
// FLAGS_multiple_of_cupti_buffer_size. Later calls do nothing.
void PrintCuptiHint() {
  static bool already_shown = false;
  if (!already_shown) {
    already_shown = true;
    LOG(WARNING) << "Invalid timestamp occured. Please try increasing the "
                    "FLAGS_multiple_of_cupti_buffer_size.";
  }
}
|
|
|
|
} // namespace
|
|
#ifdef PADDLE_WITH_CUPTI
|
|
|
|
namespace {
|
|
// The experimental best performance is
|
|
// the same size with CUPTI device buffer size(8M)
|
|
uint64_t kBufSize = 1024 * 1024 * 8;
|
|
uint64_t kAlignSize = 8;
|
|
std::unordered_map<CUpti_CallbackId, std::string> runtime_cbid_str,
|
|
driver_cbid_str;
|
|
|
|
// Yields the pointer `buffer` rounded up to the next multiple of `align`, or
// `buffer` itself when it is already aligned. The mask arithmetic only works
// for power-of-two `align`; both arguments are evaluated more than once.
#define ALIGN_BUFFER(buffer, align)                 \
  ((((uintptr_t)(buffer) & ((align)-1)) == 0)       \
       ? (buffer)                                   \
       : ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))))
|
|
|
|
// Evaluates the CUPTI expression `call` once. When the result is not
// CUPTI_SUCCESS, prints the failing expression together with the CUPTI error
// string to stderr and terminates the process with exit(-1).
#define CUPTI_CALL(call)                                                   \
  do {                                                                     \
    CUptiResult _status = call;                                            \
    if (_status != CUPTI_SUCCESS) {                                        \
      /* Stays null if the lookup below does not provide a string. */      \
      const char *errstr = nullptr;                                        \
      dynload::cuptiGetResultString(_status, &errstr);                     \
      fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
              __FILE__, __LINE__, #call,                                   \
              errstr != nullptr ? errstr : "<unknown>");                   \
      exit(-1);                                                            \
    }                                                                      \
  } while (0)
|
|
|
|
// Returns the display name of a CUPTI memcpy kind, e.g. "MEMCPY_HtoD".
// Kinds without a dedicated name map to the generic "MEMCPY".
std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
  const char *label = "MEMCPY";
  switch (kind) {
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
      label = "MEMCPY_HtoD";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
      label = "MEMCPY_DtoH";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
      label = "MEMCPY_HtoA";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
      label = "MEMCPY_AtoH";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
      label = "MEMCPY_AtoA";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
      label = "MEMCPY_AtoD";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
      label = "MEMCPY_DtoA";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
      label = "MEMCPY_DtoD";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
      label = "MEMCPY_HtoH";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
      label = "MEMCPY_PtoP";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
      label = "MEMCPY_FORCE_INT";
      break;
    default:
      break;
  }
  return label;
}
|
|
|
|
// Returns the name stored in driver_cbid_str for `cbid`, or
// "Driver API <cbid>" when the id has no entry.
std::string DriverKind(CUpti_CallbackId cbid) {
  const auto found = driver_cbid_str.find(cbid);
  if (found != driver_cbid_str.end()) {
    return found->second;
  }
  return "Driver API " + std::to_string(cbid);
}
|
|
|
|
// Returns the name stored in runtime_cbid_str for `cbid`, or
// "Runtime API <cbid>" when the id has no entry.
std::string RuntimeKind(CUpti_CallbackId cbid) {
  const auto found = runtime_cbid_str.find(cbid);
  if (found != runtime_cbid_str.end()) {
    return found->second;
  }
  return "Runtime API " + std::to_string(cbid);
}
|
|
|
|
// Enables collection of the CUPTI activity kinds this tracer consumes:
// memcpy, concurrent kernel, driver API, runtime API and memset. Any failing
// call terminates the process through CUPTI_CALL.
void EnableActivity() {
  // Device activity records are created when CUDA initializes, so they have
  // to be enabled before cuInit() or any CUDA runtime call.
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
  CUPTI_CALL(
      dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
  // Not tracked for now, hence not enabled: CUPTI_ACTIVITY_KIND_KERNEL,
  // _OVERHEAD, _DEVICE, _CONTEXT, _NAME and _MARKER.
}
|
|
|
|
// Disables the activity kinds switched on by EnableActivity(): memcpy,
// concurrent kernel, driver API, runtime API and memset. Any failing call
// terminates the process through CUPTI_CALL.
void DisableActivity() {
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
  CUPTI_CALL(
      dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
  // The remaining kinds (_DEVICE, _CONTEXT, _NAME, _MARKER, _OVERHEAD) are
  // never enabled by this file, so there is nothing to disable for them.
}
|
|
|
|
// CUPTI "buffer requested" callback: hands CUPTI a freshly allocated buffer
// of kBufSize bytes whose start is aligned to kAlignSize.
//
// buffer         receives the buffer start, or nullptr if allocation failed.
// size           receives the usable size in bytes (0 if allocation failed).
// maxNumRecords  always set to 0.
//
// NOTE(review): bufferCompleted() releases the pointer returned here with
// free(). That is only valid while the alignment step leaves malloc's pointer
// unchanged - confirm kAlignSize never exceeds malloc's own alignment.
void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
                              size_t *maxNumRecords) {
  *maxNumRecords = 0;
  // kAlignSize spare bytes leave room for rounding the start address up.
  uint8_t *buf = reinterpret_cast<uint8_t *>(malloc(kBufSize + kAlignSize));
  if (buf == nullptr) {
    // Do not advertise capacity that does not exist.
    *buffer = nullptr;
    *size = 0;
    return;
  }
  *size = kBufSize;
  *buffer = ALIGN_BUFFER(buf, kAlignSize);
}
|
|
|
|
// CUPTI "buffer completed" callback: walks every activity record in `buffer`,
// forwards the supported kinds to the global `tracer`, reports dropped
// records and finally frees the buffer.
//
// ctx, streamId  passed on to cuptiActivityGetNumDroppedRecords.
// buffer         buffer previously handed out by bufferRequested(); freed here.
// size           unused.
// validSize      number of bytes in `buffer` that hold records.
//
// All calls must come from one and the same thread; this is enforced with
// PADDLE_ENFORCE_EQ because the tracer's kernel and memcpy lists are appended
// to without locking.
void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
                              size_t size, size_t validSize) {
  // A default-constructed id stands for "no thread yet"; the first caller
  // replaces it with its own id.
  static std::thread::id cupti_thread_id;
  if (cupti_thread_id == std::thread::id())
    cupti_thread_id = std::this_thread::get_id();
  PADDLE_ENFORCE_EQ(std::this_thread::get_id(), cupti_thread_id,
                    "Only one thread is allowed to call bufferCompleted()");
  CUptiResult status;
  CUpti_Activity *record = nullptr;
  if (validSize > 0) {
    do {
      status = dynload::cuptiActivityGetNextRecord(buffer, validSize, &record);
      if (status == CUPTI_SUCCESS) {
        switch (record->kind) {
          case CUPTI_ACTIVITY_KIND_KERNEL:
          case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
            auto *kernel =
                reinterpret_cast<const CUpti_ActivityKernel3 *>(record);
            tracer->AddKernelRecords(kernel->name, kernel->start, kernel->end,
                                     kernel->deviceId, kernel->streamId,
                                     kernel->correlationId);
            break;
          }
          case CUPTI_ACTIVITY_KIND_MEMCPY: {
            auto *copy = reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
            tracer->AddMemRecords(
                MemcpyKind(
                    static_cast<CUpti_ActivityMemcpyKind>(copy->copyKind)),
                copy->start, copy->end, copy->deviceId, copy->streamId,
                copy->correlationId, copy->bytes);
            break;
          }
          case CUPTI_ACTIVITY_KIND_MEMCPY2: {
            auto *copy =
                reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
            tracer->AddMemRecords(
                MemcpyKind(
                    static_cast<CUpti_ActivityMemcpyKind>(copy->copyKind)),
                copy->start, copy->end, copy->deviceId, copy->streamId,
                copy->correlationId, copy->bytes);
            break;
          }
          case CUPTI_ACTIVITY_KIND_MEMSET: {
            // Memsets are stored as kernel records named "MEMSET".
            auto *fill = reinterpret_cast<const CUpti_ActivityMemset *>(record);
            tracer->AddKernelRecords("MEMSET", fill->start, fill->end,
                                     fill->deviceId, fill->streamId,
                                     fill->correlationId);
            break;
          }
          case CUPTI_ACTIVITY_KIND_DRIVER: {
            auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
            if (api->start != 0 && api->end != 0) {
              // -1 device id represents ActiveKind api call
              tracer->AddActiveKindRecords(
                  DriverKind(api->cbid), api->start, api->end, -1,
                  GetThreadIdFromSystemThreadId(api->threadId),
                  api->correlationId);
            }
            break;
          }
          case CUPTI_ACTIVITY_KIND_RUNTIME: {
            auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
            if (api->start != 0 && api->end != 0) {
              // -1 device id represents ActiveKind api call
              tracer->AddActiveKindRecords(
                  RuntimeKind(api->cbid), api->start, api->end, -1,
                  GetThreadIdFromSystemThreadId(api->threadId),
                  api->correlationId);
            }
            break;
          }
          default: { break; }
        }
      } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
        // This status ends the walk; it is not treated as an error.
        break;
      } else {
        // Any other status is fatal: CUPTI_CALL prints it and exits.
        CUPTI_CALL(status);
      }
    } while (1);

    size_t dropped = 0;
    CUPTI_CALL(
        dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
    if (dropped != 0) {
      fprintf(stderr, "Dropped %zu activity records\n", dropped);
      PrintCuptiHint();
    }
  }
  free(buffer);
}
|
|
|
|
// Fills runtime_cbid_str; defined near the end of this file and declared here
// so that the DeviceTracerImpl constructor can call it.
void initCuptiCbidStr();
|
|
|
|
} // namespace
|
|
|
|
#endif // PADDLE_WITH_CUPTI
|
|
|
|
class DeviceTracerImpl : public DeviceTracer {
|
|
public:
|
|
DeviceTracerImpl() : enabled_(false) {
|
|
#ifdef PADDLE_WITH_CUPTI
|
|
initCuptiCbidStr();
|
|
#endif
|
|
}
|
|
|
|
void AddAnnotation(uint32_t id, Event *event) {
|
|
thread_local std::forward_list<std::pair<uint32_t, Event *>>
|
|
*local_correlations_pairs = nullptr;
|
|
if (local_correlations_pairs == nullptr) {
|
|
std::lock_guard<std::mutex> l(trace_mu_);
|
|
correlations_pairs.emplace_front();
|
|
local_correlations_pairs = &correlations_pairs.front();
|
|
}
|
|
local_correlations_pairs->push_front(std::make_pair(id, event));
|
|
}
|
|
|
|
void AddCPURecords(const std::string &anno, uint64_t start_ns,
|
|
uint64_t end_ns, int64_t device_id, int64_t thread_id) {
|
|
if (anno.empty()) {
|
|
VLOG(1) << "Empty timeline annotation.";
|
|
return;
|
|
}
|
|
thread_local std::forward_list<CPURecord> *local_cpu_records_ = nullptr;
|
|
if (local_cpu_records_ == nullptr) {
|
|
std::lock_guard<std::mutex> l(trace_mu_);
|
|
cpu_records_.emplace_front();
|
|
local_cpu_records_ = &cpu_records_.front();
|
|
}
|
|
local_cpu_records_->push_front(
|
|
CPURecord{anno, start_ns, end_ns, device_id, thread_id});
|
|
}
|
|
|
|
void AddMemRecords(const std::string &name, uint64_t start_ns,
|
|
uint64_t end_ns, int64_t device_id, int64_t stream_id,
|
|
uint32_t correlation_id, uint64_t bytes) {
|
|
// 0 means timestamp information could not be collected for the kernel.
|
|
if (start_ns == 0 || end_ns == 0 || start_ns == end_ns) {
|
|
VLOG(3) << name << " cannot be traced";
|
|
PrintCuptiHint();
|
|
return;
|
|
}
|
|
// NOTE(liangdun): lock is not needed, only one thread call this function.
|
|
mem_records_.push_front(MemRecord{name, start_ns, end_ns, device_id,
|
|
stream_id, correlation_id, bytes});
|
|
}
|
|
|
|
void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes,
|
|
const Place &place, const std::string &alloc_in,
|
|
const std::string &free_in, int64_t thread_id) {
|
|
if (0 == start_ns || 0 == end_ns) {
|
|
VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced.";
|
|
return;
|
|
}
|
|
thread_local std::forward_list<MemInfoRecord> *local_mem_info_record =
|
|
nullptr;
|
|
if (local_mem_info_record == nullptr) {
|
|
std::lock_guard<std::mutex> l(trace_mu_);
|
|
mem_info_record_.emplace_front();
|
|
local_mem_info_record = &mem_info_record_.front();
|
|
}
|
|
local_mem_info_record->emplace_front(MemInfoRecord{
|
|
start_ns, end_ns, bytes, place, thread_id, alloc_in, free_in});
|
|
}
|
|
|
|
void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
|
|
uint64_t end_ns, int64_t device_id,
|
|
int64_t thread_id, uint32_t correlation_id) {
|
|
if (anno.empty()) {
|
|
VLOG(1) << "Empty timeline annotation.";
|
|
return;
|
|
}
|
|
thread_local std::forward_list<ActiveKindRecord>
|
|
*local_active_kind_records = nullptr;
|
|
if (local_active_kind_records == nullptr) {
|
|
std::lock_guard<std::mutex> l(trace_mu_);
|
|
active_kind_records_.emplace_front();
|
|
local_active_kind_records = &active_kind_records_.front();
|
|
}
|
|
// lock is not needed, only one thread call this function.
|
|
local_active_kind_records->push_front(ActiveKindRecord{
|
|
anno, start_ns, end_ns, device_id, thread_id, correlation_id});
|
|
}
|
|
|
|
void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
|
|
int64_t device_id, int64_t stream_id,
|
|
uint32_t correlation_id) {
|
|
// 0 means timestamp information could not be collected for the kernel.
|
|
if (start == 0 || end == 0 || start == end) {
|
|
VLOG(3) << correlation_id << " cannot be traced";
|
|
PrintCuptiHint();
|
|
return;
|
|
}
|
|
// NOTE(liangdun): lock is not needed, only one thread call this function.
|
|
kernel_records_.push_front(
|
|
KernelRecord{name, start, end, device_id, stream_id, correlation_id});
|
|
}
|
|
|
|
bool IsEnabled() {
|
|
std::lock_guard<std::mutex> l(trace_mu_);
|
|
return enabled_;
|
|
}
|
|
|
|
void Enable() {
|
|
std::lock_guard<std::mutex> l(trace_mu_);
|
|
if (enabled_) {
|
|
return;
|
|
}
|
|
|
|
#ifdef PADDLE_WITH_CUPTI
|
|
EnableActivity();
|
|
|
|
// Register callbacks for buffer requests and completed by CUPTI.
|
|
CUPTI_CALL(dynload::cuptiActivityRegisterCallbacks(bufferRequested,
|
|
bufferCompleted));
|
|
|
|
CUptiResult ret;
|
|
ret = dynload::cuptiSubscribe(
|
|
&subscriber_, static_cast<CUpti_CallbackFunc>(ApiCallback), this);
|
|
if (ret == CUPTI_ERROR_MAX_LIMIT_REACHED) {
|
|
fprintf(stderr, "CUPTI subcriber limit reached.\n");
|
|
} else if (ret != CUPTI_SUCCESS) {
|
|
fprintf(stderr, "Failed to create CUPTI subscriber.\n");
|
|
}
|
|
const std::vector<int> cbids {
|
|
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
|
|
CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020,
|
|
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
|
|
CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
|
|
CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
|
|
CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020,
|
|
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
|
|
#if CUDA_VERSION >= 9000
|
|
,
|
|
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000,
|
|
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000
|
|
#endif
|
|
};
|
|
for (auto cbid : cbids)
|
|
CUPTI_CALL(dynload::cuptiEnableCallback(
|
|
1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API, cbid));
|
|
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
|
|
#endif // PADDLE_WITH_CUPTI
|
|
enabled_ = true;
|
|
}
|
|
|
|
void Reset() {
|
|
#ifdef PADDLE_WITH_CUPTI
|
|
CUPTI_CALL(
|
|
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
|
|
#endif
|
|
std::lock_guard<std::mutex> l(trace_mu_);
|
|
kernel_records_.clear();
|
|
mem_records_.clear();
|
|
correlations_.clear();
|
|
for (auto &tmp : correlations_pairs) tmp.clear();
|
|
for (auto &tmp : cpu_records_) tmp.clear();
|
|
for (auto &tmp : mem_info_record_) tmp.clear();
|
|
for (auto &tmp : active_kind_records_) tmp.clear();
|
|
}
|
|
|
|
void GenEventKernelCudaElapsedTime() {
|
|
#ifdef PADDLE_WITH_CUPTI
|
|
if (correlations_.empty())
|
|
for (auto &tmp : correlations_pairs)
|
|
for (auto &pair : tmp) correlations_[pair.first] = pair.second;
|
|
for (const KernelRecord &r : kernel_records_) {
|
|
auto c = correlations_.find(r.correlation_id);
|
|
if (c != correlations_.end() && c->second != nullptr) {
|
|
Event *e = c->second;
|
|
e->AddCudaElapsedTime(r.start_ns, r.end_ns);
|
|
}
|
|
}
|
|
for (const auto &r : mem_records_) {
|
|
auto c = correlations_.find(r.correlation_id);
|
|
if (c != correlations_.end() && c->second != nullptr) {
|
|
Event *e = c->second;
|
|
e->AddCudaElapsedTime(r.start_ns, r.end_ns);
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
proto::Profile GenProfile(const std::string &profile_path) {
|
|
int miss = 0, find = 0;
|
|
std::lock_guard<std::mutex> l(trace_mu_);
|
|
proto::Profile profile_pb;
|
|
profile_pb.set_start_ns(start_ns_);
|
|
profile_pb.set_end_ns(end_ns_);
|
|
if (correlations_.empty()) {
|
|
for (auto &tmp : correlations_pairs) {
|
|
for (auto &pair : tmp) correlations_[pair.first] = pair.second;
|
|
}
|
|
}
|
|
|
|
for (const KernelRecord &r : kernel_records_) {
|
|
auto *event = profile_pb.add_events();
|
|
event->set_type(proto::Event::GPUKernel);
|
|
auto c = correlations_.find(r.correlation_id);
|
|
if (c != correlations_.end() && c->second != nullptr) {
|
|
event->set_name(c->second->name());
|
|
event->set_detail_info(r.name);
|
|
find++;
|
|
} else {
|
|
VLOG(10) << "Missing Kernel Event: " + r.name;
|
|
miss++;
|
|
event->set_name(r.name);
|
|
}
|
|
event->set_start_ns(r.start_ns);
|
|
event->set_end_ns(r.end_ns);
|
|
event->set_sub_device_id(r.stream_id);
|
|
event->set_device_id(r.device_id);
|
|
}
|
|
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
|
|
|
|
for (auto &tmp : cpu_records_) {
|
|
for (const CPURecord &r : tmp) {
|
|
auto *event = profile_pb.add_events();
|
|
event->set_type(proto::Event::CPU);
|
|
event->set_name(r.name);
|
|
event->set_start_ns(r.start_ns);
|
|
event->set_end_ns(r.end_ns);
|
|
event->set_sub_device_id(r.thread_id);
|
|
event->set_device_id(r.device_id);
|
|
}
|
|
}
|
|
|
|
for (auto &tmp : active_kind_records_) {
|
|
for (const ActiveKindRecord &r : tmp) {
|
|
auto *event = profile_pb.add_events();
|
|
event->set_type(proto::Event::CPU);
|
|
auto c = correlations_.find(r.correlation_id);
|
|
if (c != correlations_.end() && c->second != nullptr) {
|
|
event->set_name(c->second->name());
|
|
event->set_detail_info(r.name);
|
|
} else {
|
|
event->set_name(r.name);
|
|
}
|
|
event->set_start_ns(r.start_ns);
|
|
event->set_end_ns(r.end_ns);
|
|
event->set_sub_device_id(r.thread_id);
|
|
event->set_device_id(r.device_id);
|
|
}
|
|
}
|
|
miss = find = 0;
|
|
for (const MemRecord &r : mem_records_) {
|
|
auto *event = profile_pb.add_events();
|
|
event->set_type(proto::Event::GPUKernel);
|
|
auto c = correlations_.find(r.correlation_id);
|
|
if (c != correlations_.end() && c->second != nullptr) {
|
|
event->set_name(c->second->name());
|
|
event->set_detail_info(r.name);
|
|
find++;
|
|
} else {
|
|
miss++;
|
|
event->set_name(r.name);
|
|
}
|
|
event->set_start_ns(r.start_ns);
|
|
event->set_end_ns(r.end_ns);
|
|
event->set_sub_device_id(r.stream_id);
|
|
event->set_device_id(r.device_id);
|
|
event->mutable_memcopy()->set_bytes(r.bytes);
|
|
}
|
|
VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
|
|
|
|
for (auto &tmp : mem_info_record_) {
|
|
for (const auto &r : tmp) {
|
|
auto *event = profile_pb.add_mem_events();
|
|
event->set_device_id(0);
|
|
if (platform::is_cpu_place(r.place)) {
|
|
event->set_place(proto::MemEvent::CPUPlace);
|
|
} else if (platform::is_gpu_place(r.place)) {
|
|
event->set_place(proto::MemEvent::CUDAPlace);
|
|
event->set_device_id(
|
|
boost::get<platform::CUDAPlace>(r.place).GetDeviceId());
|
|
} else if (platform::is_cuda_pinned_place(r.place)) {
|
|
event->set_place(proto::MemEvent::CUDAPinnedPlace);
|
|
} else {
|
|
PADDLE_THROW("The current place is not supported.");
|
|
}
|
|
event->set_alloc_in(r.alloc_in);
|
|
event->set_free_in(r.free_in);
|
|
event->set_start_ns(r.start_ns);
|
|
event->set_end_ns(r.end_ns);
|
|
event->set_bytes(r.bytes);
|
|
event->set_thread_id(r.thread_id);
|
|
}
|
|
}
|
|
|
|
std::ofstream profile_f;
|
|
profile_f.open(profile_path,
|
|
std::ios::out | std::ios::trunc | std::ios::binary);
|
|
profile_pb.SerializeToOstream(&profile_f);
|
|
profile_f.close();
|
|
return profile_pb;
|
|
}
|
|
|
|
void Disable() {
|
|
#ifdef PADDLE_WITH_CUPTI
|
|
// flush might cause additional calls to DeviceTracker.
|
|
CUPTI_CALL(
|
|
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
|
|
#endif // PADDLE_WITH_CUPTI
|
|
std::lock_guard<std::mutex> l(trace_mu_);
|
|
#ifdef PADDLE_WITH_CUPTI
|
|
DisableActivity();
|
|
CUPTI_CALL(dynload::cuptiUnsubscribe(subscriber_));
|
|
CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
|
|
#endif // PADDLE_WITH_CUPTI
|
|
enabled_ = false;
|
|
}
|
|
|
|
private:
|
|
#ifdef PADDLE_WITH_CUPTI
|
|
static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
|
|
CUpti_CallbackId cbid, const void *cbdata) {
|
|
auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
|
|
DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
|
|
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
|
|
Event *event = CurAnnotation();
|
|
tracer->AddAnnotation(cbInfo->correlationId, event);
|
|
}
|
|
}
|
|
CUpti_SubscriberHandle subscriber_;
|
|
#endif // PADDLE_WITH_CUPTI
|
|
std::mutex trace_mu_;
|
|
bool enabled_;
|
|
uint64_t start_ns_;
|
|
uint64_t end_ns_;
|
|
std::forward_list<KernelRecord> kernel_records_;
|
|
std::forward_list<MemRecord> mem_records_;
|
|
std::forward_list<std::forward_list<CPURecord>> cpu_records_;
|
|
std::forward_list<std::forward_list<MemInfoRecord>> mem_info_record_;
|
|
std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
|
|
std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
|
|
correlations_pairs;
|
|
std::unordered_map<uint32_t, Event *> correlations_;
|
|
};
|
|
|
|
// Allocates a new DeviceTracerImpl and stores its address in `*t`.
void CreateTracer(DeviceTracer **t) {
  *t = new DeviceTracerImpl();
}
|
|
|
|
// Returns the shared tracer, creating it through CreateTracer() on the first
// call; std::call_once makes sure creation happens only once.
DeviceTracer *GetDeviceTracer() {
  DeviceTracer **slot = &tracer;
  std::call_once(tracer_once_flag, CreateTracer, slot);
  return tracer;
}
|
|
|
|
// Makes `event` the innermost annotation of the calling thread.
void SetCurAnnotation(Event *event) {
  annotation_stack.push_back(event);
}
|
|
|
|
// Removes the innermost annotation of the calling thread. There is no
// emptiness check, so a matching SetCurAnnotation() must have come first.
void ClearCurAnnotation() {
  annotation_stack.pop_back();
}
|
|
|
|
// Returns the innermost annotation of the calling thread, or nullptr when
// the thread has none.
Event *CurAnnotation() {
  return annotation_stack.empty() ? nullptr : annotation_stack.back();
}
|
|
std::string CurAnnotationName() {
|
|
if (annotation_stack.empty()) return "Unknown";
|
|
return annotation_stack.back()->name();
|
|
}
|
|
|
|
// Pushes `block_id` as the innermost block of the calling thread.
void SetCurBlock(int block_id) {
  block_id_stack.push_back(block_id);
}
|
|
|
|
// Pops the innermost block of the calling thread. There is no emptiness
// check, so a matching SetCurBlock() must have come first.
void ClearCurBlock() {
  block_id_stack.pop_back();
}
|
|
|
|
// Returns how many blocks are currently stacked on the calling thread.
int BlockDepth() {
  const auto depth = block_id_stack.size();
  return depth;
}
|
|
|
|
// Returns a 32-bit id for the calling thread.
//
// The id is the streamed text of std::this_thread::get_id() read as a decimal
// number and truncated to 32 bits. The standard leaves that text unspecified,
// so when it is not purely decimal (e.g. "0x..." or a descriptive message)
// the hash of the thread id is used instead. This avoids an exception or
// every thread collapsing onto the same value.
uint32_t GetCurSystemThreadId() {
  const std::thread::id self = std::this_thread::get_id();
  std::stringstream ss;
  ss << self;
  const std::string text = ss.str();

  uint64_t numeric = 0;
  bool is_decimal = !text.empty();
  for (const char ch : text) {
    if (ch < '0' || ch > '9') {
      is_decimal = false;
      break;
    }
    numeric = numeric * 10 + static_cast<uint64_t>(ch - '0');
  }
  if (!is_decimal) {
    numeric = static_cast<uint64_t>(std::hash<std::thread::id>()(self));
  }
  return static_cast<uint32_t>(numeric);
}
|
|
|
|
void RecoreCurThreadId(int32_t id) {
|
|
auto gid = GetCurSystemThreadId();
|
|
VLOG(1) << "RecoreCurThreadId: " << gid << " -> " << id;
|
|
system_thread_id_map[gid] = id;
|
|
}
|
|
|
|
int32_t GetThreadIdFromSystemThreadId(uint32_t id) {
|
|
auto it = system_thread_id_map.find(id);
|
|
if (it != system_thread_id_map.end()) return it->second;
|
|
// return origin id if no event is recorded in this thread.
|
|
return static_cast<int32_t>(id);
|
|
}
|
|
|
|
#ifdef PADDLE_WITH_CUPTI
|
|
namespace {
|
|
|
|
void initCuptiCbidStr() {
|
|
static bool called = false;
|
|
if (called) return;
|
|
called = true;
|
|
#define REGISTER_RUNTIME_CBID_STR(cbid) \
|
|
runtime_cbid_str[CUPTI_RUNTIME_TRACE_CBID_##cbid] = #cbid
|
|
|
|
REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaConfigureCall_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000);
|
|
REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050);
|
|
REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaDriverGetVersion_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaFree_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaGetDeviceCount_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaGetDeviceProperties_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaGetErrorString_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaGetLastError_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaHostAlloc_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaHostGetDevicePointer_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000);
|
|
REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaMemcpy_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaMemset_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(
|
|
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000);
|
|
REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaStreamCreate_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000);
|
|
REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050);
|
|
REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050);
|
|
REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
|
|
REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010);
|
|
#if CUDA_VERSION >= 9000
|
|
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
|
|
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
|
|
#endif
|
|
|
|
#undef REGISTER_RUNTIME_CBID_STR
|
|
}
|
|
} // namespace
|
|
#endif // PADDLE_WITH_CUPTI
|
|
|
|
} // namespace platform
|
|
} // namespace paddle
|