|
|
|
@ -394,6 +394,38 @@ void GetChildMap(const std::multimap<std::string, EventItem> &sub_child_map,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void PrintOverHead(const OverHead &overhead, const size_t data_width) {
|
|
|
|
|
double compute_time = overhead.total_time * overhead.compute_ratio;
|
|
|
|
|
double framework_time = overhead.total_time * overhead.framework_ratio;
|
|
|
|
|
std::cout.setf(std::ios::left);
|
|
|
|
|
std::cout << "Total time: " << overhead.total_time << std::endl;
|
|
|
|
|
std::cout << std::setw(25) << " Computation time"
|
|
|
|
|
<< "Total: " << std::setw(data_width) << compute_time
|
|
|
|
|
<< "Ratio: " << overhead.compute_ratio * 100 << "%" << std::endl;
|
|
|
|
|
std::cout << std::setw(25) << " Framework overhead"
|
|
|
|
|
<< "Total: " << std::setw(data_width) << framework_time
|
|
|
|
|
<< "Ratio: " << overhead.framework_ratio * 100 << "%" << std::endl;
|
|
|
|
|
|
|
|
|
|
std::cout << "\n-------------------------"
|
|
|
|
|
<< " GpuMemCpy Summary "
|
|
|
|
|
<< "-------------------------\n\n";
|
|
|
|
|
std::cout << std::setw(25) << "GpuMemcpy"
|
|
|
|
|
<< "Calls: " << std::setw(data_width) << overhead.memcpy_item.calls
|
|
|
|
|
<< "Total: " << std::setw(data_width)
|
|
|
|
|
<< overhead.memcpy_item.total_time
|
|
|
|
|
<< "Ratio: " << overhead.memcpy_item.ratio * 100 << "%"
|
|
|
|
|
<< std::endl;
|
|
|
|
|
for (size_t i = 0; i < overhead.sub_memcpy_items.size(); ++i) {
|
|
|
|
|
EventItem item = overhead.sub_memcpy_items[i];
|
|
|
|
|
if (item.calls != 0) {
|
|
|
|
|
std::cout << std::setw(25) << " " + item.name
|
|
|
|
|
<< "Calls: " << std::setw(data_width) << item.calls
|
|
|
|
|
<< "Total: " << std::setw(data_width) << item.total_time
|
|
|
|
|
<< "Ratio: " << item.ratio * 100 << "%" << std::endl;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Print results
|
|
|
|
|
void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
|
|
|
|
|
const std::multimap<std::string, EventItem> &child_map,
|
|
|
|
@ -428,38 +460,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
|
|
|
|
|
<< " in descending order in the same thread\n\n";
|
|
|
|
|
|
|
|
|
|
if (overhead.print) {
|
|
|
|
|
double compute_time = overhead.total_time * overhead.compute_ratio;
|
|
|
|
|
double framework_time = overhead.total_time * overhead.framework_ratio;
|
|
|
|
|
std::cout.setf(std::ios::left);
|
|
|
|
|
std::cout << "Total time: " << overhead.total_time << std::endl;
|
|
|
|
|
std::cout << std::setw(25) << " Computation time"
|
|
|
|
|
<< "Total: " << std::setw(data_width) << compute_time
|
|
|
|
|
<< "Ratio: " << overhead.compute_ratio * 100 << "%"
|
|
|
|
|
<< std::endl;
|
|
|
|
|
std::cout << std::setw(25) << " Framework overhead"
|
|
|
|
|
<< "Total: " << std::setw(data_width) << framework_time
|
|
|
|
|
<< "Ratio: " << overhead.framework_ratio * 100 << "%"
|
|
|
|
|
<< std::endl;
|
|
|
|
|
|
|
|
|
|
std::cout << "\n-------------------------"
|
|
|
|
|
<< " GpuMemCpy Summary "
|
|
|
|
|
<< "-------------------------\n\n";
|
|
|
|
|
std::cout << std::setw(25) << "GpuMemcpy"
|
|
|
|
|
<< "Calls: " << std::setw(data_width)
|
|
|
|
|
<< overhead.memcpy_item.calls
|
|
|
|
|
<< "Total: " << std::setw(data_width)
|
|
|
|
|
<< overhead.memcpy_item.total_time
|
|
|
|
|
<< "Ratio: " << overhead.memcpy_item.ratio * 100 << "%"
|
|
|
|
|
<< std::endl;
|
|
|
|
|
for (size_t i = 0; i < overhead.sub_memcpy_items.size(); ++i) {
|
|
|
|
|
EventItem item = overhead.sub_memcpy_items[i];
|
|
|
|
|
if (item.calls != 0) {
|
|
|
|
|
std::cout << std::setw(25) << " " + item.name
|
|
|
|
|
<< "Calls: " << std::setw(data_width) << item.calls
|
|
|
|
|
<< "Total: " << std::setw(data_width) << item.total_time
|
|
|
|
|
<< "Ratio: " << item.ratio * 100 << "%" << std::endl;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
PrintOverHead(overhead, data_width);
|
|
|
|
|
}
|
|
|
|
|
std::cout << "\n-------------------------"
|
|
|
|
|
<< " Event Summary "
|
|
|
|
@ -522,37 +523,13 @@ void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Parse the event list and output the profiling report
|
|
|
|
|
void ParseEvents(const std::vector<std::vector<Event>> &events,
|
|
|
|
|
bool merge_thread,
|
|
|
|
|
EventSortingKey sorted_by = EventSortingKey::kDefault) {
|
|
|
|
|
if (g_state == ProfilerState::kDisabled) return;
|
|
|
|
|
if (merge_thread && events.size() < 2) return;
|
|
|
|
|
|
|
|
|
|
std::string sorted_domain;
|
|
|
|
|
std::function<bool(const EventItem &, const EventItem &)> sorted_func;
|
|
|
|
|
sorted_func = SetSortedFunc(sorted_by, &sorted_domain);
|
|
|
|
|
|
|
|
|
|
const std::vector<std::vector<Event>> *analyze_events;
|
|
|
|
|
std::vector<std::vector<Event>> merged_events_list;
|
|
|
|
|
if (merge_thread) {
|
|
|
|
|
std::vector<Event> merged_events;
|
|
|
|
|
for (size_t i = 0; i < events.size(); ++i) {
|
|
|
|
|
for (size_t j = 0; j < events[i].size(); ++j) {
|
|
|
|
|
merged_events.push_back(events[i][j]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
merged_events_list.push_back(merged_events);
|
|
|
|
|
analyze_events = &merged_events_list;
|
|
|
|
|
} else {
|
|
|
|
|
analyze_events = &events;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<std::vector<EventItem>> events_table;
|
|
|
|
|
std::multimap<std::string, EventItem> child_map;
|
|
|
|
|
size_t max_name_width = 0;
|
|
|
|
|
OverHead overhead;
|
|
|
|
|
|
|
|
|
|
void AnalyzeEvent(
|
|
|
|
|
const std::vector<std::vector<Event>> *analyze_events,
|
|
|
|
|
std::vector<std::vector<EventItem>> *events_table,
|
|
|
|
|
std::multimap<std::string, EventItem> *child_map,
|
|
|
|
|
std::function<bool(const EventItem &, const EventItem &)> sorted_func,
|
|
|
|
|
EventSortingKey sorted_by, size_t *max_name_width, OverHead *overhead,
|
|
|
|
|
bool merge_thread) {
|
|
|
|
|
for (size_t i = 0; i < (*analyze_events).size(); i++) {
|
|
|
|
|
double total = 0.; // the total time in one thread
|
|
|
|
|
std::list<Event> pushed_events;
|
|
|
|
@ -563,7 +540,7 @@ void ParseEvents(const std::vector<std::vector<Event>> &events,
|
|
|
|
|
|
|
|
|
|
for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
|
|
|
|
|
Event analyze_event = (*analyze_events)[i][j];
|
|
|
|
|
SetEvent(merge_thread, analyze_event, &max_name_width, &pushed_events,
|
|
|
|
|
SetEvent(merge_thread, analyze_event, max_name_width, &pushed_events,
|
|
|
|
|
&event_items, &event_idx);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -593,7 +570,6 @@ void ParseEvents(const std::vector<std::vector<Event>> &events,
|
|
|
|
|
total += event_items[j].total_time;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// average time
|
|
|
|
|
for (auto &item : main_event_items) {
|
|
|
|
|
item.ave_time = item.total_time / item.calls;
|
|
|
|
@ -603,20 +579,18 @@ void ParseEvents(const std::vector<std::vector<Event>> &events,
|
|
|
|
|
it->second.ratio = it->second.total_time / total;
|
|
|
|
|
it->second.ave_time = it->second.total_time / it->second.calls;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// When multi-threaded, overhead are printed only if merge_thread is true
|
|
|
|
|
if ((*analyze_events).size() == 1) {
|
|
|
|
|
overhead.total_time = total;
|
|
|
|
|
overhead.print = true;
|
|
|
|
|
ComputeOverhead(sub_child_map, &overhead);
|
|
|
|
|
overhead->total_time = total;
|
|
|
|
|
overhead->print = true;
|
|
|
|
|
ComputeOverhead(sub_child_map, overhead);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// sort
|
|
|
|
|
if (sorted_by != EventSortingKey::kDefault) {
|
|
|
|
|
std::sort(main_event_items.begin(), main_event_items.end(), sorted_func);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
events_table.push_back(main_event_items);
|
|
|
|
|
events_table->push_back(main_event_items);
|
|
|
|
|
// log warning if there are events with `push` but without `pop`
|
|
|
|
|
std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
|
|
|
|
|
while (rit != pushed_events.rend()) {
|
|
|
|
@ -625,9 +599,43 @@ void ParseEvents(const std::vector<std::vector<Event>> &events,
|
|
|
|
|
++rit;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
GetChildMap(sub_child_map, &child_map);
|
|
|
|
|
GetChildMap(sub_child_map, child_map);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// Parse the event list and output the profiling report
|
|
|
|
|
void ParseEvents(const std::vector<std::vector<Event>> &events,
|
|
|
|
|
bool merge_thread,
|
|
|
|
|
EventSortingKey sorted_by = EventSortingKey::kDefault) {
|
|
|
|
|
if (g_state == ProfilerState::kDisabled) return;
|
|
|
|
|
if (merge_thread && events.size() < 2) return;
|
|
|
|
|
|
|
|
|
|
std::string sorted_domain;
|
|
|
|
|
std::function<bool(const EventItem &, const EventItem &)> sorted_func;
|
|
|
|
|
sorted_func = SetSortedFunc(sorted_by, &sorted_domain);
|
|
|
|
|
|
|
|
|
|
const std::vector<std::vector<Event>> *analyze_events;
|
|
|
|
|
std::vector<std::vector<Event>> merged_events_list;
|
|
|
|
|
if (merge_thread) {
|
|
|
|
|
std::vector<Event> merged_events;
|
|
|
|
|
for (size_t i = 0; i < events.size(); ++i) {
|
|
|
|
|
for (size_t j = 0; j < events[i].size(); ++j) {
|
|
|
|
|
merged_events.push_back(events[i][j]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
merged_events_list.push_back(merged_events);
|
|
|
|
|
analyze_events = &merged_events_list;
|
|
|
|
|
} else {
|
|
|
|
|
analyze_events = &events;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<std::vector<EventItem>> events_table;
|
|
|
|
|
std::multimap<std::string, EventItem> child_map;
|
|
|
|
|
size_t max_name_width = 0;
|
|
|
|
|
OverHead overhead;
|
|
|
|
|
|
|
|
|
|
AnalyzeEvent(analyze_events, &events_table, &child_map, sorted_func,
|
|
|
|
|
sorted_by, &max_name_width, &overhead, merge_thread);
|
|
|
|
|
|
|
|
|
|
// Print report
|
|
|
|
|
PrintProfiler(events_table, child_map, overhead, sorted_domain,
|
|
|
|
|
max_name_width + 8, 12, merge_thread, 0, 0);
|
|
|
|
|