commit
42a0603e6f
After Width: | Height: | Size: 50 KiB |
File diff suppressed because one or more lines are too long
@ -0,0 +1,173 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/platform/profiler.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace platform {
|
||||
|
||||
// The profiler state, the initial value is ProfilerState::kDisabled
|
||||
static ProfilerState g_state = ProfilerState::kDisabled;
|
||||
// The thread local event list only can be accessed by the specific thread
|
||||
// The thread index of each thread
|
||||
static thread_local int32_t g_thread_id;
|
||||
// The g_next_thread_id is a global counter for threads, by the g_thread_id and
|
||||
// g_next_thread_id, we can know how many threads have created EventList.
|
||||
static uint32_t g_next_thread_id = 0;
|
||||
// The global mutex
|
||||
static std::mutex g_all_event_lists_mutex;
|
||||
// The total event lists of all threads
|
||||
static std::list<std::shared_ptr<EventList>> g_all_event_lists;
|
||||
// The thread local event list only can be accessed by the specific thread
|
||||
static thread_local std::shared_ptr<EventList> g_event_list;
|
||||
|
||||
// Returns the current reading of a steady clock as a nanosecond count since
// that clock's epoch.
inline uint64_t GetTimeInNsec() {
  // high_resolution_clock is used only when it is steady; otherwise fall back
  // to steady_clock.
  using SteadyClock =
      std::conditional<std::chrono::high_resolution_clock::is_steady,
                       std::chrono::high_resolution_clock,
                       std::chrono::steady_clock>::type;
  const auto since_epoch = SteadyClock::now().time_since_epoch();
  const auto as_nanos =
      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch);
  return as_nanos.count();
}
|
||||
|
||||
// Builds an event of the given kind and name for thread `thread_id` and
// stamps it with the current CPU time. In CUDA builds, a non-null dev_ctx
// additionally gets a CUDA event created and recorded on its stream.
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
             DeviceContext* dev_ctx)
    : kind_(kind),
      name_(std::move(name)),
      thread_id_(thread_id),
      has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA
  // NOTE(review): a non-null dev_ctx is assumed to really be a
  // CUDADeviceContext -- confirm against callers.
  const auto* gpu_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
  if (gpu_ctx != nullptr) {
    PADDLE_ENFORCE(cudaGetDevice(&device_));
    PADDLE_ENFORCE(cudaEventCreate(&event_));
    PADDLE_ENFORCE(cudaEventRecord(event_, gpu_ctx->stream()));
    has_cuda_ = true;
  }
#endif
  // The CPU timestamp is taken last, after any CUDA event has been recorded.
  cpu_ns_ = GetTimeInNsec();
}
|
||||
|
||||
std::string Event::kind() const {
|
||||
switch (kind_) {
|
||||
case EventKind::kMark:
|
||||
return "mark";
|
||||
case EventKind::kPushRange:
|
||||
return "push";
|
||||
case EventKind::kPopRange:
|
||||
return "pop";
|
||||
}
|
||||
PADDLE_THROW("Unknown EventKind.");
|
||||
}
|
||||
|
||||
double Event::CpuElapsedUs(const Event& e) const {
|
||||
return (e.cpu_ns_ - cpu_ns_) / (1000.0);
|
||||
}
|
||||
|
||||
// Returns the CUDA time from this event to `e`, in microseconds. Both events
// must carry a CUDA event and have been recorded on the same device. Without
// CUDA support this always raises.
double Event::CudaElapsedUs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA
  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
  PADDLE_ENFORCE(e.device() == device());
  // Wait for both events to complete before querying the elapsed time.
  PADDLE_ENFORCE(cudaEventSynchronize(event_));
  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
  float elapsed_ms;
  PADDLE_ENFORCE(cudaEventElapsedTime(&elapsed_ms, event_, e.event()));
  // cudaEventElapsedTime reports milliseconds; convert to microseconds.
  return elapsed_ms * 1000.0;
#else
  PADDLE_THROW("CUDA is not enabled");
#endif
}
|
||||
|
||||
#ifdef PADDLE_WITH_CUDA
|
||||
static void ForEachDevice(std::function<void(int)> func) {
|
||||
auto original_device = GetCurrentDeviceId();
|
||||
int count = GetCUDADeviceCount();
|
||||
for (int i = 0; i < count; i++) {
|
||||
SetDeviceId(i);
|
||||
func(i);
|
||||
}
|
||||
SetDeviceId(original_device);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Returns the calling thread's event list. On the first call from a thread,
// the list is created, the thread is given the next thread index, and the
// list is registered in g_all_event_lists, all under the global mutex.
inline EventList& GetEventList() {
  if (g_event_list) return *g_event_list;

  std::lock_guard<std::mutex> lock(g_all_event_lists_mutex);
  g_event_list = std::make_shared<EventList>();
  g_thread_id = g_next_thread_id++;
  g_all_event_lists.emplace_front(g_event_list);
  return *g_event_list;
}
|
||||
|
||||
// Records a kMark event called `name` in the calling thread's event list.
// dev_ctx is handed to the Event constructor; pass nullptr for a CPU-only
// mark.
void Mark(const std::string& name, DeviceContext* dev_ctx) {
  // `name` is a const reference, so std::move on it could never move; pass it
  // as is and let Event take its own copy.
  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
}
|
||||
|
||||
// Records a kPushRange event called `name` in the calling thread's event list
// unless profiling is disabled. dev_ctx is remembered for the matching pop
// event emitted by the destructor.
RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx)
    // Always initialize dev_ctx_, even when profiling is disabled: the
    // destructor re-checks g_state and would otherwise read an indeterminate
    // pointer if profiling got enabled in between.
    : dev_ctx_(dev_ctx) {
  if (g_state == ProfilerState::kDisabled) return;
  // `name` is a const reference, so it is passed as is (std::move on it could
  // never move).
  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx_);
}
|
||||
|
||||
// Records an unnamed kPopRange event in the calling thread's event list
// unless profiling is disabled.
RecordEvent::~RecordEvent() {
  if (g_state != ProfilerState::kDisabled) {
    GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id,
                          dev_ctx_);
  }
}
|
||||
|
||||
// Switches the profiler from disabled to `state` and records a
// "_start_profiler_" mark. `state` must not be kDisabled and the profiler
// must currently be disabled. In CUDA builds with state == kCUDA, five
// "_cuda_startup_" marks are recorded per device first.
void EnableProfiler(ProfilerState state) {
  // Each message is a single string literal: adjacent literals are
  // concatenated, so the text is not split across two macro arguments.
  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                 "Can't enable profiling, since the input state is "
                 "ProfilerState::kDisabled");
  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
                 "The profiling state should be disabled when calling "
                 "EnableProfiler.");
  g_state = state;
#ifdef PADDLE_WITH_CUDA
  if (g_state == ProfilerState::kCUDA) {
    // Generate some dummy events first to reduce the startup overhead.
    for (int i = 0; i < 5; i++) {
      ForEachDevice([](int d) {
        // The context lives on the stack so it is released after the mark
        // instead of being leaked on every iteration.
        CUDAPlace place(d);
        CUDADeviceContext dev_ctx(place);
        Mark("_cuda_startup_", &dev_ctx);
        dev_ctx.Wait();
      });
    }
  }
#endif
  // Mark the profiling start.
  Mark("_start_profiler_", nullptr);
}
|
||||
|
||||
std::vector<std::vector<Event>> DisableProfiler() {
|
||||
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
|
||||
"Can't disable profiling, since it's not starting.");
|
||||
// Mark the profiling stop.
|
||||
Mark("_stop_profiler_", nullptr);
|
||||
g_state = ProfilerState::kDisabled;
|
||||
std::vector<std::vector<Event>> result;
|
||||
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
|
||||
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
|
||||
++it) {
|
||||
result.emplace_back((*it)->Reduce());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace platform
|
||||
} // namespace paddle
|
@ -0,0 +1,114 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
#include <forward_list>
|
||||
#include <list>
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
#include "paddle/platform/device_context.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace platform {
|
||||
|
||||
// The kind of a profiler event: a standalone mark, or the opening / closing
// end of a range.
enum EventKind {
  kMark = 0,
  kPushRange = 1,
  kPopRange = 2,
};
|
||||
|
||||
class Event {
|
||||
public:
|
||||
// The DeviceContext is used to get the cuda stream.
|
||||
// If CPU profiling mode, can pass nullptr.
|
||||
Event(EventKind kind, std::string name, uint32_t thread_id,
|
||||
DeviceContext* dev_ctx);
|
||||
|
||||
std::string kind() const;
|
||||
std::string name() const { return name_; }
|
||||
bool has_cuda() const { return has_cuda_; }
|
||||
|
||||
#ifdef PADDLE_WITH_CUDA
|
||||
cudaEvent_t event() const { return event_; }
|
||||
int device() const { return device_; }
|
||||
#endif
|
||||
|
||||
double CpuElapsedUs(const Event& e) const;
|
||||
double CudaElapsedUs(const Event& e) const;
|
||||
|
||||
private:
|
||||
EventKind kind_;
|
||||
std::string name_;
|
||||
uint32_t thread_id_;
|
||||
int64_t cpu_ns_;
|
||||
bool has_cuda_;
|
||||
#ifdef PADDLE_WITH_CUDA
|
||||
cudaEvent_t event_ = nullptr;
|
||||
int device_ = -1;
|
||||
#endif
|
||||
};
|
||||
|
||||
struct EventList {
|
||||
constexpr static size_t kMB = 1024 * 1024;
|
||||
constexpr static size_t kEventBlockSize = 16 * kMB;
|
||||
constexpr static size_t kEventSize = sizeof(Event);
|
||||
constexpr static size_t kEventAlign = alignof(Event);
|
||||
constexpr static size_t kNumBlock =
|
||||
kEventBlockSize /
|
||||
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
|
||||
|
||||
template <typename... Args>
|
||||
void Record(Args&&... args) {
|
||||
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
|
||||
event_blocks.emplace_front();
|
||||
event_blocks.front().reserve(kNumBlock);
|
||||
}
|
||||
event_blocks.front().emplace_back(std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
std::vector<Event> Reduce() {
|
||||
std::vector<Event> result;
|
||||
for (auto& block : event_blocks) {
|
||||
result.insert(result.begin(), std::make_move_iterator(block.begin()),
|
||||
std::make_move_iterator(block.end()));
|
||||
}
|
||||
event_blocks.clear();
|
||||
return result;
|
||||
}
|
||||
|
||||
std::forward_list<std::vector<Event>> event_blocks;
|
||||
};
|
||||
|
||||
// The mode the profiler is currently in.
enum ProfilerState {
  kDisabled = 0,  // disabled state
  kCPU = 1,       // CPU profiling state
  kCUDA = 2,      // GPU profiling state
};
|
||||
|
||||
void Mark(const std::string& name, DeviceContext* dev_ctx);
|
||||
|
||||
struct RecordEvent {
|
||||
explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
|
||||
|
||||
~RecordEvent();
|
||||
|
||||
// The device context is used by Event to get the current cuda stream.
|
||||
DeviceContext* dev_ctx_;
|
||||
};
|
||||
|
||||
// Enable the profiling function.
|
||||
void EnableProfiler(ProfilerState state);
|
||||
|
||||
// Return the event list of all threads. Assuming the returned value is called
|
||||
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
|
||||
std::vector<std::vector<Event>> DisableProfiler();
|
||||
|
||||
} // namespace platform
|
||||
} // namespace paddle
|
@ -0,0 +1,98 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/platform/profiler.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
// Two CPU-only events created back to back must report a positive CPU
// elapsed time, and must not carry a CUDA event.
TEST(Event, CpuElapsedTime) {
  using paddle::platform::Event;
  using paddle::platform::EventKind;

  Event start_event(EventKind::kPushRange, "test", 0, nullptr);
  EXPECT_FALSE(start_event.has_cuda());
  // Spin briefly between the two events.
  for (int counter = 0; counter != 1000; ++counter) {
  }
  Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
  EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
}
|
||||
|
||||
#ifdef PADDLE_WITH_CUDA
|
||||
// Two events recorded on the same CUDA device context must both carry a CUDA
// event and report a positive CUDA elapsed time.
TEST(Event, CudaElapsedTime) {
  using paddle::platform::DeviceContext;
  using paddle::platform::CUDADeviceContext;
  using paddle::platform::CUDAPlace;
  using paddle::platform::Event;
  using paddle::platform::EventKind;

  // The context lives on the stack so it is released when the test ends
  // instead of being leaked.
  CUDAPlace place(0);
  CUDADeviceContext cuda_ctx(place);
  DeviceContext* dev_ctx = &cuda_ctx;
  Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
  EXPECT_TRUE(start_event.has_cuda() == true);
  int counter = 0;
  while (counter != 1000) {
    counter++;
  }
  Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
  EXPECT_GT(start_event.CudaElapsedUs(stop_event), 0);
}
|
||||
#endif
|
||||
|
||||
// Enables the profiler, opens and closes four RecordEvent ranges, disables
// the profiler, and checks the returned events: every push is directly
// followed by a pop with positive elapsed time, and the start / stop marks
// appear exactly once.
TEST(RecordEvent, RecordEvent) {
  using paddle::platform::DeviceContext;
  using paddle::platform::Event;
  using paddle::platform::EventKind;
  using paddle::platform::RecordEvent;
  using paddle::platform::ProfilerState;

  ProfilerState state = ProfilerState::kCPU;
  DeviceContext* dev_ctx = nullptr;
#ifdef PADDLE_WITH_CUDA
  using paddle::platform::CUDADeviceContext;
  using paddle::platform::CUDAPlace;
  state = ProfilerState::kCUDA;
  // The context lives on the stack (function scope) so it outlives every
  // event below and is released when the test ends instead of being leaked.
  CUDAPlace place(0);
  CUDADeviceContext cuda_ctx(place);
  dev_ctx = &cuda_ctx;
#endif
  EnableProfiler(state);

  for (int i = 1; i < 5; ++i) {
    std::string name = "op_" + std::to_string(i);
    RecordEvent record_event(name, dev_ctx);
    int counter = 1;
    while (counter != i * 1000) counter++;
  }
  std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
  int cuda_startup_count = 0;
  int start_profiler_count = 0;
  int stop_profiler_count = 0;
  for (size_t i = 0; i < events.size(); ++i) {
    for (size_t j = 0; j < events[i].size(); ++j) {
      if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
      if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
      if (events[i][j].name() == "_stop_profiler_") ++stop_profiler_count;
      // "push" / "pop" are event kinds, not names (names here are "op_<i>",
      // and pop events are unnamed), so the pairing check must use kind().
      if (events[i][j].kind() == "push") {
        // Guard the look-ahead so a trailing push cannot index out of range.
        ASSERT_LT(j + 1, events[i].size());
        EXPECT_EQ(events[i][j + 1].kind(), "pop");
#ifdef PADDLE_WITH_CUDA
        EXPECT_GT(events[i][j].CudaElapsedUs(events[i][j + 1]), 0);
#else
        EXPECT_GT(events[i][j].CpuElapsedUs(events[i][j + 1]), 0);
#endif
      }
    }
  }
  EXPECT_EQ(cuda_startup_count % 5, 0);
  EXPECT_EQ(start_profiler_count, 1);
  EXPECT_EQ(stop_profiler_count, 1);
}
|
Loading…
Reference in new issue