From a6d30a860723190900e713d36d2517d0f3dec98d Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Mon, 23 Jul 2018 16:33:41 +0800 Subject: [PATCH 1/3] profiler support cpu --- paddle/fluid/platform/device_tracer.cc | 103 +++++++++++++++++++++++-- paddle/fluid/platform/device_tracer.h | 9 +++ paddle/fluid/platform/profiler.cc | 7 -- python/paddle/fluid/profiler.py | 4 +- 4 files changed, 107 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index d9e2afadaf..c376480e94 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -361,27 +361,116 @@ class DeviceTracerDummy : public DeviceTracer { public: DeviceTracerDummy() {} - void AddAnnotation(uint64_t id, const std::string &anno) {} + void AddAnnotation(uint64_t id, const std::string &anno) { + std::lock_guard l(trace_mu_); + correlations_[id] = anno; + } void AddCPURecords(const std::string &anno, uint64_t start_ns, - uint64_t end_ns, int64_t device_id, int64_t thread_id) {} + uint64_t end_ns, int64_t device_id, int64_t thread_id) { + if (anno.empty()) { + VLOG(1) << "Empty timeline annotation."; + return; + } + std::lock_guard l(trace_mu_); + cpu_records_.push_back( + CPURecord{anno, start_ns, end_ns, device_id, thread_id}); + } void AddMemRecords(const std::string &name, uint64_t start_ns, uint64_t end_ns, int64_t device_id, int64_t stream_id, - uint32_t correlation_id, uint64_t bytes) {} + uint32_t correlation_id, uint64_t bytes) { + // 0 means timestamp information could not be collected for the kernel. + if (start_ns == 0 || end_ns == 0) { + VLOG(3) << name << " cannot be traced"; + return; + } + std::lock_guard l(trace_mu_); + mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id, + stream_id, correlation_id, bytes}); + } void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, int64_t stream_id, uint32_t correlation_id) {} - bool IsEnabled() { return false; } + bool IsEnabled() { + std::lock_guard l(trace_mu_); + return enabled_; + } - void Enable() {} + void Enable() { + std::lock_guard l(trace_mu_); + if (enabled_) { + return; + } + int64_t start_ns_ = PosixInNsec(); + VLOG(3) << "start_ns_ = " << start_ns_; + enabled_ = true; + } + + void Disable() { + std::lock_guard l(trace_mu_); + uint64_t end_ns_ = PosixInNsec(); + VLOG(3) << "end_ns_ = " << end_ns_; + enabled_ = false; + } proto::Profile GenProfile(const std::string &profile_path) { - return proto::Profile(); + std::lock_guard l(trace_mu_); + proto::Profile profile_pb; + profile_pb.set_start_ns(start_ns_); + profile_pb.set_end_ns(end_ns_); + for (const KernelRecord &r : kernel_records_) { + if (correlations_.find(r.correlation_id) == correlations_.end()) { + fprintf(stderr, "cannot relate a kernel activity\n"); + continue; + } + auto *event = profile_pb.add_events(); + event->set_type(proto::Event::GPUKernel); + event->set_name(correlations_.at(r.correlation_id)); + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_sub_device_id(r.stream_id); + event->set_device_id(r.device_id); + } + + for (const CPURecord &r : cpu_records_) { + auto *event = profile_pb.add_events(); + event->set_type(proto::Event::CPU); + event->set_name(r.name); + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_sub_device_id(r.thread_id); + event->set_device_id(r.device_id); + } + for (const MemRecord &r : mem_records_) { + auto *event = profile_pb.add_events(); + event->set_type(proto::Event::GPUKernel); + event->set_name(r.name); + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_sub_device_id(r.stream_id); + event->set_device_id(r.device_id); + event->mutable_memcopy()->set_bytes(r.bytes); + } + std::ofstream profile_f; + profile_f.open(profile_path, std::ios::out | std::ios::trunc); + std::string profile_str; + profile_pb.SerializeToString(&profile_str); + profile_f << profile_str; + profile_f.close(); + return profile_pb; } - void Disable() {} + private: + std::mutex trace_mu_; + bool enabled_; + uint64_t start_ns_; + uint64_t end_ns_; + std::vector kernel_records_; + std::vector mem_records_; + std::vector cpu_records_; + std::unordered_map correlations_; }; void CreateTracer(DeviceTracer **t) { diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 0375c7439c..83bc909117 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include // NOLINT #include #include "paddle/fluid/platform/dynload/cupti.h" @@ -25,6 +28,12 @@ namespace platform { // WARN: Under Development. Don't depend on it yet. ////////////////////// +inline uint64_t PosixInNsec() { + struct timeval tv; + gettimeofday(&tv, nullptr); + return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); +} + // DeviceTracer performs the following tasks: // 1. Register cuda callbacks for various events: kernel, memcpy, etc. // 2. Collect cuda statistics: start/end ts, memory, etc. diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 01de9d7041..dc3f388f89 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include -#include #include #include #include @@ -97,12 +96,6 @@ inline uint64_t GetTimeInNsec() { .count(); } -inline uint64_t PosixInNsec() { - struct timeval tv; - gettimeofday(&tv, nullptr); - return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); -} - Event::Event(EventType type, std::string name, uint32_t thread_id, const DeviceContext* dev_ctx) : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) { diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 6a321ae024..a9f8a7f03b 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -218,7 +218,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): def profiler(state, sorted_key=None, profile_path='/tmp/profile'): """The profiler interface. Different from cuda_profiler, this profiler can be used to profile both CPU - and GPU program. By defalut, it records the CPU and GPU operator kernels, + and GPU program. By default, it records the CPU and GPU operator kernels, if you want to profile other program, you can refer the profiling tutorial to add more records in C++ code. @@ -231,7 +231,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): state (string) : The profiling state, which should be 'CPU' or 'GPU', telling the profiler to use CPU timer or GPU timer for profiling. Although users may have already specified the execution place - (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler + (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler would not inherit this place. sorted_key (string) : If None, the profiling results will be printed in the order of first end time of events. Otherwise, the profiling From e008600b081cb3038df256abf28f4d7a635e0613 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Fri, 10 Aug 2018 10:57:58 +0800 Subject: [PATCH 2/3] optimize code --- paddle/fluid/platform/device_tracer.cc | 137 ++----------------------- 1 file changed, 11 insertions(+), 126 deletions(-) diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 159b867c3f..e5425209f1 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -189,6 +189,8 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, } } // namespace +#endif // PADDLE_WITH_CUPTI + class DeviceTracerImpl : public DeviceTracer { public: DeviceTracerImpl() : enabled_(false) {} @@ -244,6 +246,8 @@ class DeviceTracerImpl : public DeviceTracer { if (enabled_) { return; } + +#ifdef PADDLE_WITH_CUPTI EnableActivity(); // Register callbacks for buffer requests and completed by CUPTI. @@ -262,6 +266,7 @@ class DeviceTracerImpl : public DeviceTracer { dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_)); +#endif // PADDLE_WITH_CUPTI enabled_ = true; } @@ -313,16 +318,20 @@ class DeviceTracerImpl : public DeviceTracer { } void Disable() { + std::lock_guard l(trace_mu_); +#ifdef PADDLE_WITH_CUPTI // flush might cause additional calls to DeviceTracker. dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED); - std::lock_guard l(trace_mu_); + DisableActivity(); dynload::cuptiUnsubscribe(subscriber_); CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_)); +#endif // PADDLE_WITH_CUPTI enabled_ = false; } private: +#ifdef PADDLE_WITH_CUPTI static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const void *cbdata) { auto *cbInfo = reinterpret_cast(cbdata); @@ -340,126 +349,8 @@ class DeviceTracerImpl : public DeviceTracer { VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid; } } - - std::mutex trace_mu_; - bool enabled_; - uint64_t start_ns_; - uint64_t end_ns_; - std::vector kernel_records_; - std::vector mem_records_; - std::vector cpu_records_; - std::unordered_map correlations_; CUpti_SubscriberHandle subscriber_; -}; - #endif // PADDLE_WITH_CUPTI - -class DeviceTracerDummy : public DeviceTracer { - public: - DeviceTracerDummy() {} - - void AddAnnotation(uint64_t id, const std::string &anno) { - std::lock_guard l(trace_mu_); - correlations_[id] = anno; - } - - void AddCPURecords(const std::string &anno, uint64_t start_ns, - uint64_t end_ns, int64_t device_id, int64_t thread_id) { - if (anno.empty()) { - VLOG(1) << "Empty timeline annotation."; - return; - } - std::lock_guard l(trace_mu_); - cpu_records_.push_back( - CPURecord{anno, start_ns, end_ns, device_id, thread_id}); - } - - void AddMemRecords(const std::string &name, uint64_t start_ns, - uint64_t end_ns, int64_t device_id, int64_t stream_id, - uint32_t correlation_id, uint64_t bytes) { - // 0 means timestamp information could not be collected for the kernel. - if (start_ns == 0 || end_ns == 0) { - VLOG(3) << name << " cannot be traced"; - return; - } - std::lock_guard l(trace_mu_); - mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id, - stream_id, correlation_id, bytes}); - } - - void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, - int64_t stream_id, uint32_t correlation_id) {} - - bool IsEnabled() { - std::lock_guard l(trace_mu_); - return enabled_; - } - - void Enable() { - std::lock_guard l(trace_mu_); - if (enabled_) { - return; - } - int64_t start_ns_ = PosixInNsec(); - VLOG(3) << "start_ns_ = " << start_ns_; - enabled_ = true; - } - - void Disable() { - std::lock_guard l(trace_mu_); - uint64_t end_ns_ = PosixInNsec(); - VLOG(3) << "end_ns_ = " << end_ns_; - enabled_ = false; - } - - proto::Profile GenProfile(const std::string &profile_path) { - std::lock_guard l(trace_mu_); - proto::Profile profile_pb; - profile_pb.set_start_ns(start_ns_); - profile_pb.set_end_ns(end_ns_); - for (const KernelRecord &r : kernel_records_) { - if (correlations_.find(r.correlation_id) == correlations_.end()) { - fprintf(stderr, "cannot relate a kernel activity\n"); - continue; - } - auto *event = profile_pb.add_events(); - event->set_type(proto::Event::GPUKernel); - event->set_name(correlations_.at(r.correlation_id)); - event->set_start_ns(r.start_ns); - event->set_end_ns(r.end_ns); - event->set_sub_device_id(r.stream_id); - event->set_device_id(r.device_id); - } - - for (const CPURecord &r : cpu_records_) { - auto *event = profile_pb.add_events(); - event->set_type(proto::Event::CPU); - event->set_name(r.name); - event->set_start_ns(r.start_ns); - event->set_end_ns(r.end_ns); - event->set_sub_device_id(r.thread_id); - event->set_device_id(r.device_id); - } - for (const MemRecord &r : mem_records_) { - auto *event = profile_pb.add_events(); - event->set_type(proto::Event::GPUKernel); - event->set_name(r.name); - event->set_start_ns(r.start_ns); - event->set_end_ns(r.end_ns); - event->set_sub_device_id(r.stream_id); - event->set_device_id(r.device_id); - event->mutable_memcopy()->set_bytes(r.bytes); - } - std::ofstream profile_f; - profile_f.open(profile_path, std::ios::out | std::ios::trunc); - std::string profile_str; - profile_pb.SerializeToString(&profile_str); - profile_f << profile_str; - profile_f.close(); - return profile_pb; - } - - private: std::mutex trace_mu_; bool enabled_; uint64_t start_ns_; @@ -470,13 +361,7 @@ class DeviceTracerDummy : public DeviceTracer { std::unordered_map correlations_; }; -void CreateTracer(DeviceTracer **t) { -#ifdef PADDLE_WITH_CUPTI - *t = new DeviceTracerImpl(); -#else - *t = new DeviceTracerDummy(); -#endif // PADDLE_WITH_CUPTI -} +void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); } DeviceTracer *GetDeviceTracer() { std::call_once(tracer_once_flag, CreateTracer, &tracer); From 5a6c3cd9e08fb4a39605b53bb4a8d2520bcbbb75 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Mon, 13 Aug 2018 11:02:18 +0800 Subject: [PATCH 3/3] fix profiler dead lock --- paddle/fluid/platform/device_tracer.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index e5425209f1..dc1d751141 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -318,11 +318,12 @@ class DeviceTracerImpl : public DeviceTracer { } void Disable() { - std::lock_guard l(trace_mu_); #ifdef PADDLE_WITH_CUPTI // flush might cause additional calls to DeviceTracker. dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED); - +#endif // PADDLE_WITH_CUPTI + std::lock_guard l(trace_mu_); +#ifdef PADDLE_WITH_CUPTI DisableActivity(); dynload::cuptiUnsubscribe(subscriber_); CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));