Add memory profiler (#16137)

test=develop
6 years ago · 0979956619
parent 05993c3ff3
commit 0979956619
9 changed files with 505 additions and 77 deletions
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
-cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator)
+cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 if (WITH_GPU)
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include <memory>
 #include <string>
 #include <utility>
@ -24,9 +22,11 @@
 #endif
 #include "glog/logging.h"
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/split.h"
@ -329,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
 }  // namespace legacy
 namespace allocation {
 LegacyMemMonitor GPUMemMonitor;
 // Allocates `size` bytes on this allocator's place through the legacy
 // allocation visitor, wraps the returned pointer in a heap-allocated
 // Allocation, and registers that Allocation with the memory-event recorder.
 // NOTE(review): `attr` is not read anywhere in this body.
 Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
  void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
-  return new Allocation(ptr, size, place_);
+  auto *tmp_alloc = new Allocation(ptr, size, place_);
  // The record is keyed by the address of the Allocation object itself (not
  // by `ptr`); Free() passes the same address to PopMemRecord.
  platform::MemEvenRecorder::Instance().PushMemRecord(
      static_cast<void *>(tmp_alloc), place_, size);
  return tmp_alloc;
 }
 // Returns the buffer owned by `allocation` to the legacy allocator, removes
 // the matching memory-event record, and destroys the Allocation object.
 void LegacyAllocator::Free(Allocation *allocation) {
  // Release the raw buffer on the place the allocation itself reports.
  const legacy::FreeVisitor release_buffer(allocation->ptr(),
                                           allocation->size());
  boost::apply_visitor(release_buffer, allocation->place());
  // The record was pushed under the Allocation object's address, so pop it
  // with that same address while the object is still alive.
  auto &recorder = platform::MemEvenRecorder::Instance();
  recorder.PopMemRecord(static_cast<void *>(allocation), place_);
  delete allocation;
 }
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/device_tracer.h"
 #include <deque>
 #include <forward_list>
@ -30,6 +29,8 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "google/protobuf/text_format.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/platform/device_tracer.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
 namespace paddle {
@ -317,6 +318,24 @@ class DeviceTracerImpl : public DeviceTracer {
                                      stream_id, correlation_id, bytes});
  }
  void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes,
                        const Place &place, const std::string &alloc_in,
                        const std::string &free_in, int64_t thread_id) {
    if (0 == start_ns || 0 == end_ns) {
      VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced.";
      return;
    }
    thread_local std::forward_list<MemInfoRecord> *local_mem_info_record =
        nullptr;
    if (local_mem_info_record == nullptr) {
      std::lock_guard<std::mutex> l(trace_mu_);
      mem_info_record_.emplace_front();
      local_mem_info_record = &mem_info_record_.front();
    }
    local_mem_info_record->emplace_front(MemInfoRecord{
        start_ns, end_ns, bytes, place, thread_id, alloc_in, free_in});
  }
  void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
                            uint64_t end_ns, int64_t device_id,
                            int64_t thread_id, uint32_t correlation_id) {
@ -409,6 +428,7 @@ class DeviceTracerImpl : public DeviceTracer {
    correlations_.clear();
    for (auto &tmp : correlations_pairs) tmp.clear();
    for (auto &tmp : cpu_records_) tmp.clear();
    for (auto &tmp : mem_info_record_) tmp.clear();
    for (auto &tmp : active_kind_records_) tmp.clear();
  }
@ -440,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer {
    proto::Profile profile_pb;
    profile_pb.set_start_ns(start_ns_);
    profile_pb.set_end_ns(end_ns_);
-    if (correlations_.empty())
+    if (correlations_.empty()) {
-      for (auto &tmp : correlations_pairs)
+      for (auto &tmp : correlations_pairs) {
        for (auto &pair : tmp) correlations_[pair.first] = pair.second;
      }
    }
    for (const KernelRecord &r : kernel_records_) {
      auto *event = profile_pb.add_events();
      event->set_type(proto::Event::GPUKernel);
@ -462,6 +485,7 @@ class DeviceTracerImpl : public DeviceTracer {
      event->set_device_id(r.device_id);
    }
    VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
    for (auto &tmp : cpu_records_) {
      for (const CPURecord &r : tmp) {
        auto *event = profile_pb.add_events();
@ -473,6 +497,7 @@ class DeviceTracerImpl : public DeviceTracer {
        event->set_device_id(r.device_id);
      }
    }
    for (auto &tmp : active_kind_records_) {
      for (const ActiveKindRecord &r : tmp) {
        auto *event = profile_pb.add_events();
@ -510,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer {
      event->mutable_memcopy()->set_bytes(r.bytes);
    }
    VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
    for (auto &tmp : mem_info_record_) {
      for (const auto &r : tmp) {
        auto *event = profile_pb.add_mem_events();
        event->set_device_id(0);
        if (platform::is_cpu_place(r.place)) {
          event->set_place(proto::MemEvent::CPUPlace);
        } else if (platform::is_gpu_place(r.place)) {
          event->set_place(proto::MemEvent::CUDAPlace);
          event->set_device_id(
              boost::get<platform::CUDAPlace>(r.place).GetDeviceId());
        } else if (platform::is_cuda_pinned_place(r.place)) {
          event->set_place(proto::MemEvent::CUDAPinnedPlace);
        } else {
          PADDLE_THROW("The current place is not supported.");
        }
        event->set_alloc_in(r.alloc_in);
        event->set_free_in(r.free_in);
        event->set_start_ns(r.start_ns);
        event->set_end_ns(r.end_ns);
        event->set_bytes(r.bytes);
        event->set_thread_id(r.thread_id);
      }
    }
    std::ofstream profile_f;
    profile_f.open(profile_path,
                   std::ios::out | std::ios::trunc | std::ios::binary);
@ -553,6 +603,7 @@ class DeviceTracerImpl : public DeviceTracer {
  std::forward_list<KernelRecord> kernel_records_;
  std::forward_list<MemRecord> mem_records_;
  std::forward_list<std::forward_list<CPURecord>> cpu_records_;
  std::forward_list<std::forward_list<MemInfoRecord>> mem_info_record_;
  std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
  std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
      correlations_pairs;
@ -575,7 +626,7 @@ Event *CurAnnotation() {
  return annotation_stack.back();
 }
 // Returns the name of the annotation on top of annotation_stack. When the
 // stack is empty the old line returned "" and the new line returns the
 // placeholder "Unknown".
 std::string CurAnnotationName() {
-  if (annotation_stack.empty()) return "";
+  if (annotation_stack.empty()) return "Unknown";
  return annotation_stack.back()->name();
 }
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cupti.h"
 #include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.pb.h"
@ -47,6 +48,7 @@ class DeviceTracer {
    int64_t stream_id;
    uint32_t correlation_id;
  };
  struct CPURecord {
    std::string name;
    uint64_t start_ns;
@ -54,6 +56,7 @@ class DeviceTracer {
    int64_t device_id;
    int64_t thread_id;
  };
  struct MemRecord {
    std::string name;
    uint64_t start_ns;
@ -63,6 +66,17 @@ class DeviceTracer {
    uint32_t correlation_id;
    uint64_t bytes;
  };
  // One memory record as stored by AddMemInfoRecord(). The field order is
  // relied on by the aggregate initialization in the tracer implementation,
  // so do not reorder members.
  struct MemInfoRecord {
    uint64_t start_ns;     // start timestamp; 0 is rejected by the tracer
    uint64_t end_ns;       // end timestamp; 0 is rejected by the tracer
    size_t bytes;          // size associated with the record
    Place place;           // place (CPU / CUDA / CUDA pinned) of the memory
    int64_t thread_id;     // thread id supplied by the caller
    std::string alloc_in;  // presumably the annotation active at allocation
    std::string free_in;   // presumably the annotation active at free
  };
  struct ActiveKindRecord {
    std::string name;
    uint64_t start_ns;
@ -71,6 +85,7 @@ class DeviceTracer {
    int64_t thread_id;
    uint32_t correlation_id;
  };
  virtual ~DeviceTracer() {}
  // Needs to be called once before use.
  virtual void Enable() = 0;
@ -97,6 +112,12 @@ class DeviceTracer {
                                    int64_t thread_id,
                                    uint32_t correlation_id) = 0;
  // Adds a memory record: time span [start_ns, end_ns], size in `bytes`, the
  // `place` of the memory, the `alloc_in` / `free_in` annotation strings and
  // the caller-supplied `thread_id`.
  virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns,
                                size_t bytes, const Place& place,
                                const std::string& alloc_in,
                                const std::string& free_in,
                                int64_t thread_id) = 0;
  // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
  // added before for human readability.
  virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
--- a/paddle/fluid/platform/event.h
+++ b/paddle/fluid/platform/event.h
@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <string>
 #ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
 #endif
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
 namespace platform {
@ -64,5 +66,36 @@ class Event {
 #endif
 #endif
 };
 // Value object describing a single memory event. Every field is fixed at
 // construction time and exposed through a read-only accessor.
 class MemEvent {
  public:
  // Captures the event kind, its [start_ns, end_ns] time span, the size in
  // bytes, the place and thread it belongs to, and a free-form annotation.
  MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes,
           Place place, int64_t thread_id, const std::string& annotation)
      : type_(type),
        start_ns_(start_ns),
        end_ns_(end_ns),
        bytes_(bytes),
        place_(place),
        thread_id_(thread_id),
        annotation_(annotation) {}

  // Event kind and annotation text.
  const EventType& type() const { return type_; }
  const std::string& annotation() const { return annotation_; }

  // Time span of the event.
  uint64_t start_ns() const { return start_ns_; }
  uint64_t end_ns() const { return end_ns_; }

  // Size and location of the memory, plus the thread recorded for it.
  size_t bytes() const { return bytes_; }
  Place place() const { return place_; }
  int64_t thread_id() const { return thread_id_; }

  private:
  // Declaration order matches the constructor's initializer list.
  EventType type_;
  uint64_t start_ns_ = 0;
  uint64_t end_ns_ = 0;
  size_t bytes_;
  Place place_;
  int64_t thread_id_;
  std::string annotation_;
 };
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@ -15,10 +15,17 @@ limitations under the License. */
 #pragma once
 #include <forward_list>
 #include <list>
 #include <map>
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
@ -34,8 +41,41 @@ enum ProfilerState {
 void Mark(const std::string& name);
-Event* PushEvent(const std::string& name);
+void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
                  const Place& place);
 void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
                 const Place& place);
 // Process-wide recorder for memory events, accessed through Instance().
 // The method bodies are defined outside this header; the notes below that
 // describe their behaviour are assumptions to confirm against profiler.cc.
 struct MemEvenRecorder {
  public:
  // Presumably starts tracking the object at `ptr` (of `size` bytes on
  // `place`) — TODO confirm against the definition.
  void PushMemRecord(const void* ptr, const Place& place, size_t size);
  // Presumably ends tracking of the object previously pushed under `ptr` on
  // `place` — TODO confirm against the definition.
  void PopMemRecord(const void* ptr, const Place& place);
  // NOTE(review): semantics not visible here; the name suggests dropping or
  // finalizing all outstanding records.
  void Flush();
  // Returns the single static recorder instance.
  static MemEvenRecorder& Instance() { return recorder; }
  private:
  // Per-record state: place, size, time span and the alloc/free annotation
  // strings. Constructor and destructor are defined out of line.
  struct RecordMemEvent {
    RecordMemEvent(const Place& place, size_t bytes);
    ~RecordMemEvent();
    Place place_;
    size_t bytes_;
    uint64_t start_ns_;
    uint64_t end_ns_;
    std::string alloc_in_;
    std::string free_in_;
  };
  // The one instance handed out by Instance(); defined in a source file.
  static MemEvenRecorder recorder;
  // Outstanding records, grouped by place and keyed by the pointer given to
  // PushMemRecord.
  std::map<Place,
           std::unordered_map<const void*, std::unique_ptr<RecordMemEvent>>>
      address_memevent_;
  // NOTE(review): presumably guards address_memevent_ — confirm in profiler.cc.
  std::mutex mtx_;
  // Private constructor: instances are only reachable via Instance().
  MemEvenRecorder() {}
  DISABLE_COPY_AND_ASSIGN(MemEvenRecorder);
 };
 Event* PushEvent(const std::string& name);
 void PopEvent(const std::string& name);
 struct RecordEvent {
@ -87,6 +127,41 @@ enum EventSortingKey {
  kGPUTime
 };
 // Append-only storage for events of type T, organised as a list of
 // fixed-capacity blocks.
 //
 // Every block is a std::vector<T> whose capacity is reserved up front to
 // kNumBlock elements and never exceeded, so a block never reallocates and
 // the pointers handed out by Record() stay valid until Reduce() or Clear()
 // is called.
 template <typename T>
 struct EventList {
  constexpr static size_t kMB = 1024 * 1024;
  // Memory budget of a single block.
  constexpr static size_t kEventBlockSize = 16 * kMB;
  constexpr static size_t kEventSize = sizeof(T);
  constexpr static size_t kEventAlign = alignof(T);
  // Number of events per block: the block budget divided by the size of T
  // rounded up to a multiple of its alignment.
  constexpr static size_t kNumBlock =
      kEventBlockSize /
      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);

  // Constructs a new event in place from `args` and returns a pointer to it.
  // A fresh block is opened when there is none yet or the newest one is full.
  template <typename... Args>
  T* Record(Args&&... args) {
    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
      event_blocks.emplace_front();
      event_blocks.front().reserve(kNumBlock);
    }
    event_blocks.front().emplace_back(std::forward<Args>(args)...);
    return &event_blocks.front().back();
  }

  // Moves every recorded event into one vector, in recording order, and
  // leaves the list empty.
  std::vector<T> Reduce() {
    size_t total = 0;
    for (const auto& block : event_blocks) {
      total += block.size();
    }

    std::vector<T> result;
    result.reserve(total);

    // Blocks are stored newest-first. Reversing the list once lets a single
    // forward pass append them oldest-first, instead of inserting each block
    // at the front of `result` and shifting everything collected so far.
    event_blocks.reverse();
    for (auto& block : event_blocks) {
      result.insert(result.end(), std::make_move_iterator(block.begin()),
                    std::make_move_iterator(block.end()));
    }
    event_blocks.clear();
    return result;
  }

  // Discards all recorded events.
  void Clear() { event_blocks.clear(); }

  // Blocks of events; the front block is the most recently opened one.
  std::forward_list<std::vector<T>> event_blocks;
 };
 // Enable the profiling function.
 void EnableProfiler(ProfilerState state);
--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@ -34,8 +34,25 @@ message Event {
  optional string detail_info = 9;
 }
 // One memory record exported by the device tracer.
 message MemEvent {
  // Kind of place the memory belongs to.
  enum Place {
    CUDAPlace = 0;
    CPUPlace = 1;
    CUDAPinnedPlace = 2;
  }
  // Start and end timestamps of the record. The timeline tool adds `bytes`
  // to the usage counter at start_ns and subtracts it at end_ns.
  optional uint64 start_ns = 1;
  optional uint64 end_ns = 2;
  // Size of the memory in bytes.
  optional uint64 bytes = 3;
  optional Place place = 4;
  // NOTE(review): the tracer stores thread ids as int64_t; confirm negative
  // ids cannot occur before relying on this unsigned field.
  optional uint64 thread_id = 5;
  // CUDA device id for CUDAPlace; the tracer writes 0 for the other places.
  optional uint32 device_id = 6;
  // Annotation strings recorded with the allocation and the free.
  optional string alloc_in = 7;
  optional string free_in = 8;
 }
 // Top-level profile written by the device tracer.
 message Profile {
  // Kernel / CPU / memcpy style events.
  repeated Event events = 1;
  // Start and end timestamps of the profile.
  optional uint64 start_ns = 2;
  optional uint64 end_ns = 3;
  // Memory records; consumed by the timeline tool to draw usage counters.
  repeated MemEvent mem_events = 4;
 }
--- a/tools/timeline.py
+++ b/tools/timeline.py
@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object):
        event['args'] = args
        self._events.append(event)
    def emit_counter(self, category, name, pid, timestamp, counter, value):
        """Emits a record for a single counter.
        Args:
            category: The event category as string
            name: The event name as string
            pid: Identifier of the process generating this event as integer
            timestamp: The timestamps of this event as long integer
            counter: Name of the counter as string
            value: Value of the counter as integer
            tid: Thread id of the allocation as integer
        """
        event = self._create_event('C', category, name, pid, 0, timestamp)
        event['args'] = {counter: value}
        self._events.append(event)
    def format_to_string(self, pretty=False):
        """Formats the chrome trace to a string.
@ -117,6 +133,7 @@ class Timeline(object):
        self._profile_dict = profile_dict
        self._pid = 0
        self._devices = dict()
        self._mem_devices = dict()
        self._chrome_trace = _ChromeTraceFormatter()
    def _allocate_pid(self):
@ -143,6 +160,45 @@ class Timeline(object):
                        self._devices[(k, event.device_id, "GPUKernel")] = pid
                        self._chrome_trace.emit_pid("%s:gpu:%d" %
                                                    (k, event.device_id), pid)
            for mevent in profile_pb.mem_events:
                if mevent.place == profiler_pb2.MemEvent.CUDAPlace:
                    if (k, mevent.device_id, "GPU") not in self._mem_devices:
                        pid = self._allocate_pid()
                        self._mem_devices[(k, mevent.device_id, "GPU")] = pid
                        self._chrome_trace.emit_pid(
                            "memory usage on %s:gpu:%d" % (k, mevent.device_id),
                            pid)
                elif mevent.place == profiler_pb2.MemEvent.CPUPlace:
                    if (k, mevent.device_id, "CPU") not in self._mem_devices:
                        pid = self._allocate_pid()
                        self._mem_devices[(k, mevent.device_id, "CPU")] = pid
                        self._chrome_trace.emit_pid(
                            "memory usage on %s:cpu:%d" % (k, mevent.device_id),
                            pid)
                elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
                    if (k, mevent.device_id, "CUDAPinnedPlace"
                        ) not in self._mem_devices:
                        pid = self._allocate_pid()
                        self._mem_devices[(k, mevent.device_id,
                                           "CUDAPinnedPlace")] = pid
                        self._chrome_trace.emit_pid(
                            "memory usage on %s:cudapinnedplace:%d" %
                            (k, mevent.device_id), pid)
                if (k, 0, "CPU") not in self._mem_devices:
                    pid = self._allocate_pid()
                    self._mem_devices[(k, 0, "CPU")] = pid
                    self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" %
                                                (k, 0), pid)
                if (k, 0, "GPU") not in self._mem_devices:
                    pid = self._allocate_pid()
                    self._mem_devices[(k, 0, "GPU")] = pid
                    self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" %
                                                (k, 0), pid)
                if (k, 0, "CUDAPinnedPlace") not in self._mem_devices:
                    pid = self._allocate_pid()
                    self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
                    self._chrome_trace.emit_pid(
                        "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
    def _allocate_events(self):
        for k, profile_pb in six.iteritems(self._profile_dict):
@ -163,9 +219,57 @@ class Timeline(object):
                    event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
                    event.sub_device_id, 'Op', event.name, args)
    def _allocate_memory_event(self):
        place_to_str = {
            profiler_pb2.MemEvent.CPUPlace: "CPU",
            profiler_pb2.MemEvent.CUDAPlace: "GPU",
            profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
        }
        for k, profile_pb in six.iteritems(self._profile_dict):
            mem_list = []
            end_profiler = 0
            for mevent in profile_pb.mem_events:
                crt_info = dict()
                crt_info['time'] = mevent.start_ns
                crt_info['size'] = mevent.bytes
                if mevent.place in place_to_str:
                    place = place_to_str[mevent.place]
                else:
                    place = "UnDefine"
                crt_info['place'] = place
                pid = self._mem_devices[(k, mevent.device_id, place)]
                crt_info['pid'] = pid
                crt_info['thread_id'] = mevent.thread_id
                crt_info['device_id'] = mevent.device_id
                mem_list.append(crt_info)
                crt_info = dict()
                crt_info['place'] = place
                crt_info['pid'] = pid
                crt_info['thread_id'] = mevent.thread_id
                crt_info['device_id'] = mevent.device_id
                crt_info['time'] = mevent.end_ns
                crt_info['size'] = -mevent.bytes
                mem_list.append(crt_info)
                end_profiler = max(end_profiler, crt_info['time'])
            mem_list.sort(key=lambda tmp: (tmp.get('time', 0)))
            i = 0
            total_size = 0
            while i < len(mem_list):
                total_size += mem_list[i]['size']
                while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[
                        i + 1]['time']:
                    total_size += mem_list[i + 1]['size']
                    i += 1
                self._chrome_trace.emit_counter(
                    "Memory", "Memory", mem_list[i]['pid'], mem_list[i]['time'],
                    0, total_size)
                i += 1
    def generate_chrome_trace(self):
        self._allocate_pids()
        self._allocate_events()
        self._allocate_memory_event()
        return self._chrome_trace.format_to_string()