add support for nested profiling events and printing at different levels (#22061)

release/1.7
wangchaochaohu authored 6 years ago; committed by GitHub
parent 019e20195c
commit c3876cf82d

@@ -166,16 +166,11 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #endif
     }
-    // The profile has a process-wide mutex, results in serious performance
-    // issue
-    // in concurrency scenerio. Here use an `if` to fix this issue.
-    // Please not remove the `if`, ask @Superjomn if there are any concern.
-    if (platform::IsProfileEnabled()) {
-      platform::RecordEvent record_event(Type());
-      RunImpl(scope, place);
-    } else {
+    {
+      platform::RecordEvent record_event(Type() + "_op");
       RunImpl(scope, place);
     }
     VLOG(3) << place << " " << DebugStringEx(&scope);
   } catch (platform::EnforceNotMet& exception) {
     framework::InsertCallStackInfo(Type(), Attrs(), &exception);
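
This hunk drops the `if (platform::IsProfileEnabled())` branch and always wraps RunImpl in a scoped RecordEvent named with an "_op" suffix. The change relies on the RAII idiom: the guard opens an event in its constructor and closes it in its destructor, so the bare { } block scopes profiling to exactly the RunImpl call. A minimal, self-contained sketch of that idiom, with illustrative names only (ScopedTimer is not Paddle's class, and RecordEvent's real implementation is not in this diff):

#include <chrono>
#include <string>

// Toy RAII guard: records a span from construction to destruction.
class ScopedTimer {
 public:
  explicit ScopedTimer(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedTimer() {
    auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    // A real implementation would report (name_, ns) to the profiler here.
    (void)ns;
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};
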
@@ -953,9 +948,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // do data transformScope &transfer_scope;
   std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope =
-      PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);
+  Scope* transfer_scope = nullptr;
+  {
+    platform::RecordEvent record_event("prepare_data");
+    transfer_scope = PrepareData(scope, *kernel_type_, &transfered_inplace_vars,
+                                 runtime_ctx);
+  }
   // exec scope is the scope that kernel actually executed on.
   const Scope& exec_scope =
       (transfer_scope == nullptr ? scope : *transfer_scope);
@@ -965,6 +963,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
   if (!all_kernels_must_compute_runtime_shape_) {
+    platform::RecordEvent record_event("infer_shape");
     RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
     this->InferShape(&infer_shape_ctx);
   }
@@ -975,8 +974,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
   // not Scope. Imperative mode only pass inputs and get outputs.
-  (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
-                                   kernel_configs));
+  {
+    platform::RecordEvent record_event("compute");
+    (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
+                                     kernel_configs));
+  }
   if (!transfered_inplace_vars.empty()) {
     // there is inplace variable has been transfered.
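
Taken together, the three RunImpl hunks give every kernel-based operator three child events ("prepare_data", "infer_shape", "compute") nested under the op-level event opened in OperatorBase::Run(). A hedged illustration of the resulting nesting, using only the platform::RecordEvent API shown in the hunks; "conv2d" is just an example op, and the slash-composed names assume profiling is enabled and follow the SetCurAnnotation change further down:

{
  platform::RecordEvent op_event("conv2d_op");  // opened in OperatorBase::Run
  {
    platform::RecordEvent e("prepare_data");    // reported as conv2d_op/prepare_data
  }
  {
    platform::RecordEvent e("infer_shape");     // reported as conv2d_op/infer_shape
  }
  {
    platform::RecordEvent e("compute");         // reported as conv2d_op/compute
  }
}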

@@ -446,6 +446,11 @@ class DeviceTracerImpl : public DeviceTracer {
     auto c = correlations_.find(r.correlation_id);
     if (c != correlations_.end() && c->second != nullptr) {
       Event *e = c->second;
+      Event *parent = e->parent();
+      while (parent) {
+        parent->AddCudaElapsedTime(r.start_ns, r.end_ns);
+        parent = parent->parent();
+      }
       e->AddCudaElapsedTime(r.start_ns, r.end_ns);
     }
   }
@@ -453,6 +458,11 @@ class DeviceTracerImpl : public DeviceTracer {
     auto c = correlations_.find(r.correlation_id);
     if (c != correlations_.end() && c->second != nullptr) {
       Event *e = c->second;
+      Event *parent = e->parent();
+      while (parent) {
+        parent->AddCudaElapsedTime(r.start_ns, r.end_ns);
+        parent = parent->parent();
+      }
       e->AddCudaElapsedTime(r.start_ns, r.end_ns);
     }
   }
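
Both device-tracer hunks add the same ancestor walk, once for each kind of correlated record: the CUDA span is credited to the event matched by the CUPTI correlation id and to every ancestor, so each level of the nested report accumulates its children's GPU time. A self-contained sketch of the walk; this toy Event is a stand-in with just enough members to compile, not Paddle's class:

#include <cstdint>

struct Event {
  Event *parent{nullptr};
  uint64_t gpu_ns{0};
  void AddCudaElapsedTime(uint64_t start_ns, uint64_t end_ns) {
    gpu_ns += end_ns - start_ns;
  }
};

// Credit the span to the event itself and to every ancestor.
void CreditEventAndAncestors(Event *e, uint64_t start_ns, uint64_t end_ns) {
  for (Event *p = e->parent; p != nullptr; p = p->parent) {
    p->AddCudaElapsedTime(start_ns, end_ns);
  }
  e->AddCudaElapsedTime(start_ns, end_ns);
}
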
@@ -622,7 +632,13 @@ DeviceTracer *GetDeviceTracer() {
   return tracer;
 }
-void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); }
+void SetCurAnnotation(Event *event) {
+  if (!annotation_stack.empty()) {
+    event->set_parent(annotation_stack.back());
+    event->set_name(annotation_stack.back()->name() + "/" + event->name());
+  }
+  annotation_stack.push_back(event);
+}
 void ClearCurAnnotation() { annotation_stack.pop_back(); }
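
This hunk is the heart of the nesting support: when an event starts while another is active, it records the active event as its parent and prefixes the parent's name, producing slash-separated hierarchical names. A self-contained sketch of the same logic; in Paddle, annotation_stack is a thread-local container of Event pointers, and the toy Event below is illustrative:

#include <string>
#include <vector>

struct Event {
  std::string name;
  Event *parent{nullptr};
};

static std::vector<Event *> annotation_stack;

void SetCurAnnotation(Event *event) {
  if (!annotation_stack.empty()) {
    // Link to the enclosing event and compose a hierarchical name, e.g.
    // pushing "compute" inside "conv2d_op" yields "conv2d_op/compute".
    event->parent = annotation_stack.back();
    event->name = annotation_stack.back()->name + "/" + event->name;
  }
  annotation_stack.push_back(event);
}

void ClearCurAnnotation() { annotation_stack.pop_back(); }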

@@ -32,8 +32,11 @@ class Event {
   Event(EventType type, std::string name, uint32_t thread_id);
   const EventType& type() const;
+  Event* parent() const { return parent_; }
+  void set_parent(Event* parent) { parent_ = parent; }
   std::string name() const { return name_; }
   uint32_t thread_id() const { return thread_id_; }
+  void set_name(std::string name) { name_ = name; }
 #ifdef PADDLE_WITH_CUDA
 #ifndef PADDLE_WITH_CUPTI
@@ -47,9 +50,11 @@ class Event {
  private:
   EventType type_;
-  std::string name_;
+  std::string name_{};
+  Event* parent_{nullptr};
   uint32_t thread_id_;
   int64_t cpu_ns_;
+  bool visited_status_{false};
 #ifdef PADDLE_WITH_CUDA
 #ifdef PADDLE_WITH_CUPTI
   int64_t gpu_ns_ = 0;
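
For reference, a simplified consolidation of what the two Event hunks add: the parent link and name setter enable the hierarchy, while the new visited flag is presumably consumed by the profiler-printing changes in the suppressed diff below (that use is an assumption, since those changes are not shown here). Other members are elided:

#include <string>

class Event {
 public:
  Event *parent() const { return parent_; }
  void set_parent(Event *parent) { parent_ = parent; }
  std::string name() const { return name_; }
  void set_name(std::string name) { name_ = name; }

 private:
  std::string name_{};
  Event *parent_{nullptr};      // new: link to the enclosing event
  bool visited_status_{false};  // new: presumably for traversing the event tree
};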

File diff suppressed because it is too large