!82 profiling feature enhancement

Merge pull request !82 from caifubi/dev-profiling
pull/82/MERGE
mindspore-ci-bot 5 years ago committed by Gitee
commit 2eb71103f9

@ -702,7 +702,7 @@ void AscendStreamAssign::PrintGraphExeOrders(const shared_ptr<mindspore::session
<< AnfAlgo::GetStreamId(cur_cnode_ptr) << "], event_id["
<< GetValue<uint32_t>(primitive->GetAttr(kAttrEventId)) << "]";
} else {
MS_LOG(INFO) << "node name[" << AnfAlgo::GetCNodeName(cur_cnode_ptr) << "], logic id["
MS_LOG(INFO) << "node name[" << cur_cnode_ptr->fullname_with_scope() << "], logic id["
<< AnfAlgo::GetStreamDistinctionLabel(cur_cnode_ptr.get()) << "], stream id["
<< AnfAlgo::GetStreamId(cur_cnode_ptr) << "]";
}

@ -29,10 +29,6 @@ namespace ascend {
// Log ids attached to profiling trace points.
// PROFILING_CUSTOM_LOGID_START 3
// NOTE(review): presumably ids 1 and 2 are reserved for the fp-start / bp-end trace points and custom
// trace points start at 3 - confirm against the profiling implementation.
const uint64_t kProfilingFpStartLogId = 1;
const uint64_t kProfilingBpEndLogId = 2;
// Start/end log ids for two AllReduce trace points.
const uint64_t kProfilingAllReduce1Start = 3;
const uint64_t kProfilingAllReduce1End = 4;
const uint64_t kProfilingAllReduce2Start = 5;
const uint64_t kProfilingAllReduce2End = 6;
// Log id for the iteration-end trace point.
const uint64_t kProfilingIterEndLogId = 255;
class ProfilingEngineImpl;

File diff suppressed because it is too large Load Diff

@ -19,63 +19,102 @@
#include <memory>
#include <string>
#include <vector>
#include <set>
#include <unordered_map>
#include "session/kernel_graph.h"
#include "utils/contract.h"
namespace mindspore {
namespace device {
namespace ascend {
// Names of the graph nodes at which profiling trace points are inserted.
// NOTE(review): this listing carries two parallel sets of members (profiling_trace_* and trace_*) and two
// IsValid() definitions; it looks like pre- and post-change lines shown together - confirm which set is current.
struct ProfilingTraceInfo {
// First op in the execution order (e.g. Cast or Four2Five ...), excluding tdt ops (GetNext ...).
std::string profiling_trace_begin;
std::string trace_begin;
// First net_output (apply kernel) taken from the graph outputs: fp ->net_output<- bp
std::string profiling_trace_bp_end;
std::string trace_bp_end;
// Last op in the execution order (e.g. Conv2DBackpropFilter).
std::string profiling_trace_netoutput;
std::string trace_netoutput;
// Node names marking the start/end of two AllReduce trace points.
std::string profiling_allreduce1_start;
std::string profiling_allreduce1_end;
std::string profiling_allreduce2_start;
std::string profiling_allreduce2_end;
// Names of specific ops to profile, such as AllReduce.
std::set<std::string> trace_custom_node;
// 1. insert profiling_trace_begin if profiling_trace_bp_end is not empty.
// 2. op launch gets task info with a callback func.
// 3. insert profiling_trace_bp_end.
// 4. insert profiling_trace_net_output if profiling_trace_bp_end is not empty.
// Valid only when both profiling_trace_begin and profiling_trace_bp_end are non-empty.
bool IsValid() const { return !(profiling_trace_begin.empty() || profiling_trace_bp_end.empty()); }
// Valid only when trace_begin, trace_bp_end and trace_netoutput are all non-empty.
bool IsValid() const { return !(trace_begin.empty() || trace_bp_end.empty() || trace_netoutput.empty()); }
};
// Values describing a single profiling node.
struct ProfilingContent {
// true - send data from device to host and finish profiling
bool notify;
// NOTE(review): presumably the log id of the trace point (e.g. 1 for fp_start) - confirm in the implementation.
uint64_t profiler_trace_id;
// NOTE(review): meaning of the flag bits is not visible here - confirm in the implementation.
uint32_t flags;
};
// Static helpers for inserting profiling trace-point nodes into a kernel list and for mapping
// kernel names to task ids.
// NOTE(review): several functions are declared twice here with different signatures
// (shared_ptr<KernelGraph> vs NotNull<KernelGraph *>), and the k* string constants are both declared and
// defined inline; this looks like pre- and post-change lines shown together - confirm which set is current.
class ProfilingUtils {
public:
ProfilingUtils() = default;
~ProfilingUtils() = default;
// Fills *profiling_trace_info for the given graph; returns a bool status.
// NOTE(review): presumably true means the trace info was found - confirm in the implementation.
static bool GetProfilingTraceInfo(const std::shared_ptr<session::KernelGraph> &graph_ptr,
ProfilingTraceInfo *profiling_trace_info);
static void ProfilingTraceFpStart(const std::shared_ptr<session::KernelGraph> &graph_ptr, const AnfNodePtr &anf_node,
const ProfilingTraceInfo &profiling_trace_info, std::vector<CNodePtr> *kernel_list);
static void ProfilingAllReduce(const std::shared_ptr<session::KernelGraph> &graph_ptr, const AnfNodePtr &anf_node,
int job_id, const std::string &profiling_node_name,
std::vector<CNodePtr> *kernel_list);
static void ProfilingTraceEnd(const std::shared_ptr<session::KernelGraph> &graph_ptr, const AnfNodePtr &anf_node,
const ProfilingTraceInfo &profiling_trace_info, std::vector<CNodePtr> *kernel_list);
// Insert the job_id profiling node and the fp_start profiling node.
// Job_id is read from envs and should be a number greater than 255.
// The fp_start node should be inserted at the start of a network, and its log_id is hard-coded to 1.
static void ProfilingTraceFpStart(const AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info,
NotNull<session::KernelGraph *> graph_ptr,
NotNull<std::vector<CNodePtr> *> kernel_list);
// Insert the net output profiling node, which tells the device to stop profiling.
// The notify in struct ProfilingContent should be 'true', which tells the device to send data to host.
static void ProfilingTraceEnd(const AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info,
NotNull<session::KernelGraph *> graph_ptr,
NotNull<std::vector<CNodePtr> *> kernel_list);
// Insert the bp_end profiling node, which should be inserted after the last backpropagation CNode in the network.
static void ProfilingTraceBpEnd(const mindspore::AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info,
NotNull<session::KernelGraph *> graph_ptr,
NotNull<std::vector<mindspore::CNodePtr> *> kernel_list);
// Map a graph id to the names of the kernels in that graph.
static void SetGraphKernelName(uint32_t graph_id, const std::vector<std::string> &kernel_names);
// Map task_id to kernel name so the device can generate the time cost of a specific kernel.
// The device calculates the time cost of the task identified by its task id,
// but we need the data as (kernel name, time cost).
static void ReportProfilingData(uint32_t graph_id, const std::vector<uint32_t> &task_ids);
// String constants declared here without values.
// NOTE(review): presumably the profiling op name and its attribute keys - confirm at the use sites.
static const char kProfiling[];
static const char kNotify[];
static const char kProfilerTraceId[];
static const char kFlags[];
// Get profiling trace points from envs.
// export PROFILING_FP_START='full name of the first cnode to execute'
// export PROFILING_BP_END='full name of the last backpropagation cnode to execute'
// export PROFILING_ITER_END='full name of last cnode in graph to execute'
// And for other cnodes, like AllReduce: export PROFILING_CUSTOM_1='full name of AllReduce cnode'
// GetNext: export PROFILING_CUSTOM_2='full name of GetNext cnode'
// The variable i in PROFILING_CUSTOM_i should start from 1 without interruption.
static ProfilingTraceInfo GetProfilingTraceFromEnv(NotNull<session::KernelGraph *> graph_ptr);
// Insert two profiling trace points, one in front of the node and one behind it.
static void ProfilingCustomOp(const mindspore::AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info,
NotNull<session::KernelGraph *> graph_ptr,
NotNull<std::vector<mindspore::CNodePtr> *> kernel_list);
// The same four string constants, defined inline with their values.
inline static constexpr char kProfiling[] = "Profiling";
inline static constexpr char kNotify[] = "notify";
inline static constexpr char kProfilerTraceId[] = "profiler_trace_id";
inline static constexpr char kFlags[] = "flags";
private:
// NOTE(review): presumably writes the net output node name into *profiling_trace_net_output and returns
// whether one was found - confirm in the implementation.
static bool GetNetOutput(AnfNodePtr anf_node, std::string *profiling_trace_net_output);
// Create a profiling CNode in the given graph from notify / profiler_trace_id / flags.
static CNodePtr CreateProfilingCNode(const std::shared_ptr<session::KernelGraph> &graph_ptr, bool notify,
uint64_t profiler_trace_id, uint32_t flags);
// Create a profiling CNode in the given graph from a ProfilingContent; the result is wrapped in NotNull.
static NotNull<CNodePtr> CreateProfilingCNode(const ProfilingContent &profiling_content,
NotNull<session::KernelGraph *> graph_ptr);
// NOTE(review): presumably creates a profiling CNode and gives it the stream of anf_node - confirm in the
// implementation.
static CNodePtr CreateProfilingCNodeWithStream(const AnfNodePtr &anf_node, const ProfilingContent &profiling_content,
NotNull<session::KernelGraph *> graph_ptr);
// Helpers returning the node names for the begin / bp_end / netoutput trace points.
static std::string GetTraceBegin(const std::vector<CNodePtr> &cnode_exec_order);
static std::string GetTraceBpEnd();
static std::string GetTraceNetoutput(const std::vector<CNodePtr> &cnode_exec_order);
// graph id --> (kernel name list)
static std::unordered_map<uint32_t, std::vector<std::string>> graph_kernel_name_;
// NOTE(review): presumably the running index used for custom trace points (PROFILING_CUSTOM_i) - confirm in
// the implementation.
static uint32_t custom_node_index_;
};
} // namespace ascend
} // namespace device

@ -438,23 +438,22 @@ void KernelAdjust::LoadSwitchInputs(std::vector<tensor::TensorPtr> *inputs) {
MS_LOG(INFO) << "---------------- LoadSwitchInputs End--";
}
void KernelAdjust::Profiling(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
void KernelAdjust::Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr) {
if (!ascend::ProfilingManager::GetInstance().IsProfiling()) {
MS_LOG(INFO) << "No need to profiling";
return;
}
ProfilingTraceInfo profiling_trace_info;
if (ProfilingUtils::GetProfilingTraceInfo(kernel_graph_ptr, &profiling_trace_info)) {
InsertProfilingKernel(kernel_graph_ptr, profiling_trace_info);
} else {
MS_LOG(WARNING) << "[profiling] GetProfilingTraceInfo failed";
ProfilingTraceInfo profiling_trace_info = ProfilingUtils::GetProfilingTraceFromEnv(kernel_graph_ptr);
if (!profiling_trace_info.IsValid()) {
MS_LOG(WARNING) << "[profiling] no profiling node found!";
return;
}
InsertProfilingKernel(profiling_trace_info, kernel_graph_ptr);
}
void KernelAdjust::InsertProfilingKernel(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const ProfilingTraceInfo &profiling_trace_info) {
void KernelAdjust::InsertProfilingKernel(const ProfilingTraceInfo &profiling_trace_info,
NotNull<session::KernelGraph *> kernel_graph_ptr) {
MS_LOG(INFO) << "[profiling] Insert profiling kernel start";
MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
if (!profiling_trace_info.IsValid()) {
MS_LOG(WARNING) << "Profiling trace point not found";
return;
@ -462,18 +461,12 @@ void KernelAdjust::InsertProfilingKernel(const std::shared_ptr<session::KernelGr
std::vector<CNodePtr> new_cnode_list;
std::vector<CNodePtr> cnode_ptr_list = kernel_graph_ptr->execution_order();
for (const auto &cnode_ptr : cnode_ptr_list) {
ProfilingUtils::ProfilingTraceFpStart(kernel_graph_ptr, cnode_ptr, profiling_trace_info, &new_cnode_list);
ProfilingUtils::ProfilingAllReduce(kernel_graph_ptr, cnode_ptr, ascend::kProfilingAllReduce1Start,
profiling_trace_info.profiling_allreduce1_start, &new_cnode_list);
ProfilingUtils::ProfilingAllReduce(kernel_graph_ptr, cnode_ptr, ascend::kProfilingAllReduce2Start,
profiling_trace_info.profiling_allreduce2_start, &new_cnode_list);
ProfilingUtils::ProfilingTraceFpStart(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list));
new_cnode_list.emplace_back(cnode_ptr);
ProfilingUtils::ProfilingAllReduce(kernel_graph_ptr, cnode_ptr, ascend::kProfilingAllReduce1End,
profiling_trace_info.profiling_allreduce1_end, &new_cnode_list);
ProfilingUtils::ProfilingAllReduce(kernel_graph_ptr, cnode_ptr, ascend::kProfilingAllReduce2End,
profiling_trace_info.profiling_allreduce2_end, &new_cnode_list);
ProfilingUtils::ProfilingTraceEnd(kernel_graph_ptr, cnode_ptr, profiling_trace_info, &new_cnode_list);
ProfilingUtils::ProfilingCustomOp(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list));
ProfilingUtils::ProfilingTraceBpEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list));
ProfilingUtils::ProfilingTraceEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list));
}
kernel_graph_ptr->set_execution_order(new_cnode_list);
}

@ -48,7 +48,7 @@ class KernelAdjust {
void SetStreamSwitchOps(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr);
bool StepLoadCtrlInputs(const std::shared_ptr<session::Context> &context,
const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr);
void Profiling(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr);
void Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr);
static bool NeedInsertSwitch();
CNodePtr CreateSteamActiveOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr);
@ -66,8 +66,8 @@ class KernelAdjust {
kernel::KernelBuildInfo::KernelBuildInfoBuilder CreateMngKernelBuilder(const std::vector<std::string> &formats,
const std::vector<TypeId> &type_ids);
void LoadSwitchInputs(std::vector<tensor::TensorPtr> *inputs);
void InsertProfilingKernel(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
const ProfilingTraceInfo &profiling_trace_info);
void InsertProfilingKernel(const ProfilingTraceInfo &profiling_trace_info,
NotNull<session::KernelGraph *> kernel_graph_ptr);
};
} // namespace device
} // namespace mindspore

@ -246,7 +246,7 @@ void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kern
kernel_graph->SetExecOrderByDefault();
if (save_graphs) {
std::string file_path = save_graphs_path + "/" + "hwopt_d_end.ir";
DumpIR(file_path, kernel_graph);
DumpIR(file_path, kernel_graph, true);
DumpIRProto(kernel_graph, "after_hwopt");
}
}

@ -136,7 +136,7 @@ void AscendSession::BuildGraph(GraphId graph_id) {
// Assign streams for control sink and hccl and so on
AssignStream(graph);
device::KernelAdjust::GetInstance().Profiling(graph);
device::KernelAdjust::GetInstance().Profiling(NOT_NULL(graph.get()));
// build kernel if node is cnode
BuildKernel(graph);
auto ms_context = MsContext::GetInstance();

@ -42,6 +42,6 @@ bool KernelAdjust::StepLoadCtrlInputs(const std::shared_ptr<session::Context> &c
return true;
}
bool KernelAdjust::NeedInsertSwitch() { return true; }
// No-op taking a shared_ptr to the kernel graph: returns immediately without using its argument.
void KernelAdjust::Profiling(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) { return; }
// No-op taking a NotNull raw pointer to the kernel graph: returns immediately without using its argument.
void KernelAdjust::Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr) { return; }
} // namespace device
} // namespace mindspore

Loading…
Cancel
Save