fix the bug of step_trace

pull/8787/head
gzhcv 4 years ago
parent 8239407bf3
commit 3d6c1fc5f6

@ -275,7 +275,6 @@ void DataSaver::WriteStepTrace(const std::string &saver_base_dir) {
// write step trace time info into file
uint32_t factor = 10;
std::vector<std::string> op_name_arr;
op_name_arr.push_back(step_trace_op_name.trace_iter_start);
op_name_arr.push_back(step_trace_op_name.trace_fp_start);
op_name_arr.push_back(step_trace_op_name.trace_bp_end);
op_name_arr.push_back(step_trace_op_name.trace_iter_end);

@ -27,11 +27,10 @@ namespace gpu {
constexpr char kFpStartNode[] = "PROFILING_FP_START";
constexpr char kBpEndNode[] = "PROFILING_BP_END";
constexpr char kIterEndNode[] = "PROFILING_ITER_END";
constexpr int fpExistGraphId = 2;
constexpr auto kInitDatasetQueueOpName = "InitDataSetQueue";
uint32_t ProfilingUtils::last_graph_id = 0;
bool ProfilingUtils::have_communication_op = false;
ProfilingTraceInfo ProfilingUtils::profiling_trace = {"", "", "", "", false};
ProfilingTraceInfo ProfilingUtils::profiling_trace = {"", "", ""};
ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNull<const session::KernelGraph *> graph_ptr) {
MS_LOG(INFO) << "get current subgraph op name start.";
@ -39,30 +38,21 @@ ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNull<const sessio
if (cnode_exec_order.empty()) {
return profiling_trace;
}
uint32_t current_graph_id = graph_ptr->graph_id();
// current graph id less than last graph id indicates all subgraph have called.
if (current_graph_id < last_graph_id) {
profiling_trace.IsFirstStepEnd = true;
OutputStepTraceOpNameStatus();
return profiling_trace;
}
SetTraceIterStart(cnode_exec_order);
SetTraceIterEnd(cnode_exec_order);
SetTraceFpStart(cnode_exec_order, current_graph_id);
SetTraceFpStart(cnode_exec_order);
SetTraceBpEnd(cnode_exec_order);
GetTraceHccl(cnode_exec_order);
last_graph_id = current_graph_id;
OutputStepTraceOpNameStatus();
return profiling_trace;
}
void ProfilingUtils::OutputStepTraceOpNameStatus() {
if (!profiling_trace.IsValid()) {
MS_LOG(ERROR) << "Did not get all the step_trace op name.";
if (profiling_trace.IsValid()) {
MS_LOG(INFO) << "Get all the step_trace op name.";
}
MS_LOG(INFO) << "[profiling]trace_iter_start: " << profiling_trace.trace_iter_start
<< "trace_fp_start: " << profiling_trace.trace_fp_start
MS_LOG(INFO) << "[profiling]trace_fp_start: " << profiling_trace.trace_fp_start
<< "trace_bp_end: " << profiling_trace.trace_bp_end
<< "trace_iter_end: " << profiling_trace.trace_iter_end;
MS_LOG(INFO) << "get step_trace op name end.";
@ -81,19 +71,7 @@ void ProfilingUtils::GetTraceHccl(const std::vector<CNodePtr> &cnode_exec_order)
}
}
void ProfilingUtils::SetTraceIterStart(const std::vector<CNodePtr> &cnode_exec_order) {
if (!profiling_trace.trace_iter_start.empty()) {
return;
}
auto first_node = cnode_exec_order.front();
MS_EXCEPTION_IF_NULL(first_node);
if (AnfAlgo::GetCNodeName(first_node) == kGetNextOpName) {
profiling_trace.trace_iter_start = first_node->fullname_with_scope();
}
}
void ProfilingUtils::SetTraceFpStart(const std::vector<CNodePtr> &cnode_exec_order, uint32_t graph_id) {
void ProfilingUtils::SetTraceFpStart(const std::vector<CNodePtr> &cnode_exec_order) {
if (!profiling_trace.trace_fp_start.empty()) {
return;
}
@ -105,9 +83,20 @@ void ProfilingUtils::SetTraceFpStart(const std::vector<CNodePtr> &cnode_exec_ord
return;
}
if (graph_id == fpExistGraphId) {
auto first_node = cnode_exec_order.front();
MS_EXCEPTION_IF_NULL(first_node);
auto first_node = cnode_exec_order.front();
MS_EXCEPTION_IF_NULL(first_node);
auto node_name = AnfAlgo::GetCNodeName(first_node);
if (node_name == kInitDatasetQueueOpName) {
return;
}
if (node_name == kGetNextOpName) {
if (cnode_exec_order.size() > 1) {
profiling_trace.trace_fp_start = cnode_exec_order.at(1)->fullname_with_scope();
} else {
MS_LOG(ERROR) << "No Op Behind the GetNext Op" << std::endl;
}
} else {
profiling_trace.trace_fp_start = first_node->fullname_with_scope();
}
}

@ -30,23 +30,17 @@ namespace profiler {
namespace gpu {
struct ProfilingTraceInfo {
// support get all the op name from environment variable
// iteration start op is GetNext
std::string trace_iter_start;
// fp start op is the first op of Graph that ID is 2
// fp start op is the first op in all subgraph except data related op
std::string trace_fp_start;
// bp end op is the input node op of the last communication op (if exist)
std::string trace_bp_end;
// iteration end op is the last executed op
std::string trace_iter_end;
bool IsFirstStepEnd;
// profiling specific op, such as AllReduce;
std::vector<std::string> trace_custom_node;
bool IsValid() const {
return !(trace_iter_start.empty() || trace_fp_start.empty() || trace_bp_end.empty() || trace_iter_end.empty());
}
bool IsValid() const { return !(trace_fp_start.empty() || trace_bp_end.empty() || trace_iter_end.empty()); }
};
class ProfilingUtils {
@ -61,13 +55,11 @@ class ProfilingUtils {
static ProfilingTraceInfo GetProfilingTraceFromEnv(NotNull<const session::KernelGraph *> graph_ptr);
static void OutputStepTraceOpNameStatus();
static uint32_t last_graph_id;
static bool have_communication_op;
static ProfilingTraceInfo profiling_trace;
private:
static void SetTraceIterStart(const std::vector<CNodePtr> &cnode_exec_order);
static void SetTraceFpStart(const std::vector<CNodePtr> &cnode_exec_order, uint32_t graph_id);
static void SetTraceFpStart(const std::vector<CNodePtr> &cnode_exec_order);
static void SetTraceBpEnd(const std::vector<CNodePtr> &cnode_exec_order);
static void SetTraceIterEnd(const std::vector<CNodePtr> &cnode_exec_order);
static std::string GetGraphSecondLastKernelName(const std::vector<CNodePtr> &cnode_exec_order);

@ -587,14 +587,10 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
MS_EXCEPTION_IF_NULL(profiler_inst);
static bool FlagGetStepTraceOpName = false;
if (!FlagGetStepTraceOpName) {
if (is_first_step_map_[graph->graph_id()]) {
profiler::gpu::ProfilingTraceInfo profiling_trace =
profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph));
if (profiling_trace.IsFirstStepEnd) {
FlagGetStepTraceOpName = true;
profiler_inst->SetStepTraceOpName(profiling_trace);
}
profiler_inst->SetStepTraceOpName(profiling_trace);
}
for (const auto &kernel : kernels) {

@ -43,6 +43,7 @@ class BaseStepTraceParser:
job_id (int): The job id used to define the start of new step. Default: 0.
skip_first_step (bool): Whether skip the first step or not.
"""
def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False):
self._input_dir = input_dir
self._output_path = output_file_path
@ -333,6 +334,7 @@ class BaseStepTraceParser:
class GpuStepTraceParser(BaseStepTraceParser):
"""The parser for gpu step trace data."""
def record_point_info(self, source_file, output_path):
"""
Record point info into json.
@ -344,7 +346,7 @@ class GpuStepTraceParser(BaseStepTraceParser):
Returns:
dict, parsed point info.
"""
fp_start, bp_end = 1, 2
fp_start, bp_end = 0, 1
try:
with open(source_file, 'r') as f:
lines = f.readlines()
@ -374,7 +376,7 @@ class GpuStepTraceParser(BaseStepTraceParser):
def _parse(self, source_file):
"""Parse source step trace files."""
log.info("Start to parse step trace file.")
iter_start, fp_start, bp_end, iter_end = 0, 1, 2, 3
fp_start, bp_end, iter_end, iter_start = 0, 1, 2, 3
reduce_start = 4
start_time, end_time = 0, 1
@ -382,15 +384,16 @@ class GpuStepTraceParser(BaseStepTraceParser):
try:
with open(source_file, 'r') as f:
lines = f.readlines()
step_trace_info_all = [line.strip().split() for line in lines]
step_trace_info_all = [line.strip().split()[1:] for line in lines]
num_of_step = len(step_trace_info_all[0])
# in callback mode that set the profiling step range, each op count is not equal
step_trace_info_all = [line[-num_of_step:] for line in step_trace_info_all]
iter_start_info = [step_trace_info_all[fp_start][0]] + \
step_trace_info_all[iter_end][:num_of_step]
step_trace_info_all.insert(iter_start, iter_start_info)
except (IOError, OSError) as err:
log.warning(f'Failed to read {source_file}', err)
raise ProfilerIOException
for step_num in range(1, num_of_step):
for step_num in range(num_of_step):
step_trace = {
'start': int(step_trace_info_all[iter_start][step_num].split(',')[start_time]),
'fp': int(step_trace_info_all[fp_start][step_num].split(',')[start_time]),

Loading…
Cancel
Save