@ -17,11 +17,12 @@ from .model import PrimLib
class ParalGain:
    """Plain record describing the estimated gain of a parallel fusion.

    Args:
        fusion_type: Label of the recommended fusion. Visible callers in this
            file pass "none", "block_fusion", or the label returned by
            ``ScheduleAnalyzer.fusion_consult``.
        bottleneck: Bottleneck weight of the fused group, stored as given.
        gain: Estimated gain of the fusion, stored as given.
        block_assign: Per-graph block assignment, stored as given.
        type_info: Extra information attached to the fusion type. Defaults to
            None so that four-argument construction keeps working alongside
            the five-argument form.
    """

    def __init__(self, fusion_type, bottleneck, gain, block_assign, type_info=None):
        self.fusion_type = fusion_type
        self.bottleneck = bottleneck
        self.gain = gain
        self.block_assign = block_assign
        self.type_info = type_info
class ScheduleAnalyzer:
@ -30,6 +31,7 @@ class ScheduleAnalyzer:
MAX_SM = 80 # Volta
def __init__(self, graph):
self.graph = graph
@ -132,11 +134,141 @@ class ScheduleAnalyzer:
def suitable_to_pipeline(self):
    """judge whether is suitable to be pipeline optimized

    Returns:
        bool: False as soon as any op in ``self.ops`` is classified as
        ``PrimLib.REDUCE``, True otherwise (including when there are no ops).
    """
    def _has_reduce(candidates):
        # Reduce may make the tiling bad, so one reduce op is enough to say no.
        return any(
            PrimLib.primtives.get(item.prim, None) == PrimLib.REDUCE
            for item in candidates
        )

    return not _has_reduce(self.ops)
def k_mean(data, class_n=2, exclude_id=()):
    """
    Find k clusters in which element is close to each other.

    NOTE(review): other code in this file calls this as
    ``ScheduleAnalyzer.k_mean(...)`` without an instance, so it is presumably
    meant to be a static method; no decorator is visible here - confirm.

    Args:
        data (list): Elements' information.
        class_n (int): Number of clusters wanted to be analyzed, default is 2.
        exclude_id (tuple[int]): The list of excluded element's index, default is ().

    Returns:
        classes (list[list[int]]): The list of clusters. Each cluster is a list of indices.
        None is returned when ``data`` holds fewer than ``class_n`` elements.
    """
    def _cal_mean(classes):
        # An empty cluster gets an infinite mean, so it never beats a finite
        # distance in the nearest-mean comparison below.
        class_datas = [[data[cid] for cid in cls] for cls in classes]
        return [sum(cls) / len(cls) if cls else float('inf') for cls in class_datas]

    def _cal_distance(a, b):
        return abs(a - b)

    def _check_different(old_classes, new_classes):
        for o, n in zip(old_classes, new_classes):
            if o != n:
                return True
        return False

    if len(data) < class_n:
        return None

    # Seed every cluster with one of the first class_n non-excluded indices.
    classes = []
    for i, _ in enumerate(data):
        if i in exclude_id:
            continue
        if len(classes) >= class_n:
            break
        classes.append([i])

    # Reassign elements to the nearest cluster mean until nothing moves.
    changed = True
    while changed:
        new_classes = [[] for _ in classes]
        means = _cal_mean(classes)
        for idx, d in enumerate(data):
            if idx in exclude_id:
                continue
            min_idx = -1
            min_dis = float('inf')
            for i, m in enumerate(means):
                cur_dis = _cal_distance(m, d)
                # Strict comparison: on a tie the earlier cluster is kept.
                if min_dis > cur_dis:
                    min_idx, min_dis = i, cur_dis
            new_classes[min_idx].append(idx)
        changed = _check_different(classes, new_classes)
        classes = new_classes
    return classes
def pipeline_fusion_analyze(blocks, op_sizes, exclude_id):
"""analyze whether the segments can be pipeline optimized"""
# Overview: pairs blocks[i] with op_sizes[i], scores each pair, splits the scores
# into two clusters through ScheduleAnalyzer.k_mean (exclude_id is forwarded to it),
# and returns a list of index groups, or [] when k_mean yields nothing.
# NOTE(review): indentation is not preserved in this view, so the nesting of the
# statements below cannot be confirmed from the text alone.
# op size first, block second.
def _simple_factor(block, op_size):
# The op size is weighted five times as much as the block count.
return block + 5 * op_size
# NOTE(review): _take_second is not referenced in the lines visible here - confirm
# whether a use of it (e.g. as a sort key) is missing from this view.
def _take_second(elem):
return elem[1]
simple_indicators = [_simple_factor(b, s)
for b, s in zip(blocks, op_sizes)]
# 2 classes, one heavy, the other light
classes = ScheduleAnalyzer.k_mean(simple_indicators, 2, exclude_id)
if not classes:
return []
# Mean score of each cluster; an empty cluster is given an infinite mean.
means = [sum([simple_indicators[idx] for idx in cls]) /
len(cls) if cls else float('inf') for cls in classes]
# The target two clusters should be a heavy one and a light one.
# The light one may be suitable to run with pipeline optimization.
classes_infos = [[cls, m] for cls, m in zip(classes, means)]
pipeline_target = None
# NOTE(review): every entry of classes_infos is a two-element list and therefore
# truthy; how this loop is meant to pick the target is not clear from the visible lines.
for ci in classes_infos:
if ci:
pipeline_target = ci
pipeline_gids, pipeline_mean = pipeline_target
# NOTE(review): the call opened on the next line is never closed in the visible
# text; its second argument is missing from this view.
if pipeline_mean > _simple_factor(float(ScheduleAnalyzer.MAX_SM) / len(blocks),
return []
pipeline_blocks = []
pipeline_weight = len(pipeline_gids)
# Try to make two paralleled at least.
# The target indices are halved; a half is kept only if it has more than one index.
if pipeline_weight > 3 and pipeline_weight > len(blocks) / 2:
if len(pipeline_gids[:pipeline_weight // 2]) > 1:
pipeline_blocks.append(pipeline_gids[:pipeline_weight // 2])
if len(pipeline_gids[pipeline_weight // 2:]) > 1:
pipeline_blocks.append(pipeline_gids[pipeline_weight // 2:])
# NOTE(review): no statement for the following branch is visible in this view.
elif pipeline_weight > 1:
return pipeline_blocks
def fusion_consult(blocks, op_sizes, exclude_gid):
    """get a recommendation for parallel fusion

    Returns:
        tuple: ``(fusion_type, type_info)``. Block fusion with no extra
        information is the default; the pipeline variant is reported only
        when the pipeline analysis is enabled and yields a non-empty result.
    """
    # Disable pipeline optimization for now.
    pipeline_enabled = False
    if pipeline_enabled:
        candidates = ScheduleAnalyzer.pipeline_fusion_analyze(
            blocks, op_sizes, exclude_gid)
        if candidates:
            return "block_pipeline_fusion", candidates

    # Default is block fusion
    return "block_fusion", None
def block_parallel_estimate(graphs):
"""estimate block parallel gain"""
# NOTE(review): this span looks like patch text with the +/- markers stripped: an
# initialisation and loop header appear twice (with and without op_sizes/exclude_gid),
# a hunk header interrupts the loop body, and ParalGain is built with both four and
# five arguments. Indentation is not preserved either, so nesting cannot be confirmed.
sum_block, max_weight, sum_weight, blocks = 0, 0, 0, []
for g in graphs:
sum_block, max_weight, sum_weight, blocks, op_sizes, exclude_gid = 0, 0, 0, [], [], []
for gid, g in enumerate(graphs):
# One analyzer per graph; its block_num and block_weight feed the running totals.
s = ScheduleAnalyzer(g)
sum_block += s.block_num
@ -144,9 +276,14 @@ def block_parallel_estimate(graphs):
max_weight = s.block_weight
sum_weight += s.block_weight
# NOTE(review): no statement for the following condition is visible in this view;
# presumably it records gid in exclude_gid - confirm.
if not s.suitable_to_pipeline():
# Too many blocks in total: report "none" with the summed weight and zero gain.
if sum_block > ScheduleAnalyzer.MAX_SM * 32:
return ParalGain("none", sum_weight, 0, [])
return ParalGain("block_fusion", max_weight, sum_weight - max_weight, blocks)
return ParalGain("none", sum_weight, 0, [0 for _ in graphs], None)
# Otherwise ask fusion_consult for the fusion type; the gain is the summed weight
# minus the largest single weight.
fusion_type, type_info = ScheduleAnalyzer.fusion_consult(blocks, op_sizes, tuple(exclude_gid))
return ParalGain(fusion_type, max_weight, sum_weight - max_weight, blocks, type_info)
def parallel_estimate(graphs):