Compare commits

..

1 Commits

Author SHA1 Message Date
caoqichun 223de81530 add version v_nnvm
6 years ago

@ -1,21 +1,30 @@
![Image text](https://gitee.com/inspur-inna/inspur-inna/raw/master/Image/inspur.png)
![Image text](https://github.com/inspur-inna/inspur-inna/blob/master/Image/inspur.png)
# 基于FPGA的CNN自适应映射技术——inna1.0
# 基于FPGA的CNN自适应映射技术---inspur-inna
基于宏指令的Look-Aside Acceleration框架
- 一键式快速部署
- 软硬件协同优化
- 支持多种卷积
- 执行过程无需主机干预
基于FPGA板卡设计深度学习加速器并进行优化在整体性能和功耗方面拟达到业界领先水平映射技术采用宏指令的Look-Aside Acceleration框架实现了一键式快速部署、软硬件协同优化、支持多种卷积、执行过程无需主机干预。本项目为映射技术的软件端拟实现CNN映射编译器和CNN量化器首先由TensorFlow产生的模型文件解析产生CNN的计算图模型CNN映射编译器会根据解析的计算图和现有的CNN加速库单元选择相应的CNN库单元生成相应的硬件结构和相应的调度器的配置参数以达到计算、片上存储、片上带宽和片外带宽的均衡从而达到最优的计算性能CNN量化器可根据模型的权重文件对各层数据进行8位定点量化以便于FPGA的DSP计算从而在保证精度的前提下降低存储开销提高处理速度降低功耗。
## Install
### inna install
TVM need LLVMLLVM install in Ubuntuother system require source code compilation
### TVM source code install
LLVM install in Ubuntu
```bash
apt search llvm
apt install llvm-6.0
apt install clang-6.0
```
Install miniconda for python=3.6install_inna.sh include TVM install scriptrefer to TVM <https://tvm.apache.org/docs/install/from_source.html>
TVM Install Source<https://tvm.apache.org/docs/install/from_source.html>
### inna install
Install miniconda for python=3.6
```bash
conda create -n inna python=3.6 ipykernel -y
conda activate inna

@ -10,18 +10,18 @@ import os
def resnet_v1_50():
with tf.Graph().as_default():
model = {
'pb': '../models/tensorflow/resnet/resnet50_without_bn_test.pb',
'pb': '../models/tensorflow/resnet/frozen_resnet_v1_50.pb',
'shape_dict': {
'input': (1, 224, 224, 3),
'input': (1, 224, 224, 3),
},
'layout': 'NHWC',
'out_node': 'prob',
'out_node': 'resnet_v1_50/predictions/Reshape_1',
}
with tf.gfile.GFile(model['pb'], 'rb') as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
graph = tf.import_graph_def(graph_def, name='')
# Call the utility to import the graph definition into default graptributeError: module 'tensorflow.tools.api.generator.api.compat' has no attribute 'v1'h.
# Call the utility to import the graph definition into default graph.
graph_def = tf_testing.ProcessGraphDefParam(graph_def)
with tf.Session() as sess:

@ -30,8 +30,6 @@ SET_REG_HISTORY = {}
CHILDREN_HISTORY = {}
INSTR_CODE_DICT = {
'CONV': 0x000,
'SPLIT': 0x100,
'CONCAT': 0x101,
'ACTIVE': 0x107,
'ELTWISE': 0x108,
'MOVH': 0x200,
@ -44,13 +42,13 @@ INSTR_CODE_DICT = {
# register can be classified as | wait register | param regitster | set regitster |
def _generate_regs_params(instr_name, runid=0, parents=[], children=[], siblings=[], batch_size=4, conv_type=0,
active_type=0, pool_type=0, eltwise_type=0, concat_channel=0, input_width=0, input_height=0, input_channel=0, input_mat_width=0,
active_type=0, pool_type=0, input_width=0, input_height=0, input_channel=0, input_mat_width=0,
input_mat_height=0, input_vect_width=0, input_vect_height=0, output_width=0, output_height=0,
output_channel=0, input_addr=0, filter_addr=0, quant_addr=0, residual_addr=0, output_addr=0,
data_addr=0, input_mat_addr=0, input_vect_addr=0, input_bias_addr=0, output_mat_addr=0,
h_pad=0, v_pad=0, stride=1, kernel_size=1, input_size=0, output_size=0, width_shift=0,
width_size=0, height_shift=0, height_size=0, channel_shift=0, channel_size=0, op='', name='',
attrs={}, shape=[], input_shape=[],output_shape=[]):
attrs={}, shape=[], input_shape=[]):
regs_params = []
input_feature_size4 = int(input_size / MAX_HW_BATCH)
output_feature_size4 = int(output_size / MAX_HW_BATCH)
@ -67,16 +65,8 @@ def _generate_regs_params(instr_name, runid=0, parents=[], children=[], siblings
reg2 = ((invalid_wait & 0x3) << 56) | ((batch_size & 0xFF) << 48) | (input_addr & ADDR_MASK48)
reg3 = ((width_shift & 0xFFF) << 48) | (filter_addr & ADDR_MASK48)
reg4 = ((width_size & 0xFFF) << 48) | (quant_addr & ADDR_MASK48)
'''
#before 20191128, add activate into conv
#reg5 = output_addr & ADDR_MASK48
#reg6 = ((conv_type & 0x7) << 61) | ((kernel_size & 0xF) << 57) | ((h_pad & 0x7) << 54) | (
# (v_pad & 0x7) << 51) | ((stride & 0x7) << 48) | ((input_width & 0xFFF) << 36) | (
# (input_height & 0xFFF) << 24) | ((input_channel & 0xFFF) << 12) | (output_channel & 0xFFF)
'''
reg5 = ((active_type & 0x7) << 52) | ((conv_type & 0xf) << 48) | (output_addr & ADDR_MASK48)
reg6 = ((kernel_size & 0xF) << 57) | ((h_pad & 0x7) << 54) | (
reg5 = output_addr & ADDR_MASK48
reg6 = ((conv_type & 0x7) << 61) | ((kernel_size & 0xF) << 57) | ((h_pad & 0x7) << 54) | (
(v_pad & 0x7) << 51) | ((stride & 0x7) << 48) | ((input_width & 0xFFF) << 36) | (
(input_height & 0xFFF) << 24) | ((input_channel & 0xFFF) << 12) | (output_channel & 0xFFF)
reg7 = ((height_shift & 0xFFF) << 36) | ((height_size & 0xFFF) << 24) | ((channel_shift & 0xFFF) << 12) | (
@ -87,7 +77,7 @@ def _generate_regs_params(instr_name, runid=0, parents=[], children=[], siblings
runid, instr_name, reg2, reg3, reg4, reg5, reg6, reg7)
regs_params.append([reg2, reg3, reg4, reg5, reg6, reg7])
elif instr_name in ['ACTIVE', 'ATTACH', 'RESHAPE', 'SPLIT']:
elif instr_name in ['ACTIVE', 'ATTACH', 'CONCAT', 'RESHAPE', 'SPLIT']:
for _ in range(MAX_HW_BATCH):
reg2 = ((invalid_wait & 0x3) << 56) | ((batch_size & 0xFF) << 48) | (input_addr & ADDR_MASK48)
reg3 = ((active_type & 0x7) << 48) | (output_addr & ADDR_MASK48)
@ -103,22 +93,12 @@ def _generate_regs_params(instr_name, runid=0, parents=[], children=[], siblings
output_addr += output_feature_size4
regs_params.append([reg2, reg3, reg4, reg5, reg6, reg7])
elif instr_name in ['CONCAT']:
for _ in range(MAX_HW_BATCH):
reg2 = ((invalid_wait & 0x3) << 56) | ((batch_size & 0xFF) << 48) | (input_addr & ADDR_MASK48)
reg3 = (output_addr & ADDR_MASK48)
reg4 = ((concat_channel & 0xFFF) << 48) | ((input_width & 0xFFF) << 36) | ((input_height & 0xFFF) << 24) | ((input_channel & 0xFFF) << 12)
logger.info('generate register parameters %d %s reg2=%#x reg3=%#x', runid, instr_name, reg2, reg3, reg4)
input_addr += input_feature_size4
output_addr += input_feature_size4 + output_feature_size4
regs_params.append([reg2, reg3, reg4])
elif instr_name in ['ELTWISE']:
for _ in range(MAX_HW_BATCH):
reg2 = ((invalid_wait & 0x3) << 56) | ((batch_size & 0xFF) << 48) | (input_addr & ADDR_MASK48)
reg3 = ((input_width & 0xFFF) << 48) | (quant_addr & ADDR_MASK48)
reg4 = ((eltwise_type & 0x3) << 60) | ((input_height & 0xFFF) << 48) | (residual_addr & ADDR_MASK48)
reg5 = ((active_type & 0x3) << 60) | ((input_channel & 0xFFF) << 48) | (output_addr & ADDR_MASK48)
reg4 = ((input_height & 0xFFF) << 48) | (residual_addr & ADDR_MASK48)
reg5 = ((input_channel & 0xFFF) << 48) | (output_addr & ADDR_MASK48)
logger.info('generate register parameters %d %s reg2=%#x reg3=%#x reg4=%#x reg5=%#x',
runid, instr_name, reg2, reg3, reg4, reg5)
input_addr += input_feature_size4
@ -146,12 +126,10 @@ def _generate_regs_params(instr_name, runid=0, parents=[], children=[], siblings
return regs_params
def _allocate_physical_regs(runid, instr_name, parents, children, type=0):
if instr_name in ['CONV', 'ACTIVE', 'ATTACH', 'RESHAPE', 'SPLIT']:
def _allocate_physical_regs(runid, instr_name, parents, children):
if instr_name in ['CONV', 'ACTIVE', 'ATTACH', 'CONCAT', 'RESHAPE', 'SPLIT']:
reg_num = 8
elif instr_name in ['ELTWISE']:
reg_num = 7
elif instr_name in ['POOL', 'CONCAT']:
elif instr_name in ['ELTWISE', 'POOL']:
reg_num = 6
else:
print('Error: {} not support!'.format(instr_name))
@ -174,7 +152,7 @@ def _allocate_physical_regs(runid, instr_name, parents, children, type=0):
sys.exit(1)
batchi_regs = [0 for _ in range(reg_num)]
num = 1 if instr_name not in ['ELTWISE', 'CONCAT'] else 2
num = 1
regid = 0
needed_reg_num = reg_num - 1 if len(children) == 0 else reg_num
while num < needed_reg_num:
@ -186,34 +164,6 @@ def _allocate_physical_regs(runid, instr_name, parents, children, type=0):
regid += 1
# set wait register
if len(parents) > 0:
# max_parent_id = max(parents)
for i, parent_id in enumerate(parents):
if parent_id in SET_REG_HISTORY.keys() and len(CHILDREN_HISTORY[parent_id]):
wait_regid = SET_REG_HISTORY[parent_id][batchi]
batchi_regs[i] = wait_regid
if len(parents) > 1 and type == 1:
max_parent_id = max(parents)
for i, parent_id in enumerate(parents):
if parent_id in SET_REG_HISTORY.keys() and len(CHILDREN_HISTORY[parent_id]):
wait_regid = SET_REG_HISTORY[max_parent_id][batchi]
batchi_regs[i] = wait_regid
if len(parents) > 0:
max_parent_id = max(parents)
if max_parent_id in SET_REG_HISTORY.keys() and len(CHILDREN_HISTORY[max_parent_id]):
# free parents' set regitster
max_brother_id = max(CHILDREN_HISTORY[max_parent_id])
if runid == max_brother_id:
for parent_id in parents:
parent_regid = SET_REG_HISTORY[parent_id][batchi]
REG_INUSED[parent_regid] = False
FREE_REG_NUM += 1
if batchi == MAX_HW_BATCH - 1:
del SET_REG_HISTORY[parent_id]
del CHILDREN_HISTORY[parent_id]
'''
if len(parents) > 0:
max_parent_id = max(parents)
if max_parent_id in SET_REG_HISTORY.keys() and len(CHILDREN_HISTORY[max_parent_id]):
@ -230,9 +180,8 @@ def _allocate_physical_regs(runid, instr_name, parents, children, type=0):
if batchi == MAX_HW_BATCH - 1:
del SET_REG_HISTORY[parent_id]
del CHILDREN_HISTORY[parent_id]
'''
PARAM_REG_HISTORY = batchi_regs[1:-1] if instr_name not in ['ELTWISE', 'CONCAT'] else batchi_regs[2:-1]
PARAM_REG_HISTORY = batchi_regs[1:-1]
set_regs.append(batchi_regs[-1])
regs.append(batchi_regs)
@ -248,19 +197,11 @@ def _allocate_physical_regs(runid, instr_name, parents, children, type=0):
def _pseudo_to_assems(instr_name, regs, reg_params):
assems = []
reg_param_num = len(reg_params)
if instr_name in ['ELTWISE', 'CONCAT']:
for i in range(reg_param_num):
assem = ['MOVH', regs[i + 2], reg_params[i] >> 32]
assems.append(assem)
assem = ['MOVL', regs[i + 2], reg_params[i] & 0xFFFFFFFF]
assems.append(assem)
else:
for i in range(reg_param_num):
assem = ['MOVH', regs[i + 1], reg_params[i] >> 32]
assems.append(assem)
assem = ['MOVL', regs[i + 1], reg_params[i] & 0xFFFFFFFF]
assems.append(assem)
for i in range(reg_param_num):
assem = ['MOVH', regs[i + 1], reg_params[i] >> 32]
assems.append(assem)
assem = ['MOVL', regs[i + 1], reg_params[i] & 0xFFFFFFFF]
assems.append(assem)
assems.append([instr_name] + regs)
return assems
@ -283,10 +224,8 @@ def _assem_to_machine_code(assem, batchi):
def _pseudo_to_machine_code(pseudo):
print(pseudo)
regs_params = _generate_regs_params(**pseudo)
type = (pseudo['eltwise_type'] & 0x2 >> 1) if pseudo['instr_name'] in ['ELTWISE'] else 0
regs = _allocate_physical_regs(pseudo['runid'], pseudo['instr_name'], pseudo['parents'], pseudo['children'], type)
regs = _allocate_physical_regs(pseudo['runid'], pseudo['instr_name'], pseudo['parents'], pseudo['children'])
machine_codes = []
for batchi in range(MAX_HW_BATCH):
batchi_reg_params = regs_params[batchi]

@ -7,7 +7,6 @@ from inna.compiler import converter
from inna.compiler import scheduler
from inna.compiler import assembler
import tvm.relay
class INNACompiler:
"""Compiler Class"""
@ -42,9 +41,9 @@ class INNACompiler:
instruction stream represented by np.NDArray
"""
assert layout in ['NCHW', 'NHWC']
tvm_graph, params = frontend.to_tvm(graph, shape_dict, layout, self._mode)
hwgraph = converter.convert(tvm_graph, shape_dict, layout)
nnvm_graph, params = frontend.to_nnvm(graph, shape_dict, layout, self._mode)
jsong = nnvm_graph.json()
hwgraph = converter.convert(nnvm_graph, shape_dict, layout)
scheduler.schedule(hwgraph)
instr_streams = assembler.assemble(hwgraph)
return instr_streams
@ -99,3 +98,4 @@ if __name__ == '__main__':
outfile_name = 'out/resnet_v1_50_tensorflow.bin'
with open(outfile_name, 'wb') as f:
f.write(instr_streams.tobytes())

@ -4,55 +4,19 @@ from __future__ import print_function
def _parse_inputs(graph):
shape_list = graph['attrs']['shape'][1]
for i, node in enumerate(graph['nodes']):
#print(node)
for node in graph.index.nodes:
inputs = node['inputs']
node['shape'] = []
if len(inputs) > 0:
node['inputs'] = [inp[0] for inp in node['inputs']]
node['shape'] = [shape_list[j] for j in node['inputs']]
node['shape'].append(shape_list[i])
def _remove_null_op_inputs(graph, shape_dict):
for node in graph['nodes']:
index = graph.index
for node in index.nodes:
if node['name'] in shape_dict:
node['op'] = 'input'
if node['op'] != 'null':
node['inputs'] = [inp for inp in node['inputs'] if graph['nodes'][inp]['op'] != 'null']
def _parse_op(graph):
for node in graph['nodes']:
if node['name'] in ['data', 'input']:
node['op'] = node['name']
continue
if node['op'] in ['null']:
continue
#print(node)
name_str = node['attrs']['func_name']
name_list = name_str.split('_')
node['op'] = ''
for op_name in name_list:
if (op_name not in ['fused','nn']) and (op_name.isdigit()==False):
node['op'] = node['op'] + op_name + '_'
node['op'] = node['op'].strip('_')
if node['op'] in ['add']:
if len(node['inputs']) > 1:
i = 0
for j in node['inputs']:
if graph['nodes'][j]['op'] == 'null':
break
i += 1
if i == len(node['inputs']):
node['op'] = 'eltwise_add'
if node['op'] in ['transpose']:
node['op'] = 'null'
#if node['op'] in ['pad1','pad2','pad3','pad4','pad5']:
#node['op'] = 'pad'
#if node['op'] in ['relu1','relu2']:
#node['op'] = 'relu'
node['inputs'] = [inp for inp in node['inputs'] if index.nodes[inp]['op'] != 'null']
def _get_pad_width(pad_width, layout):
@ -63,38 +27,35 @@ def _fuse_ops(graph, shape_dict, layout):
hardware_graph = []
new_index = 0
index_map = {}
index = graph.index
# deal with batch_norm、pad
for node in graph['nodes']:
for node in index.nodes:
if node['op'] == 'conv2d':
node['attrs']['batch_norm'] = False
node['attrs']['use_bias'] = False
if node['op'] == 'batch_norm':
graph['nodes'][node['inputs'][0]]['attrs']['batch_norm'] = True
if node['op'] == 'add':
graph['nodes'][node['inputs'][0]]['attrs']['use_bias'] = True
input_nodes = [graph['nodes'][inp] for inp in node['inputs']]
index.nodes[node['inputs'][0]]['attrs']['batch_norm'] = True
input_nodes = [index.nodes[inp] for inp in node['inputs']]
for i, inp_node in enumerate(input_nodes):
#print(inp_node)
if inp_node['op'] in ['batch_norm', 'pad', 'add']:
if inp_node['op'] in ['batch_norm', 'pad']:
node['inputs'][i] = inp_node['inputs'][0]
if inp_node['op'] == '__mul_scalar__':
if len(inp_node['inputs']) == 0:
del node['inputs'][i]
else:
node['inputs'][i] = inp_node['inputs'][0]
#if inp_node['op'] == 'pad':
#padding = _get_pad_width(inp_node['attrs']['pad_width'], layout)
#node['attrs']['padding'] = padding
if inp_node['op'] == 'pad':
padding = _get_pad_width(inp_node['attrs']['pad_width'], layout)
node['attrs']['padding'] = padding
# remove null、batch_norm、pad op
for i, node in enumerate(graph['nodes']):
if node['op'] not in ['null', 'batch_norm', 'pad', '__mul_scalar__', 'add', 'transpose']:
for i, node in enumerate(index.nodes):
if node['op'] not in ['null', 'batch_norm', 'pad', '__mul_scalar__']:
hardware_graph.append(node)
index_map[i] = new_index
new_index += 1
# reset node inputs
for node in hardware_graph:
inputs = [index_map[inp] for inp in node['inputs']] if node['op'] not in ['input','data'] else []
inputs = [index_map[inp] for inp in node['inputs']] if node['op'] != 'input' else []
node['inputs'] = inputs
return hardware_graph
@ -104,8 +65,7 @@ def _layout_to_nchw(shape, layout):
return shape
return [shape[0], shape[3], shape[1], shape[2]] if layout == 'NHWC' else shape
def _parse_shape_to_nchw(hwgraph, shape_dict, layout):
'''
def _parse_shape_to_hchw(hwgraph, shape_dict, layout):
for node in hwgraph:
node['shape'] = []
node_name = node['name']
@ -120,12 +80,6 @@ def _parse_shape_to_nchw(hwgraph, shape_dict, layout):
node['shape'] = _layout_to_nchw(attrs['shape'], layout)
elif '__shape__' in node['attrs']:
node['shape'] = _layout_to_nchw(attrs['__shape__'], layout)
'''
for node in hwgraph:
if 'input_shape' in node:
node['input_shape'] = _layout_to_nchw(node['input_shape'], layout)
if 'output_shape' in node:
node['output_shape'] = _layout_to_nchw(node['output_shape'], layout)
def _clac_shape(in_shape, kernel, pad, stride, channels):
@ -135,75 +89,48 @@ def _clac_shape(in_shape, kernel, pad, stride, channels):
def _infer_shape(hwgraph):
for i, node in enumerate(hwgraph):
if node['op'] in ['input', 'data']:
continue
if len(node['inputs']) < 1:
if node['op'] == 'input':
continue
first_input = node['inputs'][0]
input_shape = hwgraph[first_input]['shape'][-1]
input_shape = hwgraph[first_input]['shape']
node['input_shape'] = input_shape
output_shape = node['shape'][-1]
node['output_shape'] = output_shape
if node['op'] == 'conv2d':
attrs = node['attrs']
assert len(input_shape) > 0
kernel_shape = node['shape'][1]
attrs['kernel_size'] = [kernel_shape[0],kernel_shape[1]]
attrs['channels'] = kernel_shape[3]
attrs['padding'] = [int(kernel_shape[0]/2), int(kernel_shape[1]/2)]
output_shape = node['shape'][-1]
attrs['strides'] = [int(input_shape[1]/output_shape[1]), int(input_shape[2]/output_shape[2])]
#node['shape'] = _clac_shape(input_shape, attrs['kernel_size'],
#attrs.get('padding', [0, 0]), attrs['strides'], attrs['channels'])
node['shape'] = _clac_shape(input_shape, attrs['kernel_size'],
attrs.get('padding', [0, 0]), attrs['strides'], attrs['channels'])
elif node['op'] in ['max_pool2d', 'avg_pool2d']:
attrs = node['attrs']
#padding = attrs.get('padding', [0, 0])
if node['op'] in ['max_pool2d']:
attrs['pool_size'] = [3, 3]
else:
attrs['pool_size'] = [7, 7]
attrs['padding'] = [0, 0, int(attrs['pool_size'][0]/2), int(attrs['pool_size'][1]/2)]
attrs['strides'] = [int(output_shape[1]/input_shape[1]), int(output_shape[2]/input_shape[2])]
#node['shape'] = _clac_shape(input_shape, attrs['pool_size'],
#attrs['padding'], attrs['strides'], input_shape[1])
padding = attrs.get('padding', [0, 0])
node['shape'] = _clac_shape(input_shape, attrs['pool_size'],
padding, attrs['strides'], input_shape[1])
elif node['op'] in ['global_avg_pool2d', 'mean']:
#node['output_shape'] = [input_shape[0], input_shape[1], 1, 1]
continue
node['shape'] = [input_shape[0], input_shape[1], 1, 1]
elif node['op'] == 'flatten':
#node['output_shape'] = input_shape[:2]
continue
node['shape'] = input_shape[:2]
elif node['op'] == 'dense':
# TODO
if len(input_shape) == 0:
#node['output_shape'] = [1, node['attrs']['units']]
node['shape'] = [1, node['attrs']['units']]
continue
#node['shape'] = [input_shape[0], node['attrs']['units']]
elif node['op'] in ['pad', 'batch_norm', 'relu', 'eletwise_add', 'broadcast_add', 'softmax']:
#node['shape'] = input_shape
continue
node['shape'] = [input_shape[0], node['attrs']['units']]
elif node['op'] in ['pad', 'batch_norm', 'relu', 'elemwise_add', 'broadcast_add', 'softmax']:
node['shape'] = input_shape
# remove input node
new_hwgraph = []
index_map = {}
new_index = 0
for i, node in enumerate(hwgraph):
if node['op'] not in ['input', 'data']:
if node['op'] != 'input':
new_hwgraph.append(node)
index_map[i] = new_index
new_index += 1
# reset node inputs
for i, node in enumerate(new_hwgraph):
if len(node['inputs']) < 1:
continue
if hwgraph[node['inputs'][0]]['op'] in ['input', 'data']:
if hwgraph[node['inputs'][0]]['op'] == 'input':
inputs = []
else:
inputs = [index_map[inp] for inp in node['inputs']]
@ -215,16 +142,12 @@ def convert(graph, shape_dict, layout):
"""convert nnvm graph to hardware supported graph"""
_parse_inputs(graph)
_parse_op(graph)
_remove_null_op_inputs(graph, shape_dict)
hwgraph = _fuse_ops(graph, shape_dict, layout)
hwgraph = _infer_shape(hwgraph)
_parse_shape_to_hchw(hwgraph, shape_dict, layout)
_parse_shape_to_nchw(hwgraph, shape_dict, layout)
#print(hwgraph)
hwgraph = _infer_shape(hwgraph)
return hwgraph
return hwgraph

@ -2,48 +2,54 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tvm
import tvm.relay
import nnvm
import nnvm.graph
from nnvm.compiler import graph_attr, graph_util
import re
import json
FRAME_SUPPORTED = ('tensorflow', 'keras', 'mxnet', 'onnx')
ATTR_NEED_CONVERT = ("num_outputs", "num_inputs", "flatten_data")
ATTR_NEED_CONVERT = ('channels', 'dilation', 'kernel_size',
'padding', 'strides', 'pool_size',
'axis', 'pad_value', 'pad_width',
'__shape__', 'shape', 'groups')
def _str_to_list(src):
matchs = re.findall(r'([\[|\(]-*?\d+.*?[\]|\)])', src)
ret = []
for match in matchs:
sub_matchs = re.findall(r'(-*?\d+)', match)
ret.append([i for i in map(int, sub_matchs)])
if len(ret) == 0:
ret = int(src)
elif len(ret) == 1:
ret = ret[0]
return ret
def _attrstr_to_number(graph):
nodes = graph['nodes']
for node in nodes:
index = graph.index
for node in index.nodes:
if 'attrs' not in node:
continue
for attr_name, attr_val in node['attrs'].items():
if attr_name in ATTR_NEED_CONVERT:
node['attrs'][attr_name] = int(attr_val)
node['attrs'][attr_name] = _str_to_list(attr_val)
def to_tvm(graph, shape_dict, layout, mode='tensorflow'):
def to_nnvm(graph, shape_dict, layout, mode='tensorflow'):
"""convert frontend graph to nnvm graph"""
assert mode in FRAME_SUPPORTED
if mode == 'tensorflow':
mod, params = tvm.relay.frontend.from_tensorflow(graph, layout=layout, shape=shape_dict)
sym, params = nnvm.frontend.from_tensorflow(graph, layout=layout, shape=shape_dict)
elif mode == 'keras':
mod, params = tvm.relay.frontend.from_keras(graph)
sym, params = nnvm.frontend.from_keras(graph)
elif mode == 'mxnet':
mod, params = tvm.relay.frontend.from_mxnet(graph)
sym, params = nnvm.frontend.from_mxnet(graph)
sym = nnvm.sym.softmax(sym)
else:
mod, params = tvm.relay.frontend.from_onnx(graph)
mod = tvm.relay.transform.InferType()(mod)
target = 'llvm'
target_host = 'llvm'
with tvm.relay.build_config(opt_level=0):
tvm_graph_json, lib, params = tvm.relay.build(mod, target=target, target_host=target_host, params=params)
sym, params = nnvm.frontend.from_onnx(graph)
nnvm_graph = nnvm.graph.create(sym)
#with open("./json/resnet_v1_50_tvm_0.json", 'w') as fp:
#fp.write(tvm_graph)
tvm_graph = json.loads(tvm_graph_json)
_attrstr_to_number(tvm_graph)
_attrstr_to_number(nnvm_graph)
return tvm_graph, params
return nnvm_graph, params

@ -40,10 +40,9 @@ def _shape_to_size(shape, maxbatch, hwbatch):
def _calc_inout_size(graph):
for node in graph:
print(node)
node['input_size'] = _shape_to_size(node['input_shape'],
MAX_HW_BATCH, HW_BATCH_SIZE)
node['output_size'] = _shape_to_size(node['output_shape'], MAX_HW_BATCH, HW_BATCH_SIZE)
node['output_size'] = _shape_to_size(node['shape'], MAX_HW_BATCH, HW_BATCH_SIZE)
def _alloc_param_memory(graph):
@ -53,9 +52,9 @@ def _alloc_param_memory(graph):
if node['op'] == 'conv2d':
node['filter_addr'] = filter_addr
attrs = node['attrs']
filter_addr += _aligned(attrs['channels'], 64) * _aligned(node['output_shape'][1], 64) \
filter_addr += _aligned(attrs['channels'], 64) * _aligned(node['shape'][1], 64) \
* attrs['kernel_size'][0] * attrs['kernel_size'][1]
if node['op'] in ['conv2d', 'relu', 'max_pool2d', 'eltwise_add',
if node['op'] in ['conv2d', 'relu', 'max_pool2d', 'elemwise_add',
'boardcast_add', 'mean', 'global_avg_pool2d']:
node['quant_addr'] = quant_addr
quant_addr += 256 * 2
@ -69,7 +68,7 @@ def _alloc_feature_memory(graph):
feature_addr[iotype] += node['input_size']
node['output_addr'] = feature_addr[1 - iotype]
feature_addr[1 - iotype] += node['output_size']
if node['op'] == 'eltwise_add':
if node['op'] == 'elemwise_add':
node['residual_addr'] = graph[node['parents'][1]]['output_addr']
def _alloc_memory(graph):
@ -91,8 +90,8 @@ def _convert_to_assem_graph(graph):
node['batch_size'] = HW_BATCH_SIZE
_, node['input_channel'], node['input_height'], node['input_width'] = node['input_shape'] \
if len(node['input_shape']) == 4 else node['input_shape'] + [1, 1]
_, node['output_channel'], node['output_height'], node['output_width'] = node['output_shape'] \
if len(node['output_shape']) == 4 else node['output_shape'] + [1, 1]
_, node['output_channel'], node['output_height'], node['output_width'] = node['shape'] \
if len(node['shape']) == 4 else node['shape'] + [1, 1]
if node['op'] == 'conv2d':
node['instr_name'] = 'CONV'
attrs = node['attrs']
@ -111,13 +110,8 @@ def _convert_to_assem_graph(graph):
attrs = node['attrs']
node['kernel_size'] = attrs['pool_size'][0]
node['stride'] = attrs['strides'][0]
elif node['op'] in ['eltwise_add', 'broadcast_add']:
elif node['op'] in ['elemwise_add', 'broadcast_add']:
node['instr_name'] = 'ELTWISE'
runid = min(node['parents'])
if graph[runid]['parents'] >= graph[runid+1]['parents']:
node['eltwise_type'] = 1
else:
node['eltwise_type'] = 2
elif node['op'] in ['global_avg_pool2d', 'mean', 'avg_pool2d']:
node['instr_name'] = 'POOL'
node['pool_type'] = 1

@ -4,9 +4,6 @@ max_hw_batch=1
instrs_addr=0x0
weights_addr=0x220000000
quant_addr=0x210000000
input_feature_addr=0x1000000
output_feature_addr=0xc000000
per_output_feature_size=2048
fullconv_weight_addr=0x168d000
fullconv_quant_addr=0x1ac40
class_num=1000
input_feature_addr=0x100000
output_feature_addr=0x100000000
per_output_feature_size=100

@ -4,13 +4,13 @@ from __future__ import print_function
import os
import cv2
import numpy as np
import configparser
import numpy as np
import math
from inna.runtime import _runtime
import argparse
import pdb
def check_equal(tname, test, real):
"""
@ -43,38 +43,6 @@ def _softmax(x):
return ex / np.sum(ex, axis=1)
def _fullconv(x, weights, weights_addr, quant, quant_addr, class_num):
#pdb.set_trace()
x = x.astype(np.int8)
np_weight = weights[weights_addr:]
np_weight = np_weight.astype(np.int8)
weight_size = np_weight.shape[0]
np_quant = quant[quant_addr:]
quant_table = np_quant[0]
np_quant_int32 = np.fromstring(np_quant[64:].tostring(), dtype=np.int32)
np_quant_int32_sort = np_quant_int32[:class_num]
np_weight = np_weight.reshape([class_num, int(weight_size/class_num)])
x.tofile("x.bin")
np_weight.tofile("weight.bin")
np_quant.tofile("bias.bin")
np_weight = np_weight.astype(np.int32)
x = x.astype(np.int32)
y = np.zeros((x.shape[0], class_num), dtype=np.float)
for k in range(x.shape[0]):
temp = np.dot(np_weight, x[k])
temp = temp + np_quant_int32_sort
temp = temp / (2 ** quant_table)
y[k] = temp
#print(" fullconv y=")
#print(y)
return y
def _feature_reshape(data, layout='CHW'):
assert layout in ['CHW', 'HWC']
if layout == 'CHW':
@ -88,11 +56,7 @@ def _feature_reshape(data, layout='CHW'):
if end > c:
end = c
cnum = end - start
#outdata[oci, :, :, 0: cnum] = data[:, :, start: end]
for i in range(cnum):
outdata[oci, :, :, 63-i] = data[:, :, i]
outdata[oci, :, :, 0: cnum] = data[:, :, start: end]
return outdata
@ -122,13 +86,7 @@ def _load_images(img_dir, batch_size):
for filename in os.listdir(img_dir):
if filename.endswith('.JPEG'):
img = cv2.imread(img_dir + filename)
img = cv2.resize(img, (224, 224)) #, interpolation=cv2.INTER_CUBIC)
#print(img.shape)
img = img.astype(np.float)
img[..., 0] = (img[..., 0] - 103.939) / 2.0
img[..., 1] = (img[..., 1] - 116.779) / 2.0
img[..., 2] = (img[..., 2] - 123.68) / 2.0
img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)
img = _feature_reshape(img, 'HWC')
imgs.append(img)
cur_img += 1
@ -147,7 +105,6 @@ def _aligned64(arr):
arr = np.concatenate((arr, pad_data), axis=0)
return arr
def _get_addrs():
instrs_addr = int(config.get('config', 'instrs_addr'), 16)
assert instrs_addr % 64 == 0
@ -157,7 +114,6 @@ def _get_addrs():
assert quant_addr % 64 == 0
return instrs_addr, weights_addr, quant_addr
def _create_model_datainfo(instrs, weights, quant_table):
instrs_addr, weights_addr, quant_addr = _get_addrs()
instrs = _aligned64(instrs)
@ -165,7 +121,6 @@ def _create_model_datainfo(instrs, weights, quant_table):
quant_table = _aligned64(quant_table)
return [instrs_addr, instrs, weights_addr, weights, quant_addr, quant_table]
class INNARuntime(_runtime.INNARuntime):
def __init__(self, instrs, weights, quant_table):
"""
@ -182,11 +137,6 @@ class INNARuntime(_runtime.INNARuntime):
self._ifeature_addr = int(config.get('config', 'input_feature_addr'), 16)
self._ofeature_addr = int(config.get('config', 'output_feature_addr'), 16)
self._ofeature_size = config.getint('config', 'per_output_feature_size')
self._fullconv_weight_addr = int(config.get('config', 'fullconv_weight_addr'), 16)
self._fullconv_quant_addr = int(config.get('config', 'fullconv_quant_addr'), 16)
self._class_num = config.getint('config', 'class_num')
self._weights = weights
self._quant = quant_table
self._model = _create_model_datainfo(instrs, weights, quant_table)
self.ReloadNNModel(*self._model)
@ -205,7 +155,7 @@ class INNARuntime(_runtime.INNARuntime):
result of neural network model
"""
batchs = _load_images(img_dir, batch_size)
outputs = np.zeros((1, self._class_num), dtype=np.uint8)
outputs = np.zeros((1, self._ofeature_size), dtype=np.uin8)
for batchi, batch_imgs in batchs:
print(batchi, batch_imgs.shape)
hwalign = batch_imgs.shape[0] % HW_BATCH_SIZE
@ -216,35 +166,15 @@ class INNARuntime(_runtime.INNARuntime):
max_img_range = min((i + 1) * MAX_BATCH_SIZE, batch_imgs.shape[0])
hw_batch_imgs = batch_imgs[i * MAX_BATCH_SIZE: max_img_range]
print(hw_batch_imgs.shape, int(hw_batch_imgs.shape[0] / HW_BATCH_SIZE))
hw_batch_imgs.tofile("input_data")
self.SetExtendHWBatch(int(hw_batch_imgs.shape[0] / HW_BATCH_SIZE))
self.SetInputFeatures(self._ifeature_addr, hw_batch_imgs.reshape(-1))
self.Run()
self.Wait()
#####
'''
ofeatures_addr = 0x1000000
ofeatures_size = 0x1000000
for j in range(119):
ofeatures_addr = ofeatures_addr + 0x1000000
if ofeatures_addr == 0xc000000:
ofeatures_addr = ofeatures_addr + 0x1000000
ofeatures = self.GetOutputFeatures(ofeatures_addr,hw_batch_imgs.shape[0] * ofeatures_size)
filename = os.path.dirname(os.path.abspath(__file__)) + '/mid_output/runid_' + str(j)
with open(filename, 'wb') as fp:
fp.write(ofeatures.tostring())
'''
#####
ofeatures = self.GetOutputFeatures(self._ofeature_addr,
hw_batch_imgs.shape[0] * self._ofeature_size)
ofeatures = self.GetOutputFeatures(self._ofeature_addr,
hw_batch_img.shape[0] * self._ofeature_size)
ofeatures = ofeatures.reshape(hw_batch_imgs.shape[0], -1)
#print(ofeatures)
#pdb.set_trace()
ofeatures = _fullconv(ofeatures, self._weights, self._fullconv_weight_addr, self._quant, self._fullconv_quant_addr, self._class_num)
ofeatures = _softmax(ofeatures)
outputs = np.row_stack((outputs, ofeatures))
ofeatures = _softmax(ofeature)
outputs = np.row_stack(outputs, ofeatures)
return outputs[1:]
@ -271,51 +201,7 @@ def create(instrs, weights, quant_table):
if __name__ == '__main__':
#from inna import runtime
#aa = np.array([124, 125], dtype=np.uint8)
#runtime = runtime.create(aa, aa, aa)
#runtime.run('images', 10)
parser = argparse.ArgumentParser()
parser.add_argument("path", default='~/resnet50_param/', help="input file path")
args = parser.parse_args()
basepath = args.path
instrs = np.fromfile(basepath+'instruction_conv.bin', dtype=np.uint8)
filters = np.fromfile(basepath + 'filter.bin', dtype=np.uint8)
quant_table = np.fromfile(basepath + 'quant.bin', dtype=np.uint8)
with open(basepath+'val.txt', 'r') as fp:
vtable = fp.read().split('\n')
vtable = [int(data.split(' ')[1]) for data in vtable if data!='']
#runtime = runtime.create(instrs, filters, quant_table)
runtime = create(instrs, filters, quant_table)
output = runtime.run('images', 1)
top1 = 0
top5 = 0
for i in range(int(output.shape[0])):
list_a = output[i].tolist()
max_v = max(list_a)
max_v_index = list_a.index(max(list_a))
print(i, end=" ")
if vtable[i] == max_v_index:
top1 += 1
top5 += 1
print("top1", max_v_index, max_v)
continue
else:
for j in range(4):
list_a[max_v_index] = 0
max_v = max(list_a)
max_v_index = list_a.index(max(list_a))
if vtable[i] == max_v_index:
top5 += 1
print("top5", max_v_index, max_v)
break
print("")
print("top1=",top1)
print("top5=",top5)
from inna import runtime
aa = np.array([124, 125], dtype=np.uint8)
runtime = runtime.create(aa, aa, aa)
runtime.run('images', 10)

@ -3,7 +3,7 @@ numpy
decorator
attrs
opencv-python
tensorflow>=1.14.0
tensorflow>=1.8.0
keras
mxnet
torch

@ -1,6 +1,13 @@
[
{"runid": 0, "instr_name": "CONV", "parents": [], "children": [1], "siblings": [], "batch_size": 1, "conv_type": 0, "kernel_size": 7, "h_pad": 3, "v_pad": 3, "stride": 2, "input_width": 224, "input_height": 224, "input_channel": 3, "output_channel": 64, "input_addr": "0x400000", "filter_addr": "0x500000", "quant_addr": "0x600000", "output_addr": "0x700000", "input_size": 3211264, "output_size": 802816, "width_shift": 0, "width_size": 0, "height_shift": 0, "height_size": 0, "channel_shift": 0 },
{"runid": 1, "instr_name": "ACTIVE", "parents": [0], "children": [2], "siblings": [], "batch_size": 1, "active_type": 1, "input_width": 112, "input_height": 112, "input_channel": 64, "output_width": 112, "output_height": 112, "output_channel": 64, "input_addr": "0x700000", "quant_addr": "0x600000", "output_addr": "0x400000", "input_size": 802816, "output_size": 802816},
{"runid": 2, "instr_name": "POOL", "parents": [1], "children": [3], "siblings": [], "batch_size": 1, "pool_type": 3, "kernel_size": 3, "stride": 2, "input_width": 112, "input_height": 112, "input_channel": 64, "input_addr": "0x400000", "quant_addr": "0x600000", "output_addr": "0x700000", "input_size": 802816, "output_size": 200704},
{"runid": 3, "instr_name": "ELTWISE", "parents": [1, 2], "children": [], "siblings": [], "batch_size": 1, "input_width": 56, "input_height": 56, "input_channel": 256, "input_addr": "0x400000", "residual_addr": "0x800000", "quant_addr": "0x600100", "output_addr": "0x700000", "input_size": 802816, "output_size": 802816},
{"op": "null", "runid": 0, "instr_name": "CONV", "parents": [], "children": [1], "siblings": [], "batch_size": 4, "conv_type": 6, "kernel_size": 7, "h_pad": 3, "v_pad": 3, "stride": 2, "input_width": 224, "input_height": 224, "input_channel": 3, "output_channel": 64, "input_addr": "0x1", "filter_addr": "0x2", "quant_addr": "0x3", "output_addr": "0x4", "input_size": 8, "output_size": 8, "width_shift": 0, "width_size": 0, "height_shift": 0, "height_size": 0, "channel_shift": 0 },
{"op": "null", "runid": 1, "instr_name": "ACTIVE", "parents": [0], "children": [2], "siblings": [], "batch_size": 3, "active_type": 1, "input_width": 112, "input_height": 112, "input_channel": 64, "output_width": 0, "output_height": 0, "output_channel": 0, "input_addr": "0x6", "quant_addr": "0x8", "output_addr": "0x7", "input_size": 8, "output_size": 8},
{"op": "null", "runid": 2, "instr_name": "POOL", "parents": [1], "children": [3, 4], "siblings": [], "batch_size": 2, "pool_type": 3, "kernel_size": 5, "stride": 1, "input_width": 57, "input_height": 58, "input_channel": 59, "input_addr": "0x81", "quant_addr": "0x8", "output_addr": "0x91", "input_size": 8, "output_size": 8},
{"op": "null", "runid": 3, "instr_name": "CONV", "parents": [2], "children": [9], "siblings": [4], "batch_size": 4, "conv_type": 1, "kernel_size": 1, "h_pad": 0, "v_pad": 0, "stride": 1, "input_width": 56, "input_height": 56, "input_channel": 64, "output_channel": 256, "input_addr": "0x10", "filter_addr": "0x11", "quant_addr": "0x12", "output_addr": "0x13", "input_size": 8, "output_size": 8, "width_shift": 0, "width_size": 0, "height_shift": 0, "height_size": 0, "channel_shift": 0 },
{"op": "null", "runid": 4, "instr_name": "CONV", "parents": [2], "children": [5], "siblings": [3], "batch_size": 3, "conv_type": 6, "kernel_size": 1, "h_pad": 0, "v_pad": 0, "stride": 1, "input_width": 56, "input_height": 56, "input_channel": 64, "output_channel": 256, "input_addr": "0x14", "filter_addr": "0x15", "quant_addr": "0x16", "output_addr": "0x17", "input_size": 8, "output_size": 8, "width_shift": 0, "width_size": 0, "height_shift": 0, "height_size": 0, "channel_shift": 0 },
{"op": "null", "runid": 5, "instr_name": "ACTIVE", "parents": [4], "children": [6], "siblings": [], "batch_size": 4, "active_type": 2, "input_width": 53, "input_height": 54, "input_channel": 55, "output_width": 1, "output_height": 2, "output_channel": 3, "input_addr": "0x54", "quant_addr": "0x8", "output_addr": "0x55", "input_size": 8, "output_size": 8},
{"op": "null", "runid": 6, "instr_name": "CONV", "parents": [5], "children": [7], "siblings": [], "batch_size": 4, "conv_type": 6, "kernel_size": 3, "h_pad": 0, "v_pad": 0, "stride": 1, "input_width": 56, "input_height": 56, "input_channel": 64, "output_channel": 256, "input_addr": "0x18", "filter_addr": "0x19", "quant_addr": "0x20", "output_addr": "0x21", "input_size": 8, "output_size": 8, "width_shift": 0, "width_size": 0, "height_shift": 0, "height_size": 0, "channel_shift": 0 },
{"op": "null", "runid": 7, "instr_name": "ACTIVE", "parents": [6], "children": [8], "siblings": [], "batch_size": 4, "active_type": 1, "input_width": 112, "input_height": 112, "input_channel": 64, "output_width": 0, "output_height": 0, "output_channel": 0, "input_addr": "0x22", "quant_addr": "0x8", "output_addr": "0x23", "input_size": 25690112, "output_size": 25690112 },
{"op": "null", "runid": 8, "instr_name": "CONV", "parents": [7], "children": [9], "siblings": [], "batch_size": 4, "conv_type": 6, "kernel_size": 1, "h_pad": 0, "v_pad": 0, "stride": 1, "input_width": 56, "input_height": 56, "input_channel": 64, "output_channel": 256, "input_addr": "0x24", "filter_addr": "0x25", "quant_addr": "0x26", "output_addr": "0x27", "input_size": 8, "output_size": 8, "width_shift": 0, "width_size": 0, "height_shift": 0, "height_size": 0, "channel_shift": 0 },
{"op": "null", "runid": 9, "instr_name": "ELTWISE", "parents": [3, 8], "children": [10], "siblings": [], "batch_size": 3, "input_width": 200, "input_height": 201, "input_channel": 202, "input_addr": "0x98", "residual_addr": "0x99", "quant_addr": "0x8", "output_addr": "0x100", "input_size": 8, "output_size": 8},
{"op": "null", "runid": 10, "instr_name": "ACTIVE", "parents": [9], "children": [], "siblings": [], "batch_size": 4, "active_type": 1, "input_width": 112, "input_height": 112, "input_channel": 64, "output_width": 0, "output_height": 0, "output_channel": 0, "input_addr": "0x31", "quant_addr": "0x8", "output_addr": "0x32", "input_size": 25690112, "output_size": 25690112 }
]

@ -15,7 +15,7 @@ cd ..
cd python; python setup.py install; cd ..
cd topi/python; python setup.py install; cd ../..
#cd nnvm/python; python setup.py install; cd ../..
cd nnvm/python; python setup.py install; cd ../..
cd ..

Loading…
Cancel
Save