Compare commits

..

1 Commits

Author SHA1 Message Date
caoqichun 223de81530 add version v_nnvm
6 years ago

@ -1,21 +1,30 @@
![Image text](https://gitee.com/inspur-inna/inspur-inna/raw/master/Image/inspur.png)
![Image text](https://github.com/inspur-inna/inspur-inna/blob/master/Image/inspur.png)
# 基于FPGA的CNN自适应映射技术——inna1.0
# 基于FPGA的CNN自适应映射技术---inspur-inna
基于宏指令的Look-Aside Acceleration框架
- 一键式快速部署
- 软硬件协同优化
- 支持多种卷积
- 执行过程无需主机干预
基于FPGA板卡设计深度学习加速器并进行优化在整体性能和功耗方面拟达到业界领先水平映射技术采用宏指令的Look-Aside Acceleration框架实现了一键式快速部署、软硬件协同优化、支持多种卷积、执行过程无需主机干预。本项目为映射技术的软件端拟实现CNN映射编译器和CNN量化器首先由TensorFlow产生的模型文件解析产生CNN的计算图模型CNN映射编译器会根据解析的计算图和现有的CNN加速库单元选择相应的CNN库单元生成相应的硬件结构和相应的调度器的配置参数以达到计算、片上存储、片上带宽和片外带宽的均衡从而达到最优的计算性能CNN量化器可根据模型的权重文件对各层数据进行8位定点量化以便于FPGA的DSP计算从而在保证精度的前提下降低存储开销提高处理速度降低功耗。
## Install
### inna install
TVM need LLVMLLVM install in Ubuntuother system require source code compilation
### TVM source code install
LLVM install in Ubuntu
```bash
apt search llvm
apt install llvm-6.0
apt install clang-6.0
```
Install miniconda for python=3.6install_inna.sh include TVM install scriptrefer to TVM <https://tvm.apache.org/docs/install/from_source.html>
TVM Install Source<https://tvm.apache.org/docs/install/from_source.html>
### inna install
Install miniconda for python=3.6
```bash
conda create -n inna python=3.6 ipykernel -y
conda activate inna

@ -10,18 +10,18 @@ import os
def resnet_v1_50():
with tf.Graph().as_default():
model = {
'pb': '../models/tensorflow/resnet/resnet50_without_bn_test.pb',
'pb': '../models/tensorflow/resnet/frozen_resnet_v1_50.pb',
'shape_dict': {
'input': (1, 224, 224, 3),
'input': (1, 224, 224, 3),
},
'layout': 'NHWC',
'out_node': 'prob',
'out_node': 'resnet_v1_50/predictions/Reshape_1',
}
with tf.gfile.GFile(model['pb'], 'rb') as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
graph = tf.import_graph_def(graph_def, name='')
# Call the utility to import the graph definition into default graptributeError: module 'tensorflow.tools.api.generator.api.compat' has no attribute 'v1'h.
# Call the utility to import the graph definition into default graph.
graph_def = tf_testing.ProcessGraphDefParam(graph_def)
with tf.Session() as sess:

@ -30,8 +30,6 @@ SET_REG_HISTORY = {}
CHILDREN_HISTORY = {}
INSTR_CODE_DICT = {
'CONV': 0x000,
'SPLIT': 0x100,
'CONCAT': 0x101,
'ACTIVE': 0x107,
'ELTWISE': 0x108,
'MOVH': 0x200,
@ -44,13 +42,13 @@ INSTR_CODE_DICT = {
# register can be classified as | wait register | param regitster | set regitster |
def _generate_regs_params(instr_name, runid=0, parents=[], children=[], siblings=[], batch_size=4, conv_type=0,
active_type=0, pool_type=0, eltwise_type=0, concat_channel=0, input_width=0, input_height=0, input_channel=0, input_mat_width=0,
active_type=0, pool_type=0, input_width=0, input_height=0, input_channel=0, input_mat_width=0,
input_mat_height=0, input_vect_width=0, input_vect_height=0, output_width=0, output_height=0,
output_channel=0, input_addr=0, filter_addr=0, quant_addr=0, residual_addr=0, output_addr=0,
data_addr=0, input_mat_addr=0, input_vect_addr=0, input_bias_addr=0, output_mat_addr=0,
h_pad=0, v_pad=0, stride=1, kernel_size=1, input_size=0, output_size=0, width_shift=0,
width_size=0, height_shift=0, height_size=0, channel_shift=0, channel_size=0, op='', name='',
attrs={}, shape=[], input_shape=[],output_shape=[]):
attrs={}, shape=[], input_shape=[]):
regs_params = []
input_feature_size4 = int(input_size / MAX_HW_BATCH)
output_feature_size4 = int(output_size / MAX_HW_BATCH)
@ -67,16 +65,8 @@ def _generate_regs_params(instr_name, runid=0, parents=[], children=[], siblings
reg2 = ((invalid_wait & 0x3) << 56) | ((batch_size & 0xFF) << 48) | (input_addr & ADDR_MASK48)
reg3 = ((width_shift & 0xFFF) << 48) | (filter_addr & ADDR_MASK48)
reg4 = ((width_size & 0xFFF) << 48) | (quant_addr & ADDR_MASK48)
'''
#before 20191128, add activate into conv
#reg5 = output_addr & ADDR_MASK48
#reg6 = ((conv_type & 0x7) << 61) | ((kernel_size & 0xF) << 57) | ((h_pad & 0x7) << 54) | (
# (v_pad & 0x7) << 51) | ((stride & 0x7) << 48) | ((input_width & 0xFFF) << 36) | (
# (input_height & 0xFFF) << 24) | ((input_channel & 0xFFF) << 12) | (output_channel & 0xFFF)
'''
reg5 = ((active_type & 0x7) << 52) | ((conv_type & 0xf) << 48) | (output_addr & ADDR_MASK48)
reg6 = ((kernel_size & 0xF) << 57) | ((h_pad & 0x7) << 54) | (
reg5 = output_addr & ADDR_MASK48
reg6 = ((conv_type & 0x7) << 61) | ((kernel_size & 0xF) << 57) | ((h_pad & 0x7) << 54) | (
(v_pad & 0x7) << 51) | ((stride & 0x7) << 48) | ((input_width & 0xFFF) << 36) | (
(input_height & 0xFFF) << 24) | ((input_channel & 0xFFF) << 12) | (output_channel & 0xFFF)
reg7 = ((height_shift & 0xFFF) << 36) | ((height_size & 0xFFF) << 24) | ((channel_shift & 0xFFF) << 12) | (
@ -87,7 +77,7 @@ def _generate_regs_params(instr_name, runid=0, parents=[], children=[], siblings
runid, instr_name, reg2, reg3, reg4, reg5, reg6, reg7)
regs_params.append([reg2, reg3, reg4, reg5, reg6, reg7])
elif instr_name in ['ACTIVE', 'ATTACH', 'RESHAPE', 'SPLIT']:
elif instr_name in ['ACTIVE', 'ATTACH', 'CONCAT', 'RESHAPE', 'SPLIT']:
for _ in range(MAX_HW_BATCH):
reg2 = ((invalid_wait & 0x3) << 56) | ((batch_size & 0xFF) << 48) | (input_addr & ADDR_MASK48)
reg3 = ((active_type & 0x7) << 48) | (output_addr & ADDR_MASK48)
@ -103,22 +93,12 @@ def _generate_regs_params(instr_name, runid=0, parents=[], children=[], siblings
output_addr += output_feature_size4
regs_params.append([reg2, reg3, reg4, reg5, reg6, reg7])
elif instr_name in ['CONCAT']:
for _ in range(MAX_HW_BATCH):
reg2 = ((invalid_wait & 0x3) << 56) | ((batch_size & 0xFF) << 48) | (input_addr & ADDR_MASK48)
reg3 = (output_addr & ADDR_MASK48)
reg4 = ((concat_channel & 0xFFF) << 48) | ((input_width & 0xFFF) << 36) | ((input_height & 0xFFF) << 24) | ((input_channel & 0xFFF) << 12)
logger.info('generate register parameters %d %s reg2=%#x reg3=%#x', runid, instr_name, reg2, reg3, reg4)
input_addr += input_feature_size4
output_addr += input_feature_size4 + output_feature_size4
regs_params.append([reg2, reg3, reg4])
elif instr_name in ['ELTWISE']:
for _ in range(MAX_HW_BATCH):
reg2 = ((invalid_wait & 0x3) << 56) | ((batch_size & 0xFF) << 48) | (input_addr & ADDR_MASK48)
reg3 = ((input_width & 0xFFF) << 48) | (quant_addr & ADDR_MASK48)
reg4 = ((eltwise_type & 0x3) << 60) | ((input_height & 0xFFF) << 48) | (residual_addr & ADDR_MASK48)
reg5 = ((active_type & 0x3) << 60) | ((input_channel & 0xFFF) << 48) | (output_addr & ADDR_MASK48)
reg4 = ((input_height & 0xFFF) << 48) | (residual_addr & ADDR_MASK48)
reg5 = ((input_channel & 0xFFF) << 48) | (output_addr & ADDR_MASK48)
logger.info('generate register parameters %d %s reg2=%#x reg3=%#x reg4=%#x reg5=%#x',
runid, instr_name, reg2, reg3, reg4, reg5)
input_addr += input_feature_size4
@ -146,12 +126,10 @@ def _generate_regs_params(instr_name, runid=0, parents=[], children=[], siblings
return regs_params
def _allocate_physical_regs(runid, instr_name, parents, children, type=0):
if instr_name in ['CONV', 'ACTIVE', 'ATTACH', 'RESHAPE', 'SPLIT']:
def _allocate_physical_regs(runid, instr_name, parents, children):
if instr_name in ['CONV', 'ACTIVE', 'ATTACH', 'CONCAT', 'RESHAPE', 'SPLIT']:
reg_num = 8
elif instr_name in ['ELTWISE']:
reg_num = 7
elif instr_name in ['POOL', 'CONCAT']:
elif instr_name in ['ELTWISE', 'POOL']:
reg_num = 6
else:
print('Error: {} not support!'.format(instr_name))
@ -174,7 +152,7 @@ def _allocate_physical_regs(runid, instr_name, parents, children, type=0):
sys.exit(1)
batchi_regs = [0 for _ in range(reg_num)]
num = 1 if instr_name not in ['ELTWISE', 'CONCAT'] else 2
num = 1
regid = 0
needed_reg_num = reg_num - 1 if len(children) == 0 else reg_num
while num < needed_reg_num:
@ -186,34 +164,6 @@ def _allocate_physical_regs(runid, instr_name, parents, children, type=0):
regid += 1
# set wait register
if len(parents) > 0:
# max_parent_id = max(parents)
for i, parent_id in enumerate(parents):
if parent_id in SET_REG_HISTORY.keys() and len(CHILDREN_HISTORY[parent_id]):
wait_regid = SET_REG_HISTORY[parent_id][batchi]
batchi_regs[i] = wait_regid
if len(parents) > 1 and type == 1:
max_parent_id = max(parents)
for i, parent_id in enumerate(parents):
if parent_id in SET_REG_HISTORY.keys() and len(CHILDREN_HISTORY[parent_id]):
wait_regid = SET_REG_HISTORY[max_parent_id][batchi]
batchi_regs[i] = wait_regid
if len(parents) > 0:
max_parent_id = max(parents)
if max_parent_id in SET_REG_HISTORY.keys() and len(CHILDREN_HISTORY[max_parent_id]):
# free parents' set regitster
max_brother_id = max(CHILDREN_HISTORY[max_parent_id])
if runid == max_brother_id:
for parent_id in parents:
parent_regid = SET_REG_HISTORY[parent_id][batchi]
REG_INUSED[parent_regid] = False
FREE_REG_NUM += 1
if batchi == MAX_HW_BATCH - 1:
del SET_REG_HISTORY[parent_id]
del CHILDREN_HISTORY[parent_id]
'''
if len(parents) > 0:
max_parent_id = max(parents)
if max_parent_id in SET_REG_HISTORY.keys() and len(CHILDREN_HISTORY[max_parent_id]):
@ -230,9 +180,8 @@ def _allocate_physical_regs(runid, instr_name, parents, children, type=0):
if batchi == MAX_HW_BATCH - 1:
del SET_REG_HISTORY[parent_id]
del CHILDREN_HISTORY[parent_id]
'''
PARAM_REG_HISTORY = batchi_regs[1:-1] if instr_name not in ['ELTWISE', 'CONCAT'] else batchi_regs[2:-1]
PARAM_REG_HISTORY = batchi_regs[1:-1]
set_regs.append(batchi_regs[-1])
regs.append(batchi_regs)
@ -248,19 +197,11 @@ def _allocate_physical_regs(runid, instr_name, parents, children, type=0):
def _pseudo_to_assems(instr_name, regs, reg_params):
assems = []
reg_param_num = len(reg_params)
if instr_name in ['ELTWISE', 'CONCAT']:
for i in range(reg_param_num):
assem = ['MOVH', regs[i + 2], reg_params[i] >> 32]
assems.append(assem)
assem = ['MOVL', regs[i + 2], reg_params[i] & 0xFFFFFFFF]
assems.append(assem)
else:
for i in range(reg_param_num):
assem = ['MOVH', regs[i + 1], reg_params[i] >> 32]
assems.append(assem)
assem = ['MOVL', regs[i + 1], reg_params[i] & 0xFFFFFFFF]
assems.append(assem)
for i in range(reg_param_num):
assem = ['MOVH', regs[i + 1], reg_params[i] >> 32]
assems.append(assem)
assem = ['MOVL', regs[i + 1], reg_params[i] & 0xFFFFFFFF]
assems.append(assem)
assems.append([instr_name] + regs)
return assems
@ -283,10 +224,8 @@ def _assem_to_machine_code(assem, batchi):
def _pseudo_to_machine_code(pseudo):
print(pseudo)
regs_params = _generate_regs_params(**pseudo)
type = (pseudo['eltwise_type'] & 0x2 >> 1) if pseudo['instr_name'] in ['ELTWISE'] else 0
regs = _allocate_physical_regs(pseudo['runid'], pseudo['instr_name'], pseudo['parents'], pseudo['children'], type)
regs = _allocate_physical_regs(pseudo['runid'], pseudo['instr_name'], pseudo['parents'], pseudo['children'])
machine_codes = []
for batchi in range(MAX_HW_BATCH):
batchi_reg_params = regs_params[batchi]

@ -7,7 +7,6 @@ from inna.compiler import converter
from inna.compiler import scheduler
from inna.compiler import assembler
import tvm.relay
class INNACompiler:
"""Compiler Class"""
@ -42,9 +41,9 @@ class INNACompiler:
instruction stream represented by np.NDArray
"""
assert layout in ['NCHW', 'NHWC']
tvm_graph, params = frontend.to_tvm(graph, shape_dict, layout, self._mode)
hwgraph = converter.convert(tvm_graph, shape_dict, layout)
nnvm_graph, params = frontend.to_nnvm(graph, shape_dict, layout, self._mode)
jsong = nnvm_graph.json()
hwgraph = converter.convert(nnvm_graph, shape_dict, layout)
scheduler.schedule(hwgraph)
instr_streams = assembler.assemble(hwgraph)
return instr_streams
@ -99,3 +98,4 @@ if __name__ == '__main__':
outfile_name = 'out/resnet_v1_50_tensorflow.bin'
with open(outfile_name, 'wb') as f:
f.write(instr_streams.tobytes())

@ -4,55 +4,19 @@ from __future__ import print_function
def _parse_inputs(graph):
shape_list = graph['attrs']['shape'][1]
for i, node in enumerate(graph['nodes']):
#print(node)
for node in graph.index.nodes:
inputs = node['inputs']
node['shape'] = []
if len(inputs) > 0:
node['inputs'] = [inp[0] for inp in node['inputs']]
node['shape'] = [shape_list[j] for j in node['inputs']]
node['shape'].append(shape_list[i])
def _remove_null_op_inputs(graph, shape_dict):
for node in graph['nodes']:
index = graph.index
for node in index.nodes:
if node['name'] in shape_dict:
node['op'] = 'input'
if node['op'] != 'null':
node['inputs'] = [inp for inp in node['inputs'] if graph['nodes'][inp]['op'] != 'null']
def _parse_op(graph):
for node in graph['nodes']:
if node['name'] in ['data', 'input']:
node['op'] = node['name']
continue
if node['op'] in ['null']:
continue
#print(node)
name_str = node['attrs']['func_name']
name_list = name_str.split('_')
node['op'] = ''
for op_name in name_list:
if (op_name not in ['fused','nn']) and (op_name.isdigit()==False):
node['op'] = node['op'] + op_name + '_'
node['op'] = node['op'].strip('_')
if node['op'] in ['add']:
if len(node['inputs']) > 1:
i = 0
for j in node['inputs']:
if graph['nodes'][j]['op'] == 'null':
break
i += 1
if i == len(node['inputs']):
node['op'] = 'eltwise_add'
if node['op'] in ['transpose']:
node['op'] = 'null'
#if node['op'] in ['pad1','pad2','pad3','pad4','pad5']:
#node['op'] = 'pad'
#if node['op'] in ['relu1','relu2']:
#node['op'] = 'relu'
node['inputs'] = [inp for inp in node['inputs'] if index.nodes[inp]['op'] != 'null']
def _get_pad_width(pad_width, layout):
@ -63,38 +27,35 @@ def _fuse_ops(graph, shape_dict, layout):
hardware_graph = []
new_index = 0
index_map = {}
index = graph.index
# deal with batch_norm、pad
for node in graph['nodes']:
for node in index.nodes:
if node['op'] == 'conv2d':
node['attrs']['batch_norm'] = False
node['attrs']['use_bias'] = False
if node['op'] == 'batch_norm':
graph['nodes'][node['inputs'][0]]['attrs']['batch_norm'] = True
if node['op'] == 'add':
graph['nodes'][node['inputs'][0]]['attrs']['use_bias'] = True
input_nodes = [graph['nodes'][inp] for inp in node['inputs']]
index.nodes[node['inputs'][0]]['attrs']['batch_norm'] = True
input_nodes = [index.nodes[inp] for inp in node['inputs']]
for i, inp_node in enumerate(input_nodes):
#print(inp_node)
if inp_node['op'] in ['batch_norm', 'pad', 'add']:
if inp_node['op'] in ['batch_norm', 'pad']:
node['inputs'][i] = inp_node['inputs'][0]
if inp_node['op'] == '__mul_scalar__':
if len(inp_node['inputs']) == 0:
del node['inputs'][i]
else:
node['inputs'][i] = inp_node['inputs'][0]
#if inp_node['op'] == 'pad':
#padding = _get_pad_width(inp_node['attrs']['pad_width'], layout)
#node['attrs']['padding'] = padding
if inp_node['op'] == 'pad':
padding = _get_pad_width(inp_node['attrs']['pad_width'], layout)
node['attrs']['padding'] = padding
# remove null、batch_norm、pad op
for i, node in enumerate(graph['nodes']):
if node['op'] not in ['null', 'batch_norm', 'pad', '__mul_scalar__', 'add', 'transpose']:
for i, node in enumerate(index.nodes):
if node['op'] not in ['null', 'batch_norm', 'pad', '__mul_scalar__']:
hardware_graph.append(node)
index_map[i] = new_index
new_index += 1
# reset node inputs
for node in hardware_graph:
inputs = [index_map[inp] for inp in node['inputs']] if node['op'] not in ['input','data'] else []
inputs = [index_map[inp] for inp in node['inputs']] if node['op'] != 'input' else []
node['inputs'] = inputs
return hardware_graph
@ -104,8 +65,7 @@ def _layout_to_nchw(shape, layout):
return shape
return [shape[0], shape[3], shape[1], shape[2]] if layout == 'NHWC' else shape
def _parse_shape_to_nchw(hwgraph, shape_dict, layout):
'''
def _parse_shape_to_hchw(hwgraph, shape_dict, layout):
for node in hwgraph:
node['shape'] = []
node_name = node['name']
@ -120,12 +80,6 @@ def _parse_shape_to_nchw(hwgraph, shape_dict, layout):
node['shape'] = _layout_to_nchw(attrs['shape'], layout)
elif '__shape__' in node['attrs']:
node['shape'] = _layout_to_nchw(attrs['__shape__'], layout)
'''
for node in hwgraph:
if 'input_shape' in node:
node['input_shape'] = _layout_to_nchw(node['input_shape'], layout)
if 'output_shape' in node:
node['output_shape'] = _layout_to_nchw(node['output_shape'], layout)
def _clac_shape(in_shape, kernel, pad, stride, channels):
@ -135,75 +89,48 @@ def _clac_shape(in_shape, kernel, pad, stride, channels):
def _infer_shape(hwgraph):
for i, node in enumerate(hwgraph):
if node['op'] in ['input', 'data']:
continue
if len(node['inputs']) < 1:
if node['op'] == 'input':
continue
first_input = node['inputs'][0]
input_shape = hwgraph[first_input]['shape'][-1]
input_shape = hwgraph[first_input]['shape']
node['input_shape'] = input_shape
output_shape = node['shape'][-1]
node['output_shape'] = output_shape
if node['op'] == 'conv2d':
attrs = node['attrs']
assert len(input_shape) > 0
kernel_shape = node['shape'][1]
attrs['kernel_size'] = [kernel_shape[0],kernel_shape[1]]
attrs['channels'] = kernel_shape[3]
attrs['padding'] = [int(kernel_shape[0]/2), int(kernel_shape[1]/2)]
output_shape = node['shape'][-1]
attrs['strides'] = [int(input_shape[1]/output_shape[1]), int(input_shape[2]/output_shape[2])]
#node['shape'] = _clac_shape(input_shape, attrs['kernel_size'],
#attrs.get('padding', [0, 0]), attrs['strides'], attrs['channels'])
node['shape'] = _clac_shape(input_shape, attrs['kernel_size'],
attrs.get('padding', [0, 0]), attrs['strides'], attrs['channels'])
elif node['op'] in ['max_pool2d', 'avg_pool2d']:
attrs = node['attrs']
#padding = attrs.get('padding', [0, 0])
if node['op'] in ['max_pool2d']:
attrs['pool_size'] = [3, 3]
else:
attrs['pool_size'] = [7, 7]
attrs['padding'] = [0, 0, int(attrs['pool_size'][0]/2), int(attrs['pool_size'][1]/2)]
attrs['strides'] = [int(output_shape[1]/input_shape[1]), int(output_shape[2]/input_shape[2])]
#node['shape'] = _clac_shape(input_shape, attrs['pool_size'],
#attrs['padding'], attrs['strides'], input_shape[1])
padding = attrs.get('padding', [0, 0])
node['shape'] = _clac_shape(input_shape, attrs['pool_size'],
padding, attrs['strides'], input_shape[1])
elif node['op'] in ['global_avg_pool2d', 'mean']:
#node['output_shape'] = [input_shape[0], input_shape[1], 1, 1]
continue
node['shape'] = [input_shape[0], input_shape[1], 1, 1]
elif node['op'] == 'flatten':
#node['output_shape'] = input_shape[:2]
continue
node['shape'] = input_shape[:2]
elif node['op'] == 'dense':
# TODO
if len(input_shape) == 0:
#node['output_shape'] = [1, node['attrs']['units']]
node['shape'] = [1, node['attrs']['units']]
continue
#node['shape'] = [input_shape[0], node['attrs']['units']]
elif node['op'] in ['pad', 'batch_norm', 'relu', 'eletwise_add', 'broadcast_add', 'softmax']:
#node['shape'] = input_shape
continue
node['shape'] = [input_shape[0], node['attrs']['units']]
elif node['op'] in ['pad', 'batch_norm', 'relu', 'elemwise_add', 'broadcast_add', 'softmax']:
node['shape'] = input_shape
# remove input node
new_hwgraph = []
index_map = {}
new_index = 0
for i, node in enumerate(hwgraph):
if node['op'] not in ['input', 'data']:
if node['op'] != 'input':
new_hwgraph.append(node)
index_map[i] = new_index
new_index += 1
# reset node inputs
for i, node in enumerate(new_hwgraph):
if len(node['inputs']) < 1:
continue
if hwgraph[node['inputs'][0]]['op'] in ['input', 'data']:
if hwgraph[node['inputs'][0]]['op'] == 'input':
inputs = []
else:
inputs = [index_map[inp] for inp in node['inputs']]
@ -215,16 +142,12 @@ def convert(graph, shape_dict, layout):
"""convert nnvm graph to hardware supported graph"""
_parse_inputs(graph)
_parse_op(graph)
_remove_null_op_inputs(graph, shape_dict)
hwgraph = _fuse_ops(graph, shape_dict, layout)
hwgraph = _infer_shape(hwgraph)
_parse_shape_to_hchw(hwgraph, shape_dict, layout)
_parse_shape_to_nchw(hwgraph, shape_dict, layout)
#print(hwgraph)
hwgraph = _infer_shape(hwgraph)
return hwgraph
return hwgraph

@ -2,48 +2,54 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tvm
import tvm.relay
import nnvm
import nnvm.graph
from nnvm.compiler import graph_attr, graph_util
import re
import json
FRAME_SUPPORTED = ('tensorflow', 'keras', 'mxnet', 'onnx')
ATTR_NEED_CONVERT = ("num_outputs", "num_inputs", "flatten_data")
ATTR_NEED_CONVERT = ('channels', 'dilation', 'kernel_size',
'padding', 'strides', 'pool_size',
'axis', 'pad_value', 'pad_width',
'__shape__', 'shape', 'groups')
def _str_to_list(src):
matchs = re.findall(r'([\[|\(]-*?\d+.*?[\]|\)])', src)
ret = []
for match in matchs:
sub_matchs = re.findall(r'(-*?\d+)', match)
ret.append([i for i in map(int, sub_matchs)])
if len(ret) == 0:
ret = int(src)
elif len(ret) == 1:
ret = ret[0]
return ret
def _attrstr_to_number(graph):
nodes = graph['nodes']
for node in nodes:
index = graph.index
for node in index.nodes:
if 'attrs' not in node:
continue
for attr_name, attr_val in node['attrs'].items():
if attr_name in ATTR_NEED_CONVERT:
node['attrs'][attr_name] = int(attr_val)
node['attrs'][attr_name] = _str_to_list(attr_val)
def to_tvm(graph, shape_dict, layout, mode='tensorflow'):
def to_nnvm(graph, shape_dict, layout, mode='tensorflow'):
"""convert frontend graph to nnvm graph"""
assert mode in FRAME_SUPPORTED
if mode == 'tensorflow':
mod, params = tvm.relay.frontend.from_tensorflow(graph, layout=layout, shape=shape_dict)
sym, params = nnvm.frontend.from_tensorflow(graph, layout=layout, shape=shape_dict)
elif mode == 'keras':
mod, params = tvm.relay.frontend.from_keras(graph)
sym, params = nnvm.frontend.from_keras(graph)
elif mode == 'mxnet':
mod, params = tvm.relay.frontend.from_mxnet(graph)
sym, params = nnvm.frontend.from_mxnet(graph)
sym = nnvm.sym.softmax(sym)
else:
mod, params = tvm.relay.frontend.from_onnx(graph)
mod = tvm.relay.transform.InferType()(mod)
target = 'llvm'
target_host = 'llvm'
with tvm.relay.build_config(opt_level=0):
tvm_graph_json, lib, params = tvm.relay.build(mod, target=target, target_host=target_host, params=params)
sym, params = nnvm.frontend.from_onnx(graph)
nnvm_graph = nnvm.graph.create(sym)
#with open("./json/resnet_v1_50_tvm_0.json", 'w') as fp:
#fp.write(tvm_graph)
tvm_graph = json.loads(tvm_graph_json)
_attrstr_to_number(tvm_graph)
_attrstr_to_number(nnvm_graph)
return tvm_graph, params
return nnvm_graph, params

@ -40,10 +40,9 @@ def _shape_to_size(shape, maxbatch, hwbatch):
def _calc_inout_size(graph):
for node in graph:
print(node)
node['input_size'] = _shape_to_size(node['input_shape'],
MAX_HW_BATCH, HW_BATCH_SIZE)
node['output_size'] = _shape_to_size(node['output_shape'], MAX_HW_BATCH, HW_BATCH_SIZE)
node['output_size'] = _shape_to_size(node['shape'], MAX_HW_BATCH, HW_BATCH_SIZE)
def _alloc_param_memory(graph):
@ -53,9 +52,9 @@ def _alloc_param_memory(graph):
if node['op'] == 'conv2d':
node['filter_addr'] = filter_addr
attrs = node['attrs']
filter_addr += _aligned(attrs['channels'], 64) * _aligned(node['output_shape'][1], 64) \
filter_addr += _aligned(attrs['channels'], 64) * _aligned(node['shape'][1], 64) \
* attrs['kernel_size'][0] * attrs['kernel_size'][1]
if node['op'] in ['conv2d', 'relu', 'max_pool2d', 'eltwise_add',
if node['op'] in ['conv2d', 'relu', 'max_pool2d', 'elemwise_add',
'boardcast_add', 'mean', 'global_avg_pool2d']:
node['quant_addr'] = quant_addr
quant_addr += 256 * 2
@ -69,7 +68,7 @@ def _alloc_feature_memory(graph):
feature_addr[iotype] += node['input_size']
node['output_addr'] = feature_addr[1 - iotype]
feature_addr[1 - iotype] += node['output_size']
if node['op'] == 'eltwise_add':
if node['op'] == 'elemwise_add':
node['residual_addr'] = graph[node['parents'][1]]['output_addr']
def _alloc_memory(graph):
@ -91,8 +90,8 @@ def _convert_to_assem_graph(graph):
node['batch_size'] = HW_BATCH_SIZE
_, node['input_channel'], node['input_height'], node['input_width'] = node['input_shape'] \
if len(node['input_shape']) == 4 else node['input_shape'] + [1, 1]
_, node['output_channel'], node['output_height'], node['output_width'] = node['output_shape'] \
if len(node['output_shape']) == 4 else node['output_shape'] + [1, 1]
_, node['output_channel'], node['output_height'], node['output_width'] = node['shape'] \
if len(node['shape']) == 4 else node['shape'] + [1, 1]
if node['op'] == 'conv2d':
node['instr_name'] = 'CONV'
attrs = node['attrs']
@ -111,13 +110,8 @@ def _convert_to_assem_graph(graph):
attrs = node['attrs']
node['kernel_size'] = attrs['pool_size'][0]
node['stride'] = attrs['strides'][0]
elif node['op'] in ['eltwise_add', 'broadcast_add']:
elif node['op'] in ['elemwise_add', 'broadcast_add']:
node['instr_name'] = 'ELTWISE'
runid = min(node['parents'])
if graph[runid]['parents'] >= graph[runid+1]['parents']:
node['eltwise_type'] = 1
else:
node['eltwise_type'] = 2
elif node['op'] in ['global_avg_pool2d', 'mean', 'avg_pool2d']:
node['instr_name'] = 'POOL'
node['pool_type'] = 1

@ -4,9 +4,6 @@ max_hw_batch=1
instrs_addr=0x0
weights_addr=0x220000000
quant_addr=0x210000000
input_feature_addr=0x1000000
output_feature_addr=0xc000000
per_output_feature_size=2048
fullconv_weight_addr=0x168d000
fullconv_quant_addr=0x1ac40
class_num=1000
input_feature_addr=0x100000
output_feature_addr=0x100000000
per_output_feature_size=100

@ -4,13 +4,13 @@ from __future__ import print_function
import os
import cv2
import numpy as np
import configparser
import numpy as np
import math
from inna.runtime import _runtime
import argparse
import pdb
def check_equal(tname, test, real):
"""
@ -43,38 +43,6 @@ def _softmax(x):
return ex / np.sum(ex, axis=1)
def _fullconv(x, weights, weights_addr, quant, quant_addr, class_num):
#pdb.set_trace()
x = x.astype(np.int8)
np_weight = weights[weights_addr:]
np_weight = np_weight.astype(np.int8)
weight_size = np_weight.shape[0]
np_quant = quant[quant_addr:]
quant_table = np_quant[0]
np_quant_int32 = np.fromstring(np_quant[64:].tostring(), dtype=np.int32)
np_quant_int32_sort = np_quant_int32[:class_num]
np_weight = np_weight.reshape([class_num, int(weight_size/class_num)])
x.tofile("x.bin")
np_weight.tofile("weight.bin")
np_quant.tofile("bias.bin")
np_weight = np_weight.astype(np.int32)
x = x.astype(np.int32)
y = np.zeros((x.shape[0], class_num), dtype=np.float)
for k in range(x.shape[0]):
temp = np.dot(np_weight, x[k])
temp = temp + np_quant_int32_sort
temp = temp / (2 ** quant_table)
y[k] = temp
#print(" fullconv y=")
#print(y)
return y
def _feature_reshape(data, layout='CHW'):
assert layout in ['CHW', 'HWC']
if layout == 'CHW':
@ -88,11 +56,7 @@ def _feature_reshape(data, layout='CHW'):
if end > c:
end = c
cnum = end - start
#outdata[oci, :, :, 0: cnum] = data[:, :, start: end]
for i in range(cnum):
outdata[oci, :, :, 63-i] = data[:, :, i]
outdata[oci, :, :, 0: cnum] = data[:, :, start: end]
return outdata
@ -122,13 +86,7 @@ def _load_images(img_dir, batch_size):
for filename in os.listdir(img_dir):
if filename.endswith('.JPEG'):
img = cv2.imread(img_dir + filename)
img = cv2.resize(img, (224, 224)) #, interpolation=cv2.INTER_CUBIC)
#print(img.shape)
img = img.astype(np.float)
img[..., 0] = (img[..., 0] - 103.939) / 2.0
img[..., 1] = (img[..., 1] - 116.779) / 2.0
img[..., 2] = (img[..., 2] - 123.68) / 2.0
img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)
img = _feature_reshape(img, 'HWC')
imgs.append(img)
cur_img += 1
@ -147,7 +105,6 @@ def _aligned64(arr):
arr = np.concatenate((arr, pad_data), axis=0)
return arr
def _get_addrs():
instrs_addr = int(config.get('config', 'instrs_addr'), 16)
assert instrs_addr % 64 == 0
@ -157,7 +114,6 @@ def _get_addrs():
assert quant_addr % 64 == 0
return instrs_addr, weights_addr, quant_addr
def _create_model_datainfo(instrs, weights, quant_table):
instrs_addr, weights_addr, quant_addr = _get_addrs()
instrs = _aligned64(instrs)
@ -165,7 +121,6 @@ def _create_model_datainfo(instrs, weights, quant_table):
quant_table = _aligned64(quant_table)
return [instrs_addr, instrs, weights_addr, weights, quant_addr, quant_table]
class INNARuntime(_runtime.INNARuntime):
def __init__(self, instrs, weights, quant_table):
"""
@ -182,11 +137,6 @@ class INNARuntime(_runtime.INNARuntime):
self._ifeature_addr = int(config.get('config', 'input_feature_addr'), 16)
self._ofeature_addr = int(config.get('config', 'output_feature_addr'), 16)
self._ofeature_size = config.getint('config', 'per_output_feature_size')
self._fullconv_weight_addr = int(config.get('config', 'fullconv_weight_addr'), 16)
self._fullconv_quant_addr = int(config.get('config', 'fullconv_quant_addr'), 16)
self._class_num = config.getint('config', 'class_num')
self._weights = weights
self._quant = quant_table
self._model = _create_model_datainfo(instrs, weights, quant_table)
self.ReloadNNModel(*self._model)
@ -205,7 +155,7 @@ class INNARuntime(_runtime.INNARuntime):
result of neural network model
"""
batchs = _load_images(img_dir, batch_size)
outputs = np.zeros((1, self._class_num), dtype=np.uint8)
outputs = np.zeros((1, self._ofeature_size), dtype=np.uin8)
for batchi, batch_imgs in batchs:
print(batchi, batch_imgs.shape)
hwalign = batch_imgs.shape[0] % HW_BATCH_SIZE
@ -216,35 +166,15 @@ class INNARuntime(_runtime.INNARuntime):
max_img_range = min((i + 1) * MAX_BATCH_SIZE, batch_imgs.shape[0])
hw_batch_imgs = batch_imgs[i * MAX_BATCH_SIZE: max_img_range]
print(hw_batch_imgs.shape, int(hw_batch_imgs.shape[0] / HW_BATCH_SIZE))
hw_batch_imgs.tofile("input_data")
self.SetExtendHWBatch(int(hw_batch_imgs.shape[0] / HW_BATCH_SIZE))
self.SetInputFeatures(self._ifeature_addr, hw_batch_imgs.reshape(-1))
self.Run()
self.Wait()
#####
'''
ofeatures_addr = 0x1000000
ofeatures_size = 0x1000000
for j in range(119):
ofeatures_addr = ofeatures_addr + 0x1000000
if ofeatures_addr == 0xc000000:
ofeatures_addr = ofeatures_addr + 0x1000000
ofeatures = self.GetOutputFeatures(ofeatures_addr,hw_batch_imgs.shape[0] * ofeatures_size)
filename = os.path.dirname(os.path.abspath(__file__)) + '/mid_output/runid_' + str(j)
with open(filename, 'wb') as fp:
fp.write(ofeatures.tostring())
'''
#####
ofeatures = self.GetOutputFeatures(self._ofeature_addr,
hw_batch_imgs.shape[0] * self._ofeature_size)
ofeatures = self.GetOutputFeatures(self._ofeature_addr,
hw_batch_img.shape[0] * self._ofeature_size)
ofeatures = ofeatures.reshape(hw_batch_imgs.shape[0], -1)
#print(ofeatures)
#pdb.set_trace()
ofeatures = _fullconv(ofeatures, self._weights, self._fullconv_weight_addr, self._quant, self._fullconv_quant_addr, self._class_num)
ofeatures = _softmax(ofeatures)
outputs = np.row_stack((outputs, ofeatures))
ofeatures = _softmax(ofeature)
outputs = np.row_stack(outputs, ofeatures)
return outputs[1:]
@ -271,51 +201,7 @@ def create(instrs, weights, quant_table):
if __name__ == '__main__':
#from inna import runtime
#aa = np.array([124, 125], dtype=np.uint8)
#runtime = runtime.create(aa, aa, aa)
#runtime.run('images', 10)
parser = argparse.ArgumentParser()
parser.add_argument("path", default='~/resnet50_param/', help="input file path")
args = parser.parse_args()
basepath = args.path
instrs = np.fromfile(basepath+'instruction_conv.bin', dtype=np.uint8)
filters = np.fromfile(basepath + 'filter.bin', dtype=np.uint8)
quant_table = np.fromfile(basepath + 'quant.bin', dtype=np.uint8)
with open(basepath+'val.txt', 'r') as fp:
vtable = fp.read().split('\n')
vtable = [int(data.split(' ')[1]) for data in vtable if data!='']
#runtime = runtime.create(instrs, filters, quant_table)
runtime = create(instrs, filters, quant_table)
output = runtime.run('images', 1)
top1 = 0
top5 = 0
for i in range(int(output.shape[0])):
list_a = output[i].tolist()
max_v = max(list_a)
max_v_index = list_a.index(max(list_a))
print(i, end=" ")
if vtable[i] == max_v_index:
top1 += 1
top5 += 1
print("top1", max_v_index, max_v)
continue
else:
for j in range(4):
list_a[max_v_index] = 0
max_v = max(list_a)
max_v_index = list_a.index(max(list_a))
if vtable[i] == max_v_index:
top5 += 1
print("top5", max_v_index, max_v)
break
print("")
print("top1=",top1)
print("top5=",top5)
from inna import runtime
aa = np.array([124, 125], dtype=np.uint8)
runtime = runtime.create(aa, aa, aa)
runtime.run('images', 10)

@ -3,7 +3,7 @@ numpy
decorator
attrs
opencv-python
tensorflow>=1.14.0
tensorflow>=1.8.0
keras
mxnet
torch

@ -1,6 +1,13 @@
[
{"runid": 0, "instr_name": "CONV", "parents": [], "children": [1], "siblings": [], "batch_size": 1, "conv_type": 0, "kernel_size": 7, "h_pad": 3, "v_pad": 3, "stride": 2, "input_width": 224, "input_height": 224, "input_channel": 3, "output_channel": 64, "input_addr": "0x400000", "filter_addr": "0x500000", "quant_addr": "0x600000", "output_addr": "0x700000", "input_size": 3211264, "output_size": 802816, "width_shift": 0, "width_size": 0, "height_shift": 0, "height_size": 0, "channel_shift": 0 },
{"runid": 1, "instr_name": "ACTIVE", "parents": [0], "children": [2], "siblings": [], "batch_size": 1, "active_type": 1, "input_width": 112, "input_height": 112, "input_channel": 64, "output_width": 112, "output_height": 112, "output_channel": 64, "input_addr": "0x700000", "quant_addr": "0x600000", "output_addr": "0x400000", "input_size": 802816, "output_size": 802816},
{"runid": 2, "instr_name": "POOL", "parents": [1], "children": [3], "siblings": [], "batch_size": 1, "pool_type": 3, "kernel_size": 3, "stride": 2, "input_width": 112, "input_height": 112, "input_channel": 64, "input_addr": "0x400000", "quant_addr": "0x600000", "output_addr": "0x700000", "input_size": 802816, "output_size": 200704},
{"runid": 3, "instr_name": "ELTWISE", "parents": [1, 2], "children": [], "siblings": [], "batch_size": 1, "input_width": 56, "input_height": 56, "input_channel": 256, "input_addr": "0x400000", "residual_addr": "0x800000", "quant_addr": "0x600100", "output_addr": "0x700000", "input_size": 802816, "output_size": 802816},
{"op": "null", "runid": 0, "instr_name": "CONV", "parents": [], "children": [1], "siblings": [], "batch_size": 4, "conv_type": 6, "kernel_size": 7, "h_pad": 3, "v_pad": 3, "stride": 2, "input_width": 224, "input_height": 224, "input_channel": 3, "output_channel": 64, "input_addr": "0x1", "filter_addr": "0x2", "quant_addr": "0x3", "output_addr": "0x4", "input_size": 8, "output_size": 8, "width_shift": 0, "width_size": 0, "height_shift": 0, "height_size": 0, "channel_shift": 0 },
{"op": "null", "runid": 1, "instr_name": "ACTIVE", "parents": [0], "children": [2], "siblings": [], "batch_size": 3, "active_type": 1, "input_width": 112, "input_height": 112, "input_channel": 64, "output_width": 0, "output_height": 0, "output_channel": 0, "input_addr": "0x6", "quant_addr": "0x8", "output_addr": "0x7", "input_size": 8, "output_size": 8},
{"op": "null", "runid": 2, "instr_name": "POOL", "parents": [1], "children": [3, 4], "siblings": [], "batch_size": 2, "pool_type": 3, "kernel_size": 5, "stride": 1, "input_width": 57, "input_height": 58, "input_channel": 59, "input_addr": "0x81", "quant_addr": "0x8", "output_addr": "0x91", "input_size": 8, "output_size": 8},
{"op": "null", "runid": 3, "instr_name": "CONV", "parents": [2], "children": [9], "siblings": [4], "batch_size": 4, "conv_type": 1, "kernel_size": 1, "h_pad": 0, "v_pad": 0, "stride": 1, "input_width": 56, "input_height": 56, "input_channel": 64, "output_channel": 256, "input_addr": "0x10", "filter_addr": "0x11", "quant_addr": "0x12", "output_addr": "0x13", "input_size": 8, "output_size": 8, "width_shift": 0, "width_size": 0, "height_shift": 0, "height_size": 0, "channel_shift": 0 },
{"op": "null", "runid": 4, "instr_name": "CONV", "parents": [2], "children": [5], "siblings": [3], "batch_size": 3, "conv_type": 6, "kernel_size": 1, "h_pad": 0, "v_pad": 0, "stride": 1, "input_width": 56, "input_height": 56, "input_channel": 64, "output_channel": 256, "input_addr": "0x14", "filter_addr": "0x15", "quant_addr": "0x16", "output_addr": "0x17", "input_size": 8, "output_size": 8, "width_shift": 0, "width_size": 0, "height_shift": 0, "height_size": 0, "channel_shift": 0 },
{"op": "null", "runid": 5, "instr_name": "ACTIVE", "parents": [4], "children": [6], "siblings": [], "batch_size": 4, "active_type": 2, "input_width": 53, "input_height": 54, "input_channel": 55, "output_width": 1, "output_height": 2, "output_channel": 3, "input_addr": "0x54", "quant_addr": "0x8", "output_addr": "0x55", "input_size": 8, "output_size": 8},
{"op": "null", "runid": 6, "instr_name": "CONV", "parents": [5], "children": [7], "siblings": [], "batch_size": 4, "conv_type": 6, "kernel_size": 3, "h_pad": 0, "v_pad": 0, "stride": 1, "input_width": 56, "input_height": 56, "input_channel": 64, "output_channel": 256, "input_addr": "0x18", "filter_addr": "0x19", "quant_addr": "0x20", "output_addr": "0x21", "input_size": 8, "output_size": 8, "width_shift": 0, "width_size": 0, "height_shift": 0, "height_size": 0, "channel_shift": 0 },
{"op": "null", "runid": 7, "instr_name": "ACTIVE", "parents": [6], "children": [8], "siblings": [], "batch_size": 4, "active_type": 1, "input_width": 112, "input_height": 112, "input_channel": 64, "output_width": 0, "output_height": 0, "output_channel": 0, "input_addr": "0x22", "quant_addr": "0x8", "output_addr": "0x23", "input_size": 25690112, "output_size": 25690112 },
{"op": "null", "runid": 8, "instr_name": "CONV", "parents": [7], "children": [9], "siblings": [], "batch_size": 4, "conv_type": 6, "kernel_size": 1, "h_pad": 0, "v_pad": 0, "stride": 1, "input_width": 56, "input_height": 56, "input_channel": 64, "output_channel": 256, "input_addr": "0x24", "filter_addr": "0x25", "quant_addr": "0x26", "output_addr": "0x27", "input_size": 8, "output_size": 8, "width_shift": 0, "width_size": 0, "height_shift": 0, "height_size": 0, "channel_shift": 0 },
{"op": "null", "runid": 9, "instr_name": "ELTWISE", "parents": [3, 8], "children": [10], "siblings": [], "batch_size": 3, "input_width": 200, "input_height": 201, "input_channel": 202, "input_addr": "0x98", "residual_addr": "0x99", "quant_addr": "0x8", "output_addr": "0x100", "input_size": 8, "output_size": 8},
{"op": "null", "runid": 10, "instr_name": "ACTIVE", "parents": [9], "children": [], "siblings": [], "batch_size": 4, "active_type": 1, "input_width": 112, "input_height": 112, "input_channel": 64, "output_width": 0, "output_height": 0, "output_channel": 0, "input_addr": "0x31", "quant_addr": "0x8", "output_addr": "0x32", "input_size": 25690112, "output_size": 25690112 }
]

@ -15,7 +15,7 @@ cd ..
cd python; python setup.py install; cd ..
cd topi/python; python setup.py install; cd ../..
#cd nnvm/python; python setup.py install; cd ../..
cd nnvm/python; python setup.py install; cd ../..
cd ..

Loading…
Cancel
Save