# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections

import numpy as np

from paddle.fluid.framework import default_main_program, default_startup_program, program_guard
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid import unique_name
from paddle.fluid import core
from paddle.fluid.initializer import Constant
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.layers.nn import autoincreased_step_counter
from paddle.fluid.framework import Variable
from paddle.fluid.executor import global_scope
from paddle.fluid.transpiler.inference_transpiler import InferenceTranspiler

__all__ = ['QuantizeTranspiler']

_QUANTIZABLE_OP_TYPES = ['conv2d', 'depthwise_conv2d', 'mul']

def _quantized_var_name(var_name):
    """
    Return the quantized variable name for the input `var_name`.
    """
    return "%s.quantized" % (var_name)


def _dequantized_var_name(var_name):
    """
    Return the dequantized variable name for the input `var_name`.
    """
    return "%s.dequantized" % (var_name)


def _quantized_scale_name(var_name):
    """
    Return the quantization scale name for the input `var_name`.
    """
    return "%s.scale" % (var_name)

def _original_var_name(var_name):
    """
    Return the original variable name.
    """
    if var_name.endswith('.quantized.dequantized'):
        return var_name[:-len('.quantized.dequantized')]
    if var_name.endswith('.quantized'):
        return var_name[:-len('.quantized')]
    if var_name.endswith('.dequantized'):
        return var_name[:-len('.dequantized')]
    if var_name.endswith('.scale'):
        return var_name[:-len('.scale')]
    return var_name


def _is_float(v):
    return isinstance(v, (float, np.float32))


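# Illustrative name round-trip (added example, not from the original file):
#   _quantized_var_name('conv2d_0.w_0')                      -> 'conv2d_0.w_0.quantized'
#   _original_var_name('conv2d_0.w_0.quantized.dequantized') -> 'conv2d_0.w_0'

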
def quant(x, scale, num_bits):
    # Map float values onto the symmetric integer grid
    # [-(2 ** (num_bits - 1) - 1), 2 ** (num_bits - 1) - 1].
    y = np.round(x / scale * ((1 << (num_bits - 1)) - 1))
    return y


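# Worked example (illustrative, not from the original file): with num_bits=8
# the integer grid is [-127, 127], so quant(np.float32(0.5), 1.0, 8)
# returns np.round(0.5 * 127) = 64.0.

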
class QuantizeTranspiler(object):
    def __init__(self,
                 weight_bits=8,
                 activation_bits=8,
                 activation_quantize_type='abs_max',
                 weight_quantize_type='abs_max',
                 window_size=10000):
        """
        Convert and rewrite the fluid Program according to the weight and
        activation quantization type.

        Args:
            weight_bits (int): quantization bit number for weights; the bias
                is not quantized.
            activation_bits (int): quantization bit number for activations.
            activation_quantize_type (str): quantization type for activations,
                now supporting 'abs_max' and 'range_abs_max'. If 'abs_max' is
                used, the quantization scale is calculated dynamically at each
                step in both training and testing. If 'range_abs_max' is used,
                a static quantization scale is calculated during training and
                used in inference.
            weight_quantize_type (str): quantization type for weights,
                supporting 'abs_max'. 'range_abs_max' is usually not used for
                weights, since weights are fixed once the model is well
                trained.
            window_size (int): the window size for 'range_abs_max'
                quantization.

        Examples:

        .. code-block:: python

            # The original program will be rewritten in place; if you don't
            # want to change it, clone it first.
            # quantize_program = program.clone()
            t = fluid.QuantizeTranspiler()
            t.transpile(quantize_program)

        """
        self.weight_bits = weight_bits
        self.activation_bits = activation_bits
        quant_type = ['abs_max', 'range_abs_max']
        if weight_quantize_type not in quant_type:
            raise ValueError(
                "Unknown weight_quantize_type: '%s'. It can only be "
                "'abs_max' or 'range_abs_max'." % str(weight_quantize_type))
        if activation_quantize_type not in quant_type:
            raise ValueError(
                "Unknown activation_quantize_type: '%s'. It can only be "
                "'abs_max' or 'range_abs_max'." %
                str(activation_quantize_type))

        self.weight_quantize_type = weight_quantize_type
        self.activation_quantize_type = activation_quantize_type

        self.window_size = window_size
        self.helper = LayerHelper(self.__class__.__name__)
        self.fake_quant_op_types = [
            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
        ]
        self.fake_dequant_op_types = ['fake_dequantize_max_abs']
        self.is_test = None
        self.global_step = None

    def training_transpile(self, program=None, startup_program=None):
        """Rewrites a training input program in place for simulated
        quantization. Inserts fake quantization and de-quantization ops into
        the program to simulate the error introduced by quantization, and
        rewrites the gradient ops' inputs to use the fake-quantized weights
        and activations. Since the program is transformed in place, the graph
        connections will change.

        Args:
            program (Program): the input program to be transpiled.
            startup_program (Program): the startup program for initializers;
                defaults to the global startup program.
        """
        self.is_test = False
        program = default_main_program() if program is None else program
        startup_program = default_startup_program() if startup_program is \
            None else startup_program

        # maps each original variable name to its quantized-and-dequantized
        # version, per block
        dequanted_vars = [
            collections.OrderedDict() for _ in range(len(program.blocks))
        ]
        grad_op_types = ['%s_grad' % (type) for type in _QUANTIZABLE_OP_TYPES]

        params = [p.name for p in program.global_block().iter_parameters()]

        def _transpile_forward(block, op):
            idx = block.ops.index(op)
            block_id = block.idx
            # insert quant op and dequant op
            for name in op.input_arg_names:
                if name in dequanted_vars[block_id]:
                    dequant_var = dequanted_vars[block_id][name]
                else:
                    var = block.var(name)
                    quant_bits = self.weight_bits if var.name in params \
                        else self.activation_bits
                    quant_type = self.weight_quantize_type if var.name \
                        in params else self.activation_quantize_type

                    quant_var, scale_var = self._insert_quant_op(
                        block, idx, var, quant_bits, quant_type)
                    dequant_var = self._insert_dequant_op(
                        block, idx + 1, quant_var, scale_var, quant_bits)
                    dequanted_vars[block_id][name] = dequant_var
                # rename the forward op inputs
                op._rename_input(name, dequant_var.name)

        def _transpile_backward(block, op):
            block_id = block.idx
            no_dequanted_input_vars = True
            for name in op.input_arg_names:
                if name in dequanted_vars[block_id]:
                    dequant_var = dequanted_vars[block_id][name]
                    op._rename_input(name, dequant_var.name)
                    no_dequanted_input_vars = False
            if no_dequanted_input_vars:
                raise ValueError("There are no dequantized inputs for op %s."
                                 % (op.type))

        with program_guard(program, startup_program):
            self._create_global_step()
            for block in program.blocks:
                ops = list(block.ops)
                block_id = block.idx
                for op in ops:
                    # rewrite the forward ProgramDesc
                    if op.type in _QUANTIZABLE_OP_TYPES:
                        _transpile_forward(block, op)
                    # rename the backward op inputs
                    if op.type in grad_op_types:
                        _transpile_backward(block, op)

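    # Usage sketch (illustrative, not from the original source; assumes a
    # fluid training program was built beforehand and `t` is a
    # QuantizeTranspiler instance):
    #
    #   t.training_transpile()  # rewrites default_main_program() in place
    #   # train as usual; the inserted fake quant/dequant ops simulate the
    #   # rounding error that int8 inference will introduce
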
    def _create_global_step(self):
        if self.weight_quantize_type == 'range_abs_max' or \
                self.activation_quantize_type == 'range_abs_max':
            self.global_step = autoincreased_step_counter()

    def freeze_program(self, program, place, fuse_bn=False, scope=None):
        """Freezes the input training program for inference.

        Args:
            program (Program): the input program to be transpiled.
            place (Place): the device place where parameter tensors are
                loaded and stored.
            fuse_bn (bool): whether to fuse batch_norm ops into the preceding
                conv2d/mul ops before freezing.
            scope (Scope): the scope holding the parameters; defaults to the
                global scope.
        """

        self.is_test = True
        scope = global_scope() if scope is None else scope
        program = default_main_program() if program is None else program

        if fuse_bn:
            bn_fuse_transpiler = BNFuseTranspiler()
            bn_fuse_transpiler.transpile(program, place)

        persistable_vars = [
            v.name
            for v in filter(lambda var: var.persistable, program.list_vars())
        ]
        op_in_rename_map = [
            collections.OrderedDict() for _ in range(len(program.blocks))
        ]
        op_out_rename_map = [
            collections.OrderedDict() for _ in range(len(program.blocks))
        ]
        var_scale_map = [
            collections.OrderedDict() for _ in range(len(program.blocks))
        ]

        def _remove_fake_quant_and_dequant_op(block, op):
            idx = block.ops.index(op)
            block_id = block.idx
            k = op.output('Out')[0]
            v = op.input('X')[0]
            if v not in op_in_rename_map[block_id]:
                op_in_rename_map[block_id][k] = v
            else:
                op_in_rename_map[block_id][k] = op_in_rename_map[block_id][v]
            block._remove_op(idx)

        def _insert_post_dequant_op(block, op):
            idx = block.ops.index(op)
            block_id = block.idx
            max_range = None
            scale_var = None
            for name in op.input_arg_names:
                if name in op_in_rename_map[block_id]:
                    op._rename_input(name, op_in_rename_map[block_id][name])

                scale_v = var_scale_map[block_id][_original_var_name(name)]
                if _original_var_name(name) in persistable_vars:
                    # for weights, the scale is a known float; fold it into
                    # the de-quantization range
                    param_range = (1 << (self.weight_bits - 1)) - 1
                    act_range = (1 << (self.activation_bits - 1)) - 1
                    assert _is_float(scale_v)
                    max_range = param_range * act_range / scale_v
                else:
                    # for activations, the scale is a variable in the program
                    assert isinstance(scale_v, Variable)
                    scale_var = var_scale_map[block_id][_original_var_name(
                        name)]

            if len(op.output_arg_names) != 1:
                raise ValueError("Only one output is supported, but op %s has"
                                 " more than one output." % (op.type))
            out_var = block.var(op.output_arg_names[0])
            dequant_var = block.create_var(
                name=_dequantized_var_name(out_var.name),
                type=out_var.type,
                shape=out_var.shape,
                dtype=out_var.dtype)
            # insert fake_dequantize_op
            dequant_op = block._insert_op(
                idx + 1,
                type="fake_dequantize_max_abs",
                attrs={'max_range': float(max_range)},
                inputs={"X": out_var,
                        'Scale': scale_var},
                outputs={"Out": dequant_var})
            op_out_rename_map[block_id][out_var.name] = dequant_var.name
            return dequant_var

        def _load_var(name):
            return np.array(scope.find_var(name).get_tensor())

        def _restore_var(name, arr):
            t = scope.find_var(name).get_tensor()
            t.set(arr, place)

        for block in program.blocks:
            ops = list(block.ops)
            block_id = block.idx
            for op in ops:
                op_type = op.type

                # a dequant_op is inserted after fc/conv, so the inputs of
                # the following ops need to be renamed
                for name in op.input_arg_names:
                    if name in op_out_rename_map[block_id]:
                        op._rename_input(name,
                                         op_out_rename_map[block_id][name])

                if op_type in self.fake_quant_op_types:
                    in_arg_name = op.input('X')[0]
                    if in_arg_name in persistable_vars:
                        if self.weight_quantize_type == 'abs_max':
                            param = _load_var(in_arg_name)
                            scale_v = np.max(np.abs(param))
                        else:
                            scale_v = _load_var(op.output('OutScale')[0])
                        var_scale_map[block_id][in_arg_name] = scale_v
                    else:
                        scale_v = block.var(op.output('OutScale')[0])
                        var_scale_map[block_id][in_arg_name] = scale_v

                    if in_arg_name in persistable_vars:
                        _remove_fake_quant_and_dequant_op(block, op)
                        # quantize the weight and restore it in place
                        param_t = _load_var(in_arg_name)
                        param_q_t = quant(param_t, scale_v, self.weight_bits)
                        _restore_var(in_arg_name, param_q_t)

                if op_type in self.fake_dequant_op_types:
                    _remove_fake_quant_and_dequant_op(block, op)

                if op_type in _QUANTIZABLE_OP_TYPES:
                    dequant_var = _insert_post_dequant_op(block, op)

        # remove the unused vars in the ProgramDesc
        self._remove_unused_var(program)
        #program = program.clone()

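    # Worked example for max_range above (illustrative, not from the original
    # source): with weight_bits = activation_bits = 8, param_range and
    # act_range are both 127, so a weight scale of 0.5 yields
    # max_range = 127 * 127 / 0.5 = 32258.0, which the inserted
    # fake_dequantize_max_abs op divides out to recover float outputs.
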
    def convert_to_int8(self, program, place, scope=None):
        scope = global_scope() if scope is None else scope
        program = default_main_program() if program is None else program

        def _load_var(name):
            return np.array(scope.find_var(name).get_tensor())

        global_block = program.global_block()

        def convert_to_int8(var):
            int8_var_name = var.name + ".int8"
            int8_var = global_block.create_parameter(
                name=int8_var_name.encode('ascii'),
                type=var.type,
                dtype=core.VarDesc.VarType.INT8,
                shape=var.shape)

            tensor = _load_var(var.name)

            scope.var(int8_var_name)
            int8_tensor = scope.find_var(int8_var_name).get_tensor()
            int8_tensor.set(tensor.astype(np.int8), place)
            return int8_var

        input_map = {}
        for block in program.blocks:
            for op in list(block.ops):
                if op.type in _QUANTIZABLE_OP_TYPES:
                    for name in op.input_arg_names:
                        var = block.var(name)
                        if var.persistable:
                            if name not in input_map:
                                int8_var = convert_to_int8(var)
                                input_map[name] = int8_var.name
                            op._rename_input(name, input_map[name])
        self._remove_unused_var(program)

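    # End-to-end inference sketch (illustrative, not from the original
    # source; `t`, `test_program` and `place` are assumed to exist):
    #
    #   t.freeze_program(test_program, place)   # fold scales, strip fake ops
    #   t.convert_to_int8(test_program, place)  # store weights as int8 tensors
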
    def _remove_unused_var(self, program):
        all_remove_vars = []
        for block in program.blocks:
            args = []
            for op in block.ops:
                args += op.input_arg_names
                args += op.output_arg_names
            args = list(set(args))
            var_names = block.vars.keys()
            sub_block_remove_vars = []
            for var in var_names:
                if var not in args:
                    sub_block_remove_vars.append(var)
            all_remove_vars.append(sub_block_remove_vars)

        remove_vars = [list(set(v)) for v in all_remove_vars]
        for i, block in enumerate(program.blocks):
            for v in remove_vars[i]:
                block._remove_var(v)

    def _insert_quant_abs_max_op(self, block, idx, var, quant_bits):
        """Insert fake_quantize_abs_max op.
        """
        quant_var = block.create_var(
            name=_quantized_var_name(var.name),
            type=var.type,
            shape=var.shape,
            dtype=var.dtype)
        scale = block.create_var(
            name=_quantized_scale_name(var.name),
            type=var.type,
            shape=var.shape,
            dtype=var.dtype)
        quant_op = block._insert_op(
            idx,
            type='fake_quantize_abs_max',
            attrs={'bit_length': quant_bits},
            inputs={'X': var},
            outputs={'Out': quant_var,
                     'OutScale': scale})
        return quant_var, scale

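    # Note (added, not from the original source): 'abs_max' recomputes the
    # scale as max(|X|) at every step and keeps no state, while
    # 'range_abs_max' (below) tracks scales over a sliding window so that a
    # static scale is available at inference time.
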
    def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits):
        """Insert fake_quantize_range_abs_max
        """
        quant_var = block.create_var(
            name=_quantized_var_name(var.name),
            type=var.type,
            shape=var.shape,
            dtype=var.dtype)
        scale = self.helper.create_parameter(
            attr=ParamAttr(
                name=_quantized_scale_name(var.name),
                initializer=Constant(0.001),
                trainable=False),
            shape=[1],
            dtype=var.dtype)
        scale.stop_gradient = True

        ins = {'X': var, 'InScale': scale}
        outs = {'Out': quant_var, 'OutScale': scale}
        if not self.is_test:
            # a global variable recording the scales of the most recent
            # `window_size` steps, indexed by the global step counter
            scales = self.helper.create_global_variable(
                name=unique_name.generate('scales'),
                persistable=True,
                dtype=var.dtype,
                shape=[self.window_size])
            self.helper.set_variable_initializer(
                scales, initializer=Constant(value=0))

            ins['Iter'] = self.global_step
            outs['OutScales'] = scales

        attrs = {
            'window_size': self.window_size,
            'bit_length': quant_bits,
            'is_test': self.is_test
        }

        quant_op = block._insert_op(
            idx,
            type='fake_quantize_range_abs_max',
            attrs=attrs,
            inputs=ins,
            outputs=outs)

        return quant_var, scale

    def _insert_quant_op(self, block, idx, var, quant_bits, quant_type):
        """
        Insert fake_quantize_op
        """
        if quant_type == 'abs_max':
            return self._insert_quant_abs_max_op(block, idx, var, quant_bits)
        elif quant_type == 'range_abs_max':
            return self._insert_quant_range_abs_max_op(block, idx, var,
                                                       quant_bits)

    def _insert_dequant_op(self, block, idx, var, scale, quant_bits):
        """
        Insert fake_dequantize_op
        """
        dequant_var = block.create_var(
            name=_dequantized_var_name(var.name),
            type=var.type,
            shape=var.shape,
            dtype=var.dtype)
        # insert fake_dequantize_op
        max_range = (1 << (quant_bits - 1)) - 1
        dequant_op = block._insert_op(
            idx,
            type="fake_dequantize_max_abs",
            attrs={'max_range': float(max_range)},
            inputs={"X": var,
                    'Scale': scale},
            outputs={"Out": dequant_var})
        return dequant_var


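# A sketch of the batch-norm folding performed by BNFuseTranspiler below
# (illustrative summary, not from the original source). With BN parameters
# gamma/beta and saved statistics mu/var, and eps = 1e-5:
#
#   std   = sqrt(var + eps)
#   W_new = W * (gamma / std)                # scaled per output channel
#   b_new = (b - mu) * (gamma / std) + beta
#
# so conv/fc followed by batch_norm collapses into a single conv/fc.

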
class BNFuseTranspiler(InferenceTranspiler):
    def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
        def _update_param(op, param_name, new_param):
            var = self.block.vars[param_name]
            tensor = self.scope.find_var(param_name).get_tensor()
            tensor.set(np.array(new_param), self.place)

        def _load_param(param_name):
            return np.array(self.scope.find_var(param_name).get_tensor())

        bias_bn = _load_param(bn_op.input("Bias")[0])  # Bias
        scale_bn = _load_param(bn_op.input("Scale")[0])  # Scale
        mean_bn = _load_param(bn_op.input("Mean")[0])  # Mean
        var_bn = _load_param(bn_op.input("Variance")[0])  # Variance

        if current_op.type in ['conv2d', 'depthwise_conv2d']:
            current_param = _load_param(
                _original_var_name(current_op.input("Filter")[0]))
        elif current_op.type == 'mul':
            current_param = _load_param(
                _original_var_name(current_op.input("Y")[0]))

        std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
        tmp = np.float32(np.divide(scale_bn, std_bn))

        # add the bias of batch_norm_op to conv2d
        if with_bias:
            bias = _load_param(bias_op.input("Y")[0])
        else:
            bias = np.zeros(bias_bn.shape)
        bias = np.float32(
            np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))

        # re-compute the weight of conv2d/fc
        tmp = tmp.reshape(tmp.shape[0], -1)
        dst_param = current_param.reshape((tmp.shape[0], -1))
        dst_param = np.float32(np.multiply(dst_param, tmp))
        dst_param = dst_param.reshape(current_param.shape)

        # update parameters
        if current_op.type in ['conv2d', 'depthwise_conv2d']:
            _update_param(current_op,
                          _original_var_name(current_op.input("Filter")[0]),
                          dst_param)
        elif current_op.type == 'mul':
            _update_param(current_op,
                          _original_var_name(current_op.input("Y")[0]),
                          dst_param)

        _update_param(bias_op, bias_op.input("Y")[0], bias)

        # collect the renamed input
        self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]