add channel wise quantization in ir pass.

revert-16190-refine_parallel_executor
Zhen Wang 6 years ago
parent 81b4fad8b9
commit ec88b6cc5a

@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -54,10 +55,6 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
     auto scales = ctx.MultiInput<framework::Tensor>("Scales");
     auto* out = ctx.Output<framework::Tensor>("Out");
-    PADDLE_ENFORCE_EQ(scales[0]->numel(), in->dims()[0],
-                      "The number of first scale values must be the same with "
-                      "first dimension value of Input(X).");
 
     auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
     int max_range = std::pow(2, quant_bits[0] - 1) - 1;
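As a quick worked check of the max_range arithmetic above (our own note, not part of the diff): with the default 8-bit setting the kernel's max_range is the largest magnitude a signed 8-bit fake-quantized value can take.

# Worked example, assuming the default bit width of 8.
quant_bits = [8]
max_range = 2 ** (quant_bits[0] - 1) - 1
assert max_range == 127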
@@ -65,15 +62,38 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(dev_ctx.GetPlace());
 
     auto dequant = DequantizeFunctor<DeviceContext, T>();
-    for (int64_t i = 0; i < in->dims()[0]; i++) {
-      framework::Tensor one_channel_in = in->Slice(i, i + 1);
-      framework::Tensor one_channel_out = out->Slice(i, i + 1);
-      framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1);
-      dequant(dev_ctx, &one_channel_in, &one_channel_scale,
-              static_cast<T>(max_range), &one_channel_out);
-    }
-
-    if (scales.size() == 2) {
+    if (scales.size() == 1) {
+      PADDLE_ENFORCE_EQ(
+          scales[0]->numel(), in->dims()[0],
+          "The number of first scale values must be the same with "
+          "first dimension value of Input(X) when the `Scales` has only one "
+          "element.");
+      for (int64_t i = 0; i < in->dims()[0]; i++) {
+        framework::Tensor one_channel_in = in->Slice(i, i + 1);
+        framework::Tensor one_channel_out = out->Slice(i, i + 1);
+        framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1);
+        dequant(dev_ctx, &one_channel_in, &one_channel_scale,
+                static_cast<T>(max_range), &one_channel_out);
+      }
+    } else if (scales.size() == 2) {
+      PADDLE_ENFORCE_EQ(
+          scales[0]->numel(), in->dims()[1],
+          "The number of first scale values must be the same with "
+          "second dimension value of Input(X) when the `Scales` has two "
+          "elements.");
+      for (int64_t i = 0; i < in->dims()[0]; i++) {
+        framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize(
+            framework::slice_ddim(in->dims(), 1, in->dims().size()));
+        framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize(
+            framework::slice_ddim(out->dims(), 1, out->dims().size()));
+        for (int64_t j = 0; j < in->dims()[1]; j++) {
+          framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1);
+          framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1);
+          framework::Tensor one_channel_scale = scales[0]->Slice(j, j + 1);
+          dequant(dev_ctx, &one_channel_in, &one_channel_scale,
+                  static_cast<T>(max_range), &one_channel_out);
+        }
+      }
       PADDLE_ENFORCE_EQ(
           scales[1]->numel(), 1,
           "The second scale tensor should only have one value at now.");

@@ -169,10 +169,10 @@ class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
         ctx->HasOutput("Out"),
         "Output(Out) of FakeChannelWiseQuantizeOp should not be null.");
     PADDLE_ENFORCE(
-        ctx->HasOutput("OutScales"),
-        "Output(Scales) of FakeChannelWiseQuantizeOp should not be null.");
+        ctx->HasOutput("OutScale"),
+        "Output(Scale) of FakeChannelWiseQuantizeOp should not be null.");
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("OutScales", {ctx->GetInputDim("X")[0]});
+    ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[0]});
     ctx->ShareLoD("X", /*->*/ "Out");
   }
@@ -192,7 +192,7 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker
     AddOutput("Out",
               "(Tensor) Output of quantized low level tensor, "
               "but also saved as float data type.");
-    AddOutput("OutScales", "(Tensor) Current channel wise scale");
+    AddOutput("OutScale", "(Tensor) Current channel wise scale");
     AddAttr<int>("bit_length", "(int, default 8)")
         .SetDefault(8)
         .AddCustomChecker([](const int& bit_length) {

@@ -78,8 +78,8 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
     auto* in = context.Input<framework::Tensor>("X");
 
     auto* out = context.Output<framework::Tensor>("Out");
-    auto* out_scales = context.Output<framework::Tensor>("OutScales");
-    T* out_scales_data = out_scales->mutable_data<T>(context.GetPlace());
+    auto* out_scale = context.Output<framework::Tensor>("OutScale");
+    T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
     out->mutable_data<T>(context.GetPlace());
 
     int bit_length = context.Attr<int>("bit_length");
@@ -91,13 +91,13 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
       framework::Tensor one_channel = in->Slice(i, i + 1);
       const T* one_channel_data = one_channel.data<T>();
       find_abs_max(dev_ctx, one_channel_data, one_channel.numel(),
-                   &out_scales_data[i]);
+                   &out_scale_data[i]);
     }
     auto clip_quant = ClipAndFakeQuantFunctor<DeviceContext, T>();
     for (int64_t i = 0; i < in->dims()[0]; i++) {
       framework::Tensor one_channel_in = in->Slice(i, i + 1);
       framework::Tensor one_channel_out = out->Slice(i, i + 1);
-      framework::Tensor one_channel_scale = out_scales->Slice(i, i + 1);
+      framework::Tensor one_channel_scale = out_scale->Slice(i, i + 1);
       clip_quant(dev_ctx, one_channel_in, one_channel_scale, bin_cnt,
                  &one_channel_out);
     }
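A compact NumPy sketch of what the quantize kernel above computes per dim-0 channel. The function name and the explicit np.clip are ours (the kernel clips via ClipAndFakeQuantFunctor); bin_cnt is derived from bit_length as in the op.

import numpy as np

# Hedged sketch of per-channel abs-max fake quantization.
def fake_channel_wise_quantize_abs_max(x, bit_length=8):
    bin_cnt = 2 ** (bit_length - 1) - 1
    # One abs-max scale per dim-0 slice; this is what lands in OutScale.
    scales = np.abs(x).reshape(x.shape[0], -1).max(axis=1).astype("float32")
    y = np.round(x / scales.reshape([-1] + [1] * (x.ndim - 1)) * bin_cnt)
    y = np.clip(y, -bin_cnt, bin_cnt)
    return y.astype("float32"), scales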

@@ -243,7 +243,12 @@ class TestQuantizationFreezePass(unittest.TestCase):
         with fluid.scope_guard(scope):
             exe.run(startup)
         transform_pass = QuantizationTransformPass(
-            scope=scope, place=place, activation_quantize_type=quant_type)
+            scope=scope,
+            place=place,
+            activation_quantize_type=quant_type,
+            weight_quantize_type='channel_wise_abs_max')
+        #transform_pass = QuantizationTransformPass(
+        #    scope=scope, place=place, activation_quantize_type=quant_type)
         transform_pass.apply(main_graph)
         transform_pass.apply(test_graph)
         dev_name = '_gpu_' if use_cuda else '_cpu_'
@@ -296,7 +301,11 @@ class TestQuantizationFreezePass(unittest.TestCase):
                               fetch_list=[loss, w_var])
 
         # Freeze graph for inference, but the weight of fc/conv is still float type.
-        freeze_pass = QuantizationFreezePass(scope=scope, place=place)
+        freeze_pass = QuantizationFreezePass(
+            scope=scope,
+            place=place,
+            weight_quantize_type='channel_wise_abs_max')
+        #freeze_pass = QuantizationFreezePass(scope=scope, place=place)
         freeze_pass.apply(test_graph)
         if not for_ci:
             marked_nodes = set()
@@ -375,29 +384,32 @@ class TestQuantizationFreezePass(unittest.TestCase):
         if fluid.core.is_compiled_with_cuda():
             with fluid.unique_name.guard():
                 self.freeze_graph(
-                    True, seed=1, quant_type='abs_max', for_ci=True)
+                    True, seed=1, quant_type='abs_max', for_ci=False)
 
     def test_freeze_graph_cpu_dynamic(self):
         with fluid.unique_name.guard():
-            self.freeze_graph(False, seed=2, quant_type='abs_max', for_ci=True)
+            self.freeze_graph(False, seed=2, quant_type='abs_max', for_ci=False)
 
     def test_freeze_graph_cuda_static(self):
         if fluid.core.is_compiled_with_cuda():
            with fluid.unique_name.guard():
                 self.freeze_graph(
-                    True, seed=1, quant_type='range_abs_max', for_ci=True)
+                    True, seed=1, quant_type='range_abs_max', for_ci=False)
                 self.freeze_graph(
                     True,
                     seed=1,
                     quant_type='moving_average_abs_max',
-                    for_ci=True)
+                    for_ci=False)
 
     def test_freeze_graph_cpu_static(self):
         with fluid.unique_name.guard():
             self.freeze_graph(
-                False, seed=2, quant_type='range_abs_max', for_ci=True)
+                False, seed=2, quant_type='range_abs_max', for_ci=False)
             self.freeze_graph(
-                False, seed=2, quant_type='moving_average_abs_max', for_ci=True)
+                False,
+                seed=2,
+                quant_type='moving_average_abs_max',
+                for_ci=False)
 
 if __name__ == '__main__':
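For readers who want the end-to-end picture, here is a hedged usage sketch of how the test above wires channel-wise weight quantization into the IR passes. The imports assume the paddle.fluid 1.x layout this test file uses at the time of the commit; the wrapper function and its arguments are ours, and the training/executor steps are elided.

import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.framework import IrGraph
from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass

def quantize_and_freeze(main_program, test_program, scope, place):
    # Build IR graphs from the programs, as the test does.
    main_graph = IrGraph(core.Graph(main_program.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

    # Insert fake quant/dequant ops; weights are quantized per channel.
    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type='moving_average_abs_max',
        weight_quantize_type='channel_wise_abs_max')
    transform_pass.apply(main_graph)
    transform_pass.apply(test_graph)

    # ... train on main_graph here, then freeze the test graph for inference.
    freeze_pass = QuantizationFreezePass(
        scope=scope, place=place, weight_quantize_type='channel_wise_abs_max')
    freeze_pass.apply(test_graph)
    return main_graph, test_graph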

@@ -31,15 +31,27 @@ def dequantize_max_abs(x, scale, max_range):
     return y
 
 
-def channel_wise_quantize_max_abs(x, quant_bit=8):
+def channel_wise_quantize_max_abs(x, quant_bit=8, use_second_dim=False):
     scales = []
-    for i in range(x.shape[0]):
-        scales.append(np.max(np.abs(x[i])).astype("float32"))
-
-    y = x.copy()
-    max_range = math.pow(2, quant_bit - 1) - 1
-    for i, scale in enumerate(scales):
-        y[i] = np.round(y[i] / scale * max_range)
-
+    if not use_second_dim:
+        for i in range(x.shape[0]):
+            scales.append(np.max(np.abs(x[i])).astype("float32"))
+        y = x.copy()
+        max_range = math.pow(2, quant_bit - 1) - 1
+        for i, scale in enumerate(scales):
+            y[i] = np.round(x[i] / scale * max_range)
+    else:
+        for i in range(x.shape[0]):
+            s = []
+            for j in range(x.shape[1]):
+                s.append(np.max(np.abs(x[i][j])).astype("float32"))
+            scales.append(s)
+        scales = np.amax(np.array(scales), axis=0)
+        y = x.copy()
+        max_range = math.pow(2, quant_bit - 1) - 1
+        for i in range(x.shape[0]):
+            for j, scale in enumerate(scales):
+                y[i][j] = np.round(x[i][j] / scale * max_range)
     return y, scales
@@ -47,10 +59,16 @@ def channel_wise_dequantize_max_abs(x,
                                     scales,
                                     quant_bits,
                                     activation_scale=None):
-    y = x.copy()
-    for i in range(x.shape[0]):
-        y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * y[i]
-    if activation_scale is not None:
+    if activation_scale is None:
+        y = x.copy()
+        for i in range(x.shape[0]):
+            y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * x[i]
+    else:
+        y = x.copy()
+        for i in range(x.shape[0]):
+            for j in range(x.shape[1]):
+                y[i][j] = (scales[j] /
+                           (math.pow(2, quant_bits[0] - 1) - 1)) * x[i][j]
         y *= activation_scale / (math.pow(2, quant_bits[1] - 1) - 1)
     return y
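A quick self-check of the two reference helpers above, using their first-dim mode (quantize per dim-0 channel, then dequantize, and verify the error stays within one quantization step). The shapes, bit widths, and tolerance here are ours for illustration; it assumes the helpers are in scope alongside numpy and math.

import numpy as np

x = np.random.randn(8, 3, 5, 5).astype("float32")
yq, scales = channel_wise_quantize_max_abs(x, quant_bit=8)
xdq = channel_wise_dequantize_max_abs(yq, scales, quant_bits=[8])
# 8-bit per-channel round trip: error bounded by one quantization step.
assert np.allclose(x, xdq, atol=np.max(scales) / (2 ** 7 - 1))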
@@ -65,7 +83,8 @@ class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest):
         self.set_args()
         self.op_type = "fake_channel_wise_dequantize_max_abs"
         x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
-        yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0])
+        yq, scales = channel_wise_quantize_max_abs(
+            x, self.quant_bits[0], use_second_dim=True)
         ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits,
                                               self.activation_scale)

@@ -53,7 +53,7 @@ class TestFakeChannelWiseQuantizeOp(OpTest):
         self.outputs = {
             'Out': outputs,
-            'OutScales': np.array(scales).astype("float32"),
+            'OutScale': np.array(scales).astype("float32"),
         }
 
     def test_check_output(self):
