// Paddle/paddle/fluid/inference/api/mkldnn_quantizer.cc

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
#include <algorithm>
#include <cmath>
#include <map>
#include <numeric>
#include <unordered_map>
#include <utility>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
using platform::CPUPlace;
using framework::LoDTensor;
using framework::ir::Graph;
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
using string::PrettyLogH1;
// Computes quantization scales for the variables connected to every operator
// in block 0 of the inference program whose "use_quantizer" attribute is true.
//
// For each such operator the first variable of every non-empty output
// connection, and then of every non-empty input connection, is looked up in
// the predictor's sub-scope and passed to CalculateSingleScale(). A variable
// that already has an entry in scales_ is skipped on its own; the remaining
// connections of the operator are still processed.
//
// Always returns true. A variable missing from the scope, or one that is not
// a LoDTensor, is reported through PADDLE_ENFORCE.
bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
  PrettyLogH1("--- Calculating scales for quantization");

  using VariableNameMap = std::map<std::string, std::vector<std::string>>;

  for (const auto* op : predictor_.inference_program_->Block(0).AllOps()) {
    if (!op->HasAttr("use_quantizer") ||
        !boost::get<bool>(op->GetAttr("use_quantizer"))) {
      continue;
    }

    const VariableNameMap& connections_in = op->Inputs();
    const VariableNameMap& connections_out = op->Outputs();

    auto glambda = [&](const VariableNameMap& connections, bool is_output) {
      for (auto const& conn : connections) {
        if (conn.second.empty()) continue;
        const auto& var_name = conn.second[0];

        // Skip only this variable when its scale is already computed.
        // Leaving the lambda here would silently drop every remaining
        // connection of the operator.
        if (scales_.find(var_name) != scales_.end()) continue;

        auto* var = predictor_.sub_scope_->FindVar(var_name);
        PADDLE_ENFORCE(var, "%s is not in the scope", var_name);
        PADDLE_ENFORCE(var->IsType<LoDTensor>(),
                       "Only support lod tensor now.");
        LoDTensor* var_tensor = var->GetMutable<LoDTensor>();

        // force unsigned type if already know it
        bool is_unsigned = false;
        if (is_output && op->Type() == "conv2d") {
          // output of conv2d with relu must be unsigned
          is_unsigned = op->HasAttr("fuse_relu") &&
                        boost::get<bool>(op->GetAttr("fuse_relu"));
        } else if (is_output && op->Type() == "pool2d") {
          // output of pool2d with unsigned input must be unsigned
          const std::string input_var_name = op->Input("X")[0];
          const auto input_scale = scales_.find(input_var_name);
          if (input_scale != scales_.end()) {
            is_unsigned = input_scale->second.first;
          }
        }

        CalculateSingleScale(op->Type(), conn.first, var_name, *var_tensor,
                             is_unsigned);
      }
    };

    // handle outputs first so unsigned outputs could be inferred
    glambda(connections_out, true /* is_output */);
    glambda(connections_in, false /* is_output */);
  }

  return true;
}
// Computes the scale of a single variable and stores it in scales_ under
// `var_name`.
//
// The algorithm is the one the quantizer config returns for the pair
// (`op_type_name`, `conn_name`). ScaleAlgo::NONE leaves scales_ untouched.
// For any other algorithm the tensor must be non-empty (PADDLE_ENFORCE);
// an algorithm other than MAX, MAX_CH or KL raises std::runtime_error.
void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale(
    const std::string& op_type_name, const std::string& conn_name,
    const std::string& var_name, const LoDTensor& var_tensor,
    bool is_unsigned) {
  const auto algo = qconfig_->scale_algo(op_type_name, conn_name);
  if (algo == ScaleAlgo::NONE) {
    return;
  }

  PADDLE_ENFORCE(
      var_tensor.numel() > 0,
      "MkldnnQuantizer: LoDTensor of variable %s for quantization of op "
      "%s of connection %s should not be empty.",
      var_name, op_type_name, conn_name);

  if (algo == ScaleAlgo::MAX) {
    scales_[var_name] = GetMaxScalingFactor(var_tensor, is_unsigned);
  } else if (algo == ScaleAlgo::MAX_CH) {
    scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned);
  } else if (algo == ScaleAlgo::KL) {
    scales_[var_name] = GetKLScalingFactor(var_tensor, is_unsigned);
  } else {
    throw std::runtime_error(
        "MkldnnQuantizer: Unexpected ScaleAlgo specified.");
  }
}
std::vector<int> AnalysisPredictor::MkldnnQuantizer::ExpandQuantizedBins(
std::vector<int> quantized_bins, std::vector<int> reference_bins) const {
std::vector<int> expanded_quantized_bins(reference_bins.size(), 0);
int num_merged_bins = reference_bins.size() / quantized_bins.size();
int j_start = 0;
int j_end = num_merged_bins;
for (size_t idx = 0; idx < quantized_bins.size(); idx++) {
int zero_count =
std::count(&reference_bins[j_start], &reference_bins[j_end], 0);
num_merged_bins = j_end - j_start;
int avg_bin_ele;
if (zero_count == num_merged_bins) {
avg_bin_ele = 0;
} else {
avg_bin_ele = quantized_bins[idx] / (num_merged_bins - zero_count + 0.0);
}
for (int idx1 = j_start; idx1 < j_end; idx1++) {
expanded_quantized_bins[idx1] =
(reference_bins[idx1] == 0) ? 0 : avg_bin_ele;
}
j_start += num_merged_bins;
j_end += num_merged_bins;
if ((idx + 1) == quantized_bins.size() - 1) {
j_end = reference_bins.size();
}
}
return expanded_quantized_bins;
}
// Computes a single scale for `var_tensor` by choosing the histogram bin that
// minimises the value returned by SafeEntropy() between the clipped reference
// distribution and its 255-bin quantized approximation.
//
// A 2048-bin histogram is built over [min_val, max_val] when the tensor has
// no negative values, and over [-th, th] with th = max(|max_val|, |min_val|)
// otherwise. Candidate bin counts `i` are scanned from `starting_iter` to
// `ending_iter`; candidates whose last bin is empty are skipped.
//
// Returns (is_unsigned, scale_tensor) where scale_tensor holds one double:
// 1 / ((best_index + 0.5) * bin_width).
//
// If `is_unsigned` is true the tensor must not contain negative values
// (PADDLE_ENFORCE).
std::pair<bool, LoDTensor>
AnalysisPredictor::MkldnnQuantizer::GetKLScalingFactor(
    const LoDTensor& var_tensor, bool is_unsigned) const {
  ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
                                        var_tensor.numel(), 1};
  const int precision_hist_num_bins = 2048;
  const int num_quantized_bins = 255;
  const float max_val = eigen_tensor.maxCoeff();
  const float min_val = eigen_tensor.minCoeff();
  const bool is_positive = min_val >= 0.0f;
  if (is_unsigned) {
    PADDLE_ENFORCE(
        is_positive,
        "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
        min_val);
  }

  std::vector<int> hist;
  float bin_width;
  int starting_iter;
  int ending_iter = precision_hist_num_bins - 1;
  if (is_positive) {
    std::tie(hist, bin_width) =
        Histogram(var_tensor, min_val, max_val, precision_hist_num_bins);
    starting_iter = static_cast<int>(ending_iter * 0.7);
  } else {
    const float th = std::max(std::abs(max_val), std::abs(min_val));
    std::tie(hist, bin_width) =
        Histogram(var_tensor, -th, th, precision_hist_num_bins);
    starting_iter = 0;
    if (std::abs(max_val) > std::abs(min_val)) {
      // Skip the empty leading bins, then start 60% into what is left.
      while (starting_iter < ending_iter && hist[starting_iter] == 0) {
        ++starting_iter;
      }
      starting_iter += static_cast<int>((ending_iter - starting_iter) * 0.6);
    } else {
      // Drop the empty trailing bins, then start at 60% of the last used bin.
      while (ending_iter > 0 && hist[ending_iter] == 0) {
        --ending_iter;
      }
      starting_iter = static_cast<int>(0.6 * ending_iter);
    }
  }

  const int P_sum = static_cast<int>(eigen_tensor.size());
  // Kept as float: SafeEntropy() returns a float, and storing it in an int
  // would truncate it before the comparison below.
  float min_kl_divergence = 0.0f;
  int min_kl_index = 0;
  bool kl_inited = false;

  for (int i = starting_iter; i <= ending_iter; i++) {
    // A candidate needs at least one bin; with i == 0 the accesses to
    // element i - 1 below would be out of range.
    if (i < 1) continue;
    // Candidates whose last reference bin is empty are not evaluated.
    if (hist[i - 1] == 0) continue;

    // Reference distribution: the first i bins, with everything beyond them
    // folded into the last kept bin.
    std::vector<int> reference_distr_P(hist.begin(), hist.begin() + i);
    const int outliers_count =
        std::accumulate(hist.begin() + i, hist.end(), 0);
    reference_distr_P[i - 1] += outliers_count;

    // Merge the first i histogram bins (without outliers) into
    // num_quantized_bins groups; the last group takes the remainder.
    const int num_merged_bins = i / num_quantized_bins;
    std::vector<int> candidate_distr_Q_quantized(num_quantized_bins, 0);
    int j_start = 0;
    int j_end = num_merged_bins;
    for (int idx = 0; idx < num_quantized_bins; idx++) {
      candidate_distr_Q_quantized[idx] =
          std::accumulate(hist.begin() + j_start, hist.begin() + j_end, 0);
      j_start += num_merged_bins;
      j_end += num_merged_bins;
      if ((idx + 1) == num_quantized_bins - 1) {
        j_end = i;
      }
    }

    const std::vector<int> candidate_distr_Q =
        ExpandQuantizedBins(candidate_distr_Q_quantized, reference_distr_P);
    const int Q_sum =
        std::accumulate(candidate_distr_Q.begin(), candidate_distr_Q.end(), 0);
    const float kl_divergence =
        SafeEntropy(reference_distr_P, P_sum, candidate_distr_Q, Q_sum);

    if (!kl_inited || kl_divergence < min_kl_divergence) {
      min_kl_divergence = kl_divergence;
      min_kl_index = i;
      kl_inited = true;
    }
  }

  // No candidate was selected: fall back to the closest non-empty bin at or
  // below the start of the search window.
  if (min_kl_index == 0) {
    while (starting_iter > 0 && hist[starting_iter] == 0) {
      starting_iter -= 1;
    }
    min_kl_index = starting_iter;
  }

  LoDTensor scale_tensor;
  scale_tensor.Resize({1});
  auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
  scale_ptr[0] = 1.0 / ((min_kl_index + 0.5) * bin_width);

  return std::make_pair(is_unsigned, scale_tensor);
}
std::pair<bool, LoDTensor>
AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor(
const LoDTensor& var_tensor, bool is_unsigned) const {
ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
var_tensor.numel(), 1};
float max_abs = eigen_tensor.abs().maxCoeff();
float min_val = eigen_tensor.minCoeff();
if (is_unsigned)
PADDLE_ENFORCE(
min_val >= 0.0f,
"Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
min_val);
LoDTensor scale_tensor;
scale_tensor.Resize({1});
auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
scale_ptr[0] = 1.0 / max_abs;
return std::make_pair(is_unsigned, scale_tensor);
}
std::pair<bool, LoDTensor>
AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor(
const LoDTensor& var_tensor, bool is_unsigned) const {
PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty.");
ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
var_tensor.numel(), 1};
float min_val = eigen_tensor.minCoeff();
if (is_unsigned)
PADDLE_ENFORCE(
min_val >= 0.0f,
"Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
min_val);
int channels = var_tensor.dims()[0];
LoDTensor scale_tensor;
scale_tensor.Resize({channels});
auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
for (int i = 0; i < channels; ++i) {
const auto tensor = var_tensor.Slice(i, i + 1);
ConstEigenVectorArrayMap eigen_tensor{tensor.data<float>(), tensor.numel(),
1};
float max_abs = eigen_tensor.abs().maxCoeff();
scale_ptr[i] = 1.0 / max_abs;
}
return std::make_pair(is_unsigned, scale_tensor);
}
// Builds a histogram of the elements of `var_tensor` with `num_bins` bins of
// equal width covering [min_val, max_val].
//
// An element x is counted in bin floor((x - min_val) / bin_width), limited to
// the last bin so that x == max_val is still counted. When max_val equals
// min_val the bin width is zero and every element is counted in bin 0.
// Callers are expected to pass a range that contains every element.
//
// Returns (hist, bin_width). Requires num_bins > 0, a non-empty tensor and
// max_val >= min_val (all checked with PADDLE_ENFORCE).
std::pair<std::vector<int>, float>
AnalysisPredictor::MkldnnQuantizer::Histogram(
    const framework::LoDTensor& var_tensor, float min_val, float max_val,
    size_t num_bins) const {
  PADDLE_ENFORCE_GT(num_bins, 0,
                    "MkldnnQuantizer: To calculate Histogram, num_bins (" +
                        std::to_string(num_bins) + ") must be positive.");
  PADDLE_ENFORCE_GT(
      var_tensor.numel(), 0,
      "MkldnnQuantizer: To calculate Histogram, the tensor must not be empty.");
  PADDLE_ENFORCE(max_val >= min_val,
                 "MkldnnQuantizer: To calculate Histogram, max_val (" +
                     std::to_string(max_val) +
                     ") must be greater or equal "
                     "to min_val (" +
                     std::to_string(min_val) + ").");

  ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
                                        var_tensor.numel(), 1};
  const auto bin_width = std::abs(max_val - min_val) / num_bins;
  std::vector<int> hist(num_bins);

  for (int i = 0; i < eigen_tensor.size(); i++) {
    size_t bin = 0;
    // A zero width would divide by zero and convert NaN/inf to size_t,
    // which is undefined; such elements stay in bin 0.
    if (bin_width > 0) {
      bin = std::min(num_bins - 1,
                     static_cast<size_t>(
                         std::floor((eigen_tensor[i] - min_val) / bin_width)));
    }
    ++hist[bin];
  }

  return std::make_pair(std::move(hist), bin_width);
}
void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const {
auto& arg = predictor_.argument_;
if (!arg.scope_valid()) arg.SetScope(new framework::Scope);
arg.SetMainProgramNotOwned(predictor_.inference_program_.get());
auto graph = std::unique_ptr<Graph>(new Graph(arg.main_program()));
arg.SetMainGraph(graph.release());
arg.main_graph().Set(framework::ir::kParamScopeAttr,
new framework::Scope*(arg.scope_ptr()));
auto* builder = predictor_.config_.pass_builder();
builder->SetPasses({
"infer_clean_graph_pass", "cpu_quantize_pass", "cpu_quantize_squash_pass",
});
if (predictor_.config_.ir_debug_) builder->TurnOnDebug();
auto passes = builder->AllPasses();
predictor_.argument_.SetIrAnalysisPasses(passes);
predictor_.argument_.SetAnalysisPasses(
{"ir_analysis_pass", "memory_optimize_pass", "ir_graph_to_program_pass"});
predictor_.argument_.SetQuantVarScales(scales_);
}
// Runs the quantization steps in order: warmup, scale calculation, scope and
// executor setup, the quantize passes, and finally executor and feed/fetch
// preparation.
//
// Returns false as soon as the warmup, the scale calculation or the quantize
// passes report failure; returns true otherwise.
bool AnalysisPredictor::MkldnnQuantizer::Quantize() {
  // Scale calculation only runs if the warmup succeeded.
  const bool scales_ready = RunWarmup() && CalculateScales();
  if (!scales_ready) {
    return false;
  }

  predictor_.PrepareScope(predictor_.scope_);
  predictor_.CreateExecutor();

  const bool passes_ok = RunQuantizePasses();
  if (!passes_ok) {
    return false;
  }

  predictor_.PrepareExecutor();
  predictor_.PrepareFeedFetch();
  return true;
}
// Runs the analyzer with the quantization passes and swaps the predictor's
// inference program for the analyzed one.
//
// Variables are created in the sub-scope once before the analysis (fourth
// argument `true`) and once for the new program (fourth argument `false`).
// Always returns true; an invalid scope or a missing ir_analyzed_program
// field is reported through the enforce macros.
bool AnalysisPredictor::MkldnnQuantizer::RunQuantizePasses() const {
  auto& executor = predictor_.executor_;
  auto& program = predictor_.inference_program_;

  executor->CreateVariables(*program, 0, true, predictor_.sub_scope_);

  PrepareArgument();
  auto& argument = predictor_.argument_;
  Analyzer().Run(&argument);

  PADDLE_ENFORCE(argument.scope_valid());
  VLOG(5) << "to prepare executor";
  ARGUMENT_CHECK_FIELD((&argument), ir_analyzed_program);

  // Replace the program with a copy of the one the analysis produced.
  program.reset(new framework::ProgramDesc(argument.ir_analyzed_program()));
  LOG(INFO) << "== optimize 2 end ==";

  executor->CreateVariables(*program, 0, false, predictor_.sub_scope_);
  return true;
}
// Runs one inference iteration on the warmup data from the quantizer config,
// using the configured warmup batch size.
//
// Returns the result of the predictor run, so a failed warmup stops
// Quantize() instead of being reported as success. The warmup data must not
// be null (PADDLE_ENFORCE_NOT_NULL).
bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const {
  VLOG(3) << "Predictor: run a quantization warmup iteration";
  auto warmup_data = qconfig_->warmup_data();

  PADDLE_ENFORCE_NOT_NULL(warmup_data,
                          "Warmup data cannot be NULL in the config.");
  PrettyLogH1("--- Running warmup iteration for quantization");

  // Run the inference program; the outputs themselves are not used here.
  std::vector<PaddleTensor> output_slots;
  return predictor_.Run(*warmup_data, &output_slots,
                        qconfig_->warmup_batch_size());
}
// Computes the divergence between a reference histogram P and a candidate
// histogram Q of the same length:
//
//   sum over bins with p != 0 of  p * (log(Q_sum * p) - log(P_sum * q))
//   ------------------------------------------------------------------
//                                 P_sum
//
// Bins with p == 0 contribute nothing. A bin with p != 0 and q == 0 is a
// fatal error (PADDLE_ENFORCE), as is a size mismatch between the two
// histograms (PADDLE_ENFORCE_EQ).
float AnalysisPredictor::MkldnnQuantizer::SafeEntropy(
    std::vector<int> reference_distr_P, int P_sum,
    std::vector<int> candidate_distr_Q, int Q_sum) const {
  PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size());

  // Accumulate in double: the two sums are large and nearly equal, and the
  // result is their difference.
  double tmp_sum1 = 0;
  double tmp_sum2 = 0;
  for (size_t idx = 0; idx < reference_distr_P.size(); idx++) {
    const int p_idx = reference_distr_P[idx];
    const int q_idx = candidate_distr_Q[idx];

    // Skip empty reference bins entirely: evaluating 0 * log(0) would give
    // NaN and poison both sums.
    if (p_idx == 0) continue;

    PADDLE_ENFORCE(q_idx != 0, "MkldnnQuantizer: Fatal error!, idx = " +
                                   std::to_string(idx) +
                                   " qindex = 0! p_idx = " +
                                   std::to_string(p_idx));

    // The products are formed in double because Q_sum * p_idx and
    // P_sum * q_idx can exceed the range of int.
    tmp_sum1 += p_idx * std::log(static_cast<double>(Q_sum) * p_idx);
    tmp_sum2 += p_idx * std::log(static_cast<double>(P_sum) * q_idx);
  }
  return static_cast<float>((tmp_sum1 - tmp_sum2) / P_sum);
}
} // namespace paddle