C-API quantization core 2 (#16396)
* C-API quantization core test=develop Co-authored-by: Sylwester Fraczek <sylwester.fraczek@intel.com> * Decouple Quantizer from AnalysisPredictor test=develop * fixes after review test=develop * renamed mkldnn quantize stuff test=develop * remove ifdef from header file test=develop move-code
parent
e41d581304
commit
09dfc7a2aa
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,104 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "paddle/fluid/framework/naive_executor.h"
|
||||
#include "paddle/fluid/inference/analysis/analyzer.h"
|
||||
#include "paddle/fluid/inference/api/analysis_predictor.h"
|
||||
#include "paddle/fluid/inference/api/api_impl.h"
|
||||
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
|
||||
#include "paddle/fluid/inference/api/helper.h"
|
||||
#include "paddle/fluid/inference/api/paddle_inference_api.h"
|
||||
#include "paddle/fluid/string/printf.h"
|
||||
#ifdef PADDLE_WITH_TESTING
|
||||
#include <gtest/gtest.h>
|
||||
#include <gtest/gtest_prod.h>
|
||||
#endif
|
||||
|
||||
namespace paddle {
|
||||
|
||||
/*
|
||||
* Map variable name to tensor of scaling factors scaling it to MAX=1.0.
|
||||
* bool denotes whether quantization of the variable should be done to unsigned
|
||||
* type.
|
||||
*/
|
||||
using VarQuantScale =
|
||||
std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>;
|
||||
|
||||
/*
 * Quantizer nested in AnalysisPredictor. It keeps a reference to the
 * predictor it was created for, shares ownership of a MkldnnQuantizerConfig
 * and stores the per-variable scales in a VarQuantScale map.
 *
 * Only the constructor is defined here; every other member function is
 * declared in this header and defined elsewhere.
 */
class AnalysisPredictor::MkldnnQuantizer {
 public:
  // Binds the quantizer to `predictor` and `qconfig`.
  // `predictor` is stored as a reference, so it has to outlive this object;
  // `qconfig` is copied as a shared_ptr (shared ownership).
  explicit MkldnnQuantizer(
      AnalysisPredictor& predictor,  // NOLINT
      const std::shared_ptr<MkldnnQuantizerConfig>& qconfig)
      : predictor_(predictor), qconfig_(qconfig) {}

  // Execute full quantization procedure.
  bool Quantize();

// Uses `#ifdef`, like the guard around the gtest includes at the top of this
// header, so the check is well-formed even when the macro is defined empty.
#ifdef PADDLE_WITH_TESTING
  friend class MkldnnQuantizerTest;
#endif

 private:
  // Run single warmup iteration
  bool RunWarmup() const;

  // Gather data from variables and calculate scales for them.
  bool CalculateScales();

  // Calculate a scale for tensor based on ScaleAlgo rules.
  void CalculateSingleScale(const std::string& op_name,
                            const std::string& conn_name,
                            const std::string& var_name,
                            const framework::LoDTensor& var_tensor,
                            bool is_unsigned);

  // NOTE(review): undocumented in the original; presumably prepares the
  // analysis argument before the passes are run - confirm in the definition.
  void PrepareArgument() const;

  // NOTE(review): undocumented in the original; presumably runs the
  // quantization passes and reports success - confirm in the definition.
  bool RunQuantizePasses() const;

  // NOTE(review): undocumented in the original; both vectors are taken by
  // value. Semantics have to be checked against the definition.
  std::vector<int> ExpandQuantizedBins(std::vector<int> quantized_bins,
                                       std::vector<int> reference_bins) const;

  // Using the KL-divergence method get the most precise scaling factor.
  // The result has the same (bool, tensor) layout as a VarQuantScale entry.
  std::pair<bool, framework::LoDTensor> GetKLScalingFactor(
      const framework::LoDTensor& var_tensor, bool is_unsigned) const;

  // NOTE(review): presumably the ScaleAlgo::MAX_CH counterpart of
  // GetKLScalingFactor (maximum absolute value per channel) - confirm.
  std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor(
      const framework::LoDTensor& var_tensor, bool is_unsigned) const;

  // NOTE(review): presumably the ScaleAlgo::MAX counterpart of
  // GetKLScalingFactor (maximum absolute value) - confirm.
  std::pair<bool, framework::LoDTensor> GetMaxScalingFactor(
      const framework::LoDTensor& var_tensor, bool is_unsigned) const;

  // Returns histogram and bin width
  std::pair<std::vector<int>, float> Histogram(
      const framework::LoDTensor& var_tensor, float min_val, float max_val,
      size_t num_bins = 2048) const;

  // Calculate the entropy.
  float SafeEntropy(std::vector<int> reference_distr_P, int P_sum,
                    std::vector<int> candidate_distr_Q, int Q_sum) const;

  // Predictor this quantizer works for (not owned).
  AnalysisPredictor& predictor_;
  // Quantization settings shared with whoever created the quantizer.
  const std::shared_ptr<MkldnnQuantizerConfig> qconfig_;

  // A map: variable name -> scale
  VarQuantScale scales_;
};
|
||||
|
||||
} // namespace paddle
|
@ -0,0 +1,40 @@
|
||||
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h"
|
||||
|
||||
namespace paddle {
|
||||
|
||||
/**
 * Fill `rules_` with the default scale-computing algorithm for every known
 * connection (input/output) of the `conv2d` and `pool2d` operator types.
 */
MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
  // The default configuration of scale computing algorithms.
  rules_["conv2d"]["Input"] = ScaleAlgo::KL;
  rules_["conv2d"]["Filter"] = ScaleAlgo::MAX_CH;
  rules_["conv2d"]["Bias"] = ScaleAlgo::NONE;  // do not compute scale
  rules_["conv2d"]["ResidualData"] = ScaleAlgo::KL;
  // NOTE(review): the outputs below were previously annotated with
  // "do not compute scale" although they are assigned ScaleAlgo::KL. The
  // assigned values are kept as they were - confirm which one is intended.
  rules_["conv2d"]["Output"] = ScaleAlgo::KL;

  rules_["pool2d"]["X"] = ScaleAlgo::KL;
  rules_["pool2d"]["Out"] = ScaleAlgo::KL;
}
|
||||
|
||||
/**
 * Look up the scale algorithm configured for a connection of an operator
 * type.
 * @param op_type_name the operator's name.
 * @param conn_name name of the connection (input/output) of the operator.
 * @return the rule stored in `rules_` for (op_type_name, conn_name), or
 * `default_scale_algo_` when either key has no entry.
 */
ScaleAlgo MkldnnQuantizerConfig::scale_algo(
    const std::string& op_type_name, const std::string& conn_name) const {
  // One lookup per level; the inner map is accessed by reference instead of
  // being copied on every call.
  const auto op_it = rules_.find(op_type_name);
  if (op_it != rules_.end()) {
    const auto& conn_rules = op_it->second;
    const auto conn_it = conn_rules.find(conn_name);
    if (conn_it != conn_rules.end()) return conn_it->second;
  }
  return default_scale_algo_;
}
|
||||
|
||||
} // namespace paddle
|
@ -0,0 +1,105 @@
|
||||
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
|
||||
|
||||
#include "paddle_api.h" // NOLINT
|
||||
|
||||
namespace paddle {
|
||||
|
||||
// Algorithms for finding scale of quantized Tensors.
enum class ScaleAlgo {
  NONE,    // Do not compute scale
  MAX,     // Find scale based on the maximum absolute value
  MAX_CH,  // Find scale based on the maximum absolute value per channel
  KL,      // Find scale based on KL Divergence
};
|
||||
|
||||
struct MkldnnQuantizerConfig {
|
||||
MkldnnQuantizerConfig();
|
||||
|
||||
/** Specify a quantization algorithm for a connection (input/output) of the
|
||||
* operator type.
|
||||
* @param op_type_name the operator's name.
|
||||
* @param conn_name name of the connection (input/output) of the operator.
|
||||
* @param algo the algorithm for computing scale.
|
||||
*/
|
||||
void SetScaleAlgo(std::string op_type_name, std::string conn_name,
|
||||
ScaleAlgo algo) {
|
||||
rules_[op_type_name][conn_name] = algo;
|
||||
}
|
||||
|
||||
/** Get the quantization algorithm for a connection (input/output) of the
|
||||
* operator type.
|
||||
* @param op_type_name the operator's name.
|
||||
* @param conn_name name of the connection (input/output) of the operator.
|
||||
* @return the algorithm for computing scale.
|
||||
*/
|
||||
ScaleAlgo scale_algo(const std::string& op_type_name,
|
||||
const std::string& conn_name) const;
|
||||
|
||||
/** Set the batch of data to be used for warm-up iteration.
|
||||
* @param data batch of data.
|
||||
*/
|
||||
void SetWarmupData(std::shared_ptr<std::vector<PaddleTensor>> data) {
|
||||
warmup_data_ = data;
|
||||
}
|
||||
|
||||
/** Get the batch of data used for warm-up iteration.
|
||||
* @return batch of data.
|
||||
*/
|
||||
std::shared_ptr<std::vector<PaddleTensor>> warmup_data() const {
|
||||
return warmup_data_;
|
||||
}
|
||||
|
||||
void SetWarmupBatchSize(int batch_size) { warmup_bs_ = batch_size; }
|
||||
|
||||
int warmup_batch_size() const { return warmup_bs_; }
|
||||
|
||||
void SetEnabledOpTypes(std::unordered_set<std::string> op_list) {
|
||||
enabled_op_types_ = op_list;
|
||||
}
|
||||
|
||||
const std::unordered_set<std::string>& enabled_op_types() const {
|
||||
return enabled_op_types_;
|
||||
}
|
||||
|
||||
void SetExcludedOpIds(std::unordered_set<int> op_ids_list) {
|
||||
excluded_op_ids_ = op_ids_list;
|
||||
}
|
||||
|
||||
const std::unordered_set<int>& excluded_op_ids() const {
|
||||
return excluded_op_ids_;
|
||||
}
|
||||
|
||||
void SetDefaultScaleAlgo(ScaleAlgo algo) { default_scale_algo_ = algo; }
|
||||
|
||||
ScaleAlgo default_scale_algo() const { return default_scale_algo_; }
|
||||
|
||||
protected:
|
||||
std::map<std::string, std::map<std::string, ScaleAlgo>> rules_;
|
||||
std::unordered_set<std::string> enabled_op_types_;
|
||||
std::unordered_set<int> excluded_op_ids_;
|
||||
std::shared_ptr<std::vector<PaddleTensor>> warmup_data_;
|
||||
int warmup_bs_{1};
|
||||
ScaleAlgo default_scale_algo_{ScaleAlgo::MAX};
|
||||
};
|
||||
|
||||
} // namespace paddle
|
Loading…
Reference in new issue