@@ -17,10 +17,12 @@
#include <gtest/gtest.h>

#include <algorithm>
#include <functional>
#include <memory>
#include <string>
#include <thread>  // NOLINT
#include <unordered_map>
#include <utility>
#include <vector>
#ifdef WITH_GPERFTOOLS
#include <gperftools/profiler.h>
@@ -48,6 +50,7 @@ DEFINE_bool(ernie_large, false, "Test ernie large");
DEFINE_bool(with_accuracy_layer, true,
            "Calculate the accuracy while label is in the input");
DEFINE_bool(enable_fp32, true, "Enable FP32 type prediction");
DEFINE_bool(enable_bf16, true, "Enable BF16 type prediction");
DEFINE_bool(enable_int8, true, "Enable INT8 type prediction");
DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup");
// setting iterations to 0 means processing the whole dataset
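
// The enable_fp32 / enable_bf16 / enable_int8 flags gate which precision runs
// the comparison helpers below actually execute, and warmup_batch_size sets
// how many samples GetWarmupData() packs into the quantization warmup batch.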
@@ -124,6 +127,123 @@ class Barrier {
  std::size_t _count;
};
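
// TensorReader streams fixed-shape batches of raw values of type T from an
// already opened binary file: each NextBatch() reads numel_ * sizeof(T) bytes
// starting at the current offset and wraps them in a PaddleTensor.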

template <typename T>
class TensorReader {
 public:
  TensorReader(std::ifstream &file, size_t beginning_offset,
               std::vector<int> shape, std::string name)
      : file_(file), position_(beginning_offset), shape_(shape), name_(name) {
    numel_ = std::accumulate(shape_.begin(), shape_.end(), size_t{1},
                             std::multiplies<size_t>());
  }

  PaddleTensor NextBatch() {
    PaddleTensor tensor;
    tensor.name = name_;
    tensor.shape = shape_;
    tensor.dtype = GetPaddleDType<T>();
    tensor.data.Resize(numel_ * sizeof(T));

    file_.seekg(position_);
    file_.read(static_cast<char *>(tensor.data.data()), numel_ * sizeof(T));
    position_ = file_.tellg();

    if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream";
    if (file_.fail())
      throw std::runtime_error(name_ + ": failed reading file.");

    return tensor;
  }

 protected:
  std::ifstream &file_;
  size_t position_;
  std::vector<int> shape_;
  std::string name_;
  size_t numel_;
};
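
// GetWarmupData packs the first num_images samples of the already loaded test
// data into a single {image, label} batch used for quantization warmup (see
// FLAGS_warmup_batch_size); num_images must not exceed the total number of
// test samples.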

std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
    const std::vector<std::vector<PaddleTensor>> &test_data,
    int num_images = FLAGS_warmup_batch_size) {
  int test_data_batch_size = test_data[0][0].shape[0];
  auto iterations = test_data.size();
  auto all_test_data_size = iterations * test_data_batch_size;
  PADDLE_ENFORCE_LE(static_cast<size_t>(num_images), all_test_data_size,
                    platform::errors::InvalidArgument(
                        "The requested quantization warmup data size must be "
                        "lower than or equal to the test data size. But the "
                        "received warmup size is %d and the test data size is "
                        "%d. Please use the --warmup_batch_size parameter to "
                        "set a smaller warmup batch size.",
                        num_images, all_test_data_size));

  PaddleTensor images;
  images.name = "image";
  images.shape = {num_images, 3, 224, 224};
  images.dtype = PaddleDType::FLOAT32;
  images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);

  PaddleTensor labels;
  labels.name = "label";
  labels.shape = {num_images, 1};
  labels.dtype = PaddleDType::INT64;
  labels.data.Resize(sizeof(int64_t) * num_images);

  for (int i = 0; i < num_images; i++) {
    auto batch = i / test_data_batch_size;
    auto element_in_batch = i % test_data_batch_size;
    std::copy_n(static_cast<float *>(test_data[batch][0].data.data()) +
                    element_in_batch * 3 * 224 * 224,
                3 * 224 * 224,
                static_cast<float *>(images.data.data()) + i * 3 * 224 * 224);

    std::copy_n(static_cast<int64_t *>(test_data[batch][1].data.data()) +
                    element_in_batch,
                1, static_cast<int64_t *>(labels.data.data()) + i);
  }

  auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(2);
  (*warmup_data)[0] = std::move(images);
  (*warmup_data)[1] = std::move(labels);
  return warmup_data;
}
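
// SetInputs reads batches straight from the FLAGS_infer_data binary file. The
// layout it expects (as implied by the offsets computed below) is: an int64_t
// image count, followed by all images as float32 NCHW 3x224x224 data, followed
// by all labels as int64_t values.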

void SetInputs(std::vector<std::vector<PaddleTensor>> *inputs,
               int32_t batch_size = FLAGS_batch_size) {
  std::ifstream file(FLAGS_infer_data, std::ios::binary);
  if (!file) {
    FAIL() << "Couldn't open file: " << FLAGS_infer_data;
  }

  int64_t total_images{0};
  file.read(reinterpret_cast<char *>(&total_images), sizeof(total_images));
  LOG(INFO) << "Total images in file: " << total_images;

  std::vector<int> image_batch_shape{batch_size, 3, 224, 224};
  std::vector<int> label_batch_shape{batch_size, 1};
  auto images_offset_in_file = static_cast<size_t>(file.tellg());
  auto labels_offset_in_file =
      images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;

  TensorReader<float> image_reader(file, images_offset_in_file,
                                   image_batch_shape, "image");
  TensorReader<int64_t> label_reader(file, labels_offset_in_file,
                                     label_batch_shape, "label");

  auto iterations_max = total_images / batch_size;
  auto iterations = iterations_max;
  if (FLAGS_iterations > 0 && FLAGS_iterations < iterations_max) {
    iterations = FLAGS_iterations;
  }
  for (auto i = 0; i < iterations; i++) {
    auto images = image_reader.NextBatch();
    auto labels = label_reader.NextBatch();
    inputs->emplace_back(
        std::vector<PaddleTensor>{std::move(images), std::move(labels)});
  }
}
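
// How these helpers are typically wired together in an INT8 image
// classification test (a rough sketch; q_cfg and the exact quantizer setup are
// assumptions, not part of this file):
//   std::vector<std::vector<PaddleTensor>> input_slots_all;
//   SetInputs(&input_slots_all);
//   q_cfg.EnableMkldnnQuantizer();
//   q_cfg.mkldnn_quantizer_config()->SetWarmupData(
//       GetWarmupData(input_slots_all));
//   q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(
//       FLAGS_warmup_batch_size);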

// Compare results between two PaddleTensor vectors
void CompareResult(const std::vector<PaddleTensor> &outputs,
                   const std::vector<PaddleTensor> &ref_outputs) {
@@ -555,10 +675,10 @@ void SummarizePerformance(const char *title, float sample) {
            << " ms";
}

void SummarizePerformance(float sample_latency_fp32,
                          float sample_latency_int8) {
  if (FLAGS_enable_fp32) SummarizePerformance("FP32", sample_latency_fp32);
  if (FLAGS_enable_int8) SummarizePerformance("INT8", sample_latency_int8);
void SummarizePerformance(const char *title_fp32, float sample_latency_fp32,
                          const char *title, float sample_latency) {
  SummarizePerformance(title_fp32, sample_latency_fp32);
  SummarizePerformance(title, sample_latency);
}
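
// The (title, latency) pairs let one summary helper serve both the FP32 vs
// INT8 and the FP32 vs BF16 comparisons below, instead of an overload
// hard-wired to the FP32/INT8 flags.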

float CompareAccuracyOne(
@@ -708,11 +828,51 @@ void CompareQuantizedAndAnalysis(
    TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true,
                            VarType::INT8, &sample_latency_int8);
  }
  SummarizePerformance(sample_latency_fp32, sample_latency_int8);
  SummarizePerformance("FP32", sample_latency_fp32, "INT8",
                       sample_latency_int8);

  CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx);
}
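
// CompareBFloat16AndAnalysis mirrors CompareQuantizedAndAnalysis: it runs the
// same inputs once through the plain FP32 config and once through the
// BF16-enabled config (single thread), prints both latencies, and compares the
// accuracy of the output at compared_idx.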

void CompareBFloat16AndAnalysis(
    const AnalysisConfig *config, const AnalysisConfig *qconfig,
    const std::vector<std::vector<PaddleTensor>> &inputs,
    const int compared_idx = 1) {
  PADDLE_ENFORCE_EQ(
      inputs[0][0].shape[0], FLAGS_batch_size,
      platform::errors::InvalidArgument(
          "Input data has to be packed batch by batch. The batch size is set "
          "to %d, but the real input batch size is %d.",
          FLAGS_batch_size, inputs[0][0].shape[0]));
  LOG(INFO) << "FP32 & BF16 prediction run: batch_size " << FLAGS_batch_size;

  LOG(INFO) << "--- FP32 prediction start ---";
  auto *cfg = reinterpret_cast<const PaddlePredictor::Config *>(config);
  PrintConfig(cfg, true);
  std::vector<std::vector<PaddleTensor>> analysis_outputs;
  float sample_latency_fp32{-1};

  if (FLAGS_enable_fp32) {
    TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32,
                            &sample_latency_fp32);
  }

  LOG(INFO) << "--- BF16 prediction start ---";
  auto *qcfg = reinterpret_cast<const PaddlePredictor::Config *>(qconfig);
  PrintConfig(qcfg, true);
  std::vector<std::vector<PaddleTensor>> bf16_outputs;
  float sample_latency_bf16{-1};

  if (FLAGS_enable_bf16) {
    TestOneThreadPrediction(qcfg, inputs, &bf16_outputs, true, VarType::FP32,
                            &sample_latency_bf16);
  }
  SummarizePerformance("FP32", sample_latency_fp32, "BF16",
                       sample_latency_bf16);

  CompareAccuracy(bf16_outputs, analysis_outputs, compared_idx);
}
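
// A caller typically builds two AnalysisConfigs that differ only in BF16 being
// enabled (a sketch; the exact model and config setup here is an assumption,
// not part of this file):
//   AnalysisConfig fp32_cfg, bf16_cfg;
//   // ... set model dir, CPU/MKLDNN options on both ...
//   bf16_cfg.EnableMkldnnBfloat16();
//   CompareBFloat16AndAnalysis(&fp32_cfg, &bf16_cfg, input_slots_all);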

void CompareAnalysisAndAnalysis(
    const AnalysisConfig *config1, const AnalysisConfig *config2,
    const std::vector<std::vector<PaddleTensor>> &inputs,
@@ -749,7 +909,8 @@ void CompareAnalysisAndAnalysis(
    TestOneThreadPrediction(cfg2, inputs, &int8_outputs, true, VarType::INT8,
                            &sample_latency_int8);
  }
  SummarizePerformance(sample_latency_fp32, sample_latency_int8);
  SummarizePerformance("FP32", sample_latency_fp32, "INT8",
                       sample_latency_int8);
  if (with_accuracy_layer) {
    CompareAccuracy(int8_outputs, analysis_outputs, compared_idx);
  }