|
|
|
@ -22,10 +22,54 @@
|
|
|
|
|
#include "paddle/fluid/platform/device_tracer.h"
|
|
|
|
|
#include "paddle/fluid/platform/place.h"
|
|
|
|
|
#include "paddle/fluid/platform/port.h"
|
|
|
|
|
#include "paddle/fluid/platform/variant.h" // for UNUSED
|
|
|
|
|
|
|
|
|
|
// --burning: defines FLAGS_burning (int32, default 10); main() logs its value.
// NOTE(review): presumably the number of untimed warm-up runs performed
// before measurement; the timing loops are outside this view -- confirm.
DEFINE_int32(burning, 10, "Burning times.");
|
|
|
|
|
// --repeat: defines FLAGS_repeat (int32, default 3000); main() logs its value.
// NOTE(review): presumably the number of timed iterations per kernel; the
// timing loops are outside this view -- confirm.
DEFINE_int32(repeat, 3000, "Repeat times.");
|
|
|
|
|
// --max_size: defines FLAGS_max_size (int32, default 1000).
// NOTE(review): presumably an upper bound on the problem sizes the Bench*
// helpers iterate over; its readers are outside this view -- confirm.
DEFINE_int32(max_size, 1000, "The Max size would be tested.");
|
|
|
|
|
// --filter: defines FLAGS_filter (string, default empty). RUN_ALL_BENCHMARK()
// runs every registered case when it is empty; otherwise it runs only the
// cases whose Name() compares equal to this value.
DEFINE_string(filter, "", "The Benchmark name would be run.");
|
|
|
|
|
|
|
|
|
|
// Abstract interface of one registered benchmark case.
//
// Concrete cases are generated by the BENCH_JITKERNEL macro, which overrides
// all four pure virtual functions. The destructor is virtual so an instance
// may be destroyed through a BenchJITKernel pointer.
class BenchJITKernel {
 public:
  BenchJITKernel() = default;
  virtual ~BenchJITKernel() = default;

  // Identification strings of the case. RUN_ALL_BENCHMARK() compares Name()
  // against --filter and logs the three values as "<name>.<dtype>.<place>".
  virtual const char* Name() = 0;
  virtual const char* Dtype() = 0;
  virtual const char* Place() = 0;

  // Executes the benchmark body of this case.
  virtual void Run() = 0;
};
|
|
|
|
|
|
|
|
|
|
// File-local registry of benchmark cases. InsertBenchmark() appends to it and
// RUN_ALL_BENCHMARK() iterates over it. It stores raw pointers; no code in
// the visible part of this file deletes them.
static std::vector<BenchJITKernel*> g_all_benchmarks;
|
|
|
|
|
|
|
|
|
|
// Appends `b` to the global benchmark registry and returns that same pointer,
// which lets the call serve as the initialiser of a static variable (see
// BENCH_JITKERNEL). The pointer is stored as given.
BenchJITKernel* InsertBenchmark(BenchJITKernel* b) {
  g_all_benchmarks.emplace_back(b);
  return g_all_benchmarks.back();
}
|
|
|
|
|
|
|
|
|
|
// Declares and registers one benchmark case.
//
// BENCH_JITKERNEL(name, dtype, place) expands to three things, in order:
//   1. class BenchJITKernel_<name>_<dtype>_<place>_, derived from
//      BenchJITKernel, whose Name()/Dtype()/Place() return the stringified
//      macro arguments;
//   2. a static variable initialised with InsertBenchmark(new <class>()), so
//      the case is appended to g_all_benchmarks when that static is
//      initialised (UNUSED marks the otherwise unreferenced variable);
//   3. the header of the out-of-line definition of <class>::Run().
// Because of (3) the macro invocation must be followed directly by the
// `{ ... }` block that forms the body of Run().
#define BENCH_JITKERNEL(name, dtype, place) \
  class BenchJITKernel_##name##_##dtype##_##place##_ : public BenchJITKernel { \
   public: \
    void Run() override; \
    const char* Name() override { return #name; } \
    const char* Dtype() override { return #dtype; } \
    const char* Place() override { return #place; } \
  }; \
  static auto inserted_##name##_##dtype##_##place##_ UNUSED = \
      InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \
  void BenchJITKernel_##name##_##dtype##_##place##_::Run()
|
|
|
|
|
|
|
|
|
|
// Shorthand for BENCH_JITKERNEL with dtype fixed to FP32 and place fixed to
// CPU; like BENCH_JITKERNEL it must be followed by the `{ ... }` body of Run().
#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU)
|
|
|
|
|
|
|
|
|
|
void RUN_ALL_BENCHMARK() {
|
|
|
|
|
for (auto p : g_all_benchmarks) {
|
|
|
|
|
if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
LOG(INFO) << "Benchmark " << p->Name() << "." << p->Dtype() << "."
|
|
|
|
|
<< p->Place();
|
|
|
|
|
p->Run();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
|
|
|
|
@ -228,49 +272,70 @@ void BenchMatMulKernel() {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Element type passed to the Bench*Kernel helpers by the FP32 cases below.
using T = float;
|
|
|
|
|
// Place type passed to the Bench*Kernel helpers by the CPU cases below.
using PlaceType = paddle::platform::CPUPlace;
|
|
|
|
|
|
|
|
|
|
// xyzn
|
|
|
|
|
// Registers case "kVMul" (select with --filter=kVMul); the braces are the
// body of BenchJITKernel_kVMul_FP32_CPU_::Run().
BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// Registers case "kVAdd" (select with --filter=kVAdd); the braces are the
// body of BenchJITKernel_kVAdd_FP32_CPU_::Run().
BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// Registers case "kVAddRelu" (select with --filter=kVAddRelu); the braces are
// the body of BenchJITKernel_kVAddRelu_FP32_CPU_::Run().
BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// Registers case "kVSub" (select with --filter=kVSub); the braces are the
// body of BenchJITKernel_kVSub_FP32_CPU_::Run().
BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// axyn
|
|
|
|
|
// Registers case "kVScal" (select with --filter=kVScal); the braces are the
// body of BenchJITKernel_kVScal_FP32_CPU_::Run().
BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// Registers case "kVAddBias" (select with --filter=kVAddBias); the braces are
// the body of BenchJITKernel_kVAddBias_FP32_CPU_::Run().
BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// xyn
|
|
|
|
|
// Registers case "kVRelu" (select with --filter=kVRelu); the braces are the
// body of BenchJITKernel_kVRelu_FP32_CPU_::Run().
BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// Registers case "kVIdentity" (select with --filter=kVIdentity); the braces
// are the body of BenchJITKernel_kVIdentity_FP32_CPU_::Run().
BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// Registers case "kVSquare" (select with --filter=kVSquare); the braces are
// the body of BenchJITKernel_kVSquare_FP32_CPU_::Run().
BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// Registers case "kVExp" (select with --filter=kVExp); the braces are the
// body of BenchJITKernel_kVExp_FP32_CPU_::Run().
BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// Registers case "kVSigmoid" (select with --filter=kVSigmoid); the braces are
// the body of BenchJITKernel_kVSigmoid_FP32_CPU_::Run().
BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// Registers case "kVTanh" (select with --filter=kVTanh); the braces are the
// body of BenchJITKernel_kVTanh_FP32_CPU_::Run().
BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// lstm and peephole
|
|
|
|
|
// Registers case "kLSTMCtHt" (select with --filter=kLSTMCtHt); the braces are
// the body of BenchJITKernel_kLSTMCtHt_FP32_CPU_::Run().
BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// Registers case "kLSTMC1H1" (select with --filter=kLSTMC1H1); the braces are
// the body of BenchJITKernel_kLSTMC1H1_FP32_CPU_::Run().
BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// gru functions
|
|
|
|
|
// Registers case "kGRUH1" (select with --filter=kGRUH1); the braces are the
// body of BenchJITKernel_kGRUH1_FP32_CPU_::Run().
BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// Registers case "kGRUHtPart1" (select with --filter=kGRUHtPart1); the braces
// are the body of BenchJITKernel_kGRUHtPart1_FP32_CPU_::Run().
BENCH_FP32_CPU(kGRUHtPart1) {
  BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
}
|
|
|
|
|
|
|
|
|
|
// Registers case "kGRUHtPart2" (select with --filter=kGRUHtPart2); the braces
// are the body of BenchJITKernel_kGRUHtPart2_FP32_CPU_::Run().
BENCH_FP32_CPU(kGRUHtPart2) {
  BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
}
|
|
|
|
|
|
|
|
|
|
// seq pool function
|
|
|
|
|
// Registers case "kSeqPool" (select with --filter=kSeqPool); the braces are
// the body of BenchJITKernel_kSeqPool_FP32_CPU_::Run().
BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// matmul
|
|
|
|
|
// Registers case "kMatMul" (select with --filter=kMatMul); the braces are the
// body of BenchJITKernel_kMatMul_FP32_CPU_::Run().
BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, PlaceType>(); }
|
|
|
|
|
|
|
|
|
|
// Benchmark all jit kernels including jitcode, mkl and refer.
|
|
|
|
|
// To use this tool, run command: ./benchmark [options...]
|
|
|
|
|
// Options:
|
|
|
|
|
// --burning: the burning time before count
|
|
|
|
|
// --repeat: the repeat times
|
|
|
|
|
// --max_size: the max size would be tested
|
|
|
|
|
// --filter: the bench name would be run
|
|
|
|
|
int main(int argc, char* argv[]) {
|
|
|
|
|
gflags::ParseCommandLineFlags(&argc, &argv, true);
|
|
|
|
|
google::InitGoogleLogging(argv[0]);
|
|
|
|
|
LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat
|
|
|
|
|
<< " times.";
|
|
|
|
|
using T = float;
|
|
|
|
|
using PlaceType = paddle::platform::CPUPlace;
|
|
|
|
|
// xyzn
|
|
|
|
|
BenchXYZNKernel<jit::kVMul, T, PlaceType>();
|
|
|
|
|
BenchXYZNKernel<jit::kVAdd, T, PlaceType>();
|
|
|
|
|
BenchXYZNKernel<jit::kVAddRelu, T, PlaceType>();
|
|
|
|
|
BenchXYZNKernel<jit::kVSub, T, PlaceType>();
|
|
|
|
|
|
|
|
|
|
// axyn
|
|
|
|
|
BenchAXYNKernel<jit::kVScal, T, PlaceType>();
|
|
|
|
|
BenchAXYNKernel<jit::kVAddBias, T, PlaceType>();
|
|
|
|
|
|
|
|
|
|
// xyn
|
|
|
|
|
BenchXYNKernel<jit::kVRelu, T, PlaceType>();
|
|
|
|
|
BenchXYNKernel<jit::kVIdentity, T, PlaceType>();
|
|
|
|
|
BenchXYNKernel<jit::kVSquare, T, PlaceType>();
|
|
|
|
|
BenchXYNKernel<jit::kVExp, T, PlaceType>();
|
|
|
|
|
BenchXYNKernel<jit::kVSigmoid, T, PlaceType>();
|
|
|
|
|
BenchXYNKernel<jit::kVTanh, T, PlaceType>();
|
|
|
|
|
|
|
|
|
|
// lstm and peephole
|
|
|
|
|
BenchLSTMKernel<jit::kLSTMCtHt, T, PlaceType>();
|
|
|
|
|
BenchLSTMKernel<jit::kLSTMC1H1, T, PlaceType>();
|
|
|
|
|
|
|
|
|
|
// gru functions
|
|
|
|
|
BenchGRUKernel<jit::kGRUH1, T, PlaceType>();
|
|
|
|
|
BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
|
|
|
|
|
BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
|
|
|
|
|
|
|
|
|
|
// seq pool function
|
|
|
|
|
BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>();
|
|
|
|
|
|
|
|
|
|
// matmul
|
|
|
|
|
BenchMatMulKernel<jit::kMatMul, T, PlaceType>();
|
|
|
|
|
RUN_ALL_BENCHMARK();
|
|
|
|
|
}
|
|
|
|
|