|
|
|
@ -48,8 +48,14 @@ void RandomVec(const int n, T* a) {
|
|
|
|
|
|
|
|
|
|
// Iteration count for every timing loop in this file; each measured elapsed
// time is divided by this value so the logged figure is a per-call average.
constexpr int repeat = 20000;
|
|
|
|
|
|
|
|
|
|
// Reference (scalar) element-wise multiply: z[i] = x[i] * y[i] for i in [0, n).
//
// Serves as the baseline that the other vmul implementations in this file are
// timed and checked against, so it is intentionally a plain loop.
//
// n: number of elements to process; a value <= 0 leaves z untouched.
// x, y: input arrays, each readable for at least n floats.
// z: output array, writable for at least n floats.
void vmul_ref(const int n, const float* x, const float* y, float* z) {
  for (int i = 0; i < n; ++i) {
    z[i] = x[i] * y[i];
  }
}
|
|
|
|
|
|
|
|
|
|
#if defined __AVX__ || defined __AVX2__
|
|
|
|
|
void vmul_intri(const int n, const float* x, const float* y, float* z) {
|
|
|
|
|
void vmul_intri8(const int n, const float* x, const float* y, float* z) {
|
|
|
|
|
__m256 tmpx, tmpy;
|
|
|
|
|
tmpx = _mm256_loadu_ps(x);
|
|
|
|
|
tmpy = _mm256_loadu_ps(y);
|
|
|
|
@ -58,15 +64,15 @@ void vmul_intri(const int n, const float* x, const float* y, float* z) {
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
void vmul_ref(const int n, const float* x, const float* y, float* z) {
|
|
|
|
|
for (int i = 0; i < n; ++i) {
|
|
|
|
|
z[i] = x[i] * y[i];
|
|
|
|
|
}
|
|
|
|
|
#ifdef PADDLE_WITH_MKLML
|
|
|
|
|
// Thin wrapper that forwards an element-wise float multiply to the
// dynamically loaded vsMul routine, giving it the same signature as the other
// vmul_* helpers in this file so it can be timed the same way.
//
// n: number of elements to process.
// x, y: input arrays passed straight through to vsMul.
// z: output array passed straight through to vsMul.
void vmul_mkl(const int n, const float* x, const float* y, float* z) {
  paddle::platform::dynload::vsMul(n, x, y, z);
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
TEST(JitKernel, vmul) {
|
|
|
|
|
namespace jit = paddle::operators::math::jitkernel;
|
|
|
|
|
for (int d : {7, 8, 15, 16, 30, 256}) {
|
|
|
|
|
for (int d : {7, 8, 15, 16, 30, 256, 512}) {
|
|
|
|
|
std::vector<float> x(d), y(d);
|
|
|
|
|
std::vector<float> zref(d), ztgt(d);
|
|
|
|
|
RandomVec<float>(d, x.data());
|
|
|
|
@ -79,41 +85,44 @@ TEST(JitKernel, vmul) {
|
|
|
|
|
float* ztgt_data = ztgt.data();
|
|
|
|
|
float* zref_data = zref.data();
|
|
|
|
|
|
|
|
|
|
#ifdef PADDLE_WITH_MKLML
|
|
|
|
|
auto s0 = GetCurrentUS();
|
|
|
|
|
auto trefs = GetCurrentUS();
|
|
|
|
|
for (int i = 0; i < repeat; ++i) {
|
|
|
|
|
paddle::platform::dynload::vsMul(d, x_data, y_data, zref_data);
|
|
|
|
|
vmul_ref(d, x_data, y_data, zref_data);
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
auto trefe = GetCurrentUS();
|
|
|
|
|
|
|
|
|
|
auto st = GetCurrentUS();
|
|
|
|
|
for (int i = 0; i < repeat; ++i) {
|
|
|
|
|
ker->Compute(d, x_data, y_data, ztgt_data);
|
|
|
|
|
}
|
|
|
|
|
auto mt = GetCurrentUS();
|
|
|
|
|
#ifdef PADDLE_WITH_MKLML
|
|
|
|
|
auto tmkls = GetCurrentUS();
|
|
|
|
|
for (int i = 0; i < repeat; ++i) {
|
|
|
|
|
vmul_ref(d, x_data, y_data, zref_data);
|
|
|
|
|
vmul_mkl(d, x_data, y_data, zref_data);
|
|
|
|
|
}
|
|
|
|
|
auto et = GetCurrentUS();
|
|
|
|
|
auto tmkle = GetCurrentUS();
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined __AVX__ || defined __AVX2__
|
|
|
|
|
if (d == 8) {
|
|
|
|
|
auto si0 = GetCurrentUS();
|
|
|
|
|
for (int i = 0; i < repeat; ++i) {
|
|
|
|
|
vmul_intri(d, x_data, y_data, zref_data);
|
|
|
|
|
vmul_intri8(d, x_data, y_data, zref_data);
|
|
|
|
|
}
|
|
|
|
|
auto si1 = GetCurrentUS();
|
|
|
|
|
VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
VLOG(3) << "Vec size " << d << ": refer takes: " << (et - mt) / repeat
|
|
|
|
|
<< " us, tgt takes: " << (mt - st) / repeat
|
|
|
|
|
auto ttgts = GetCurrentUS();
|
|
|
|
|
for (int i = 0; i < repeat; ++i) {
|
|
|
|
|
ker->Compute(d, x_data, y_data, ztgt_data);
|
|
|
|
|
}
|
|
|
|
|
auto ttgte = GetCurrentUS();
|
|
|
|
|
|
|
|
|
|
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
|
|
|
|
|
#ifdef PADDLE_WITH_MKLML
|
|
|
|
|
<< " us, mkl takes: " << (st - s0) / repeat << " us";
|
|
|
|
|
<< " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
|
|
|
|
|
#else
|
|
|
|
|
<< " us";
|
|
|
|
|
<< " us, "
|
|
|
|
|
#endif
|
|
|
|
|
<< "tgt takes: " << (ttgte - ttgts) / repeat;
|
|
|
|
|
for (int i = 0; i < d; ++i) {
|
|
|
|
|
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
|
|
|
|
|
}
|
|
|
|
|