Refine JIT vmul to support all sizes

7 years ago · 9255119fd9
parent a9c1824131
commit 9255119fd9
2 changed files with 10 additions and 12 deletions
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -27,11 +27,7 @@ using namespace platform::jit;  // NOLINT
 bool VMulJitCode::init(int d) {
  // It's not necessary to use avx512 since it would slow down the frequency
  // and this kernel is not compute bound.
-  if (MayIUse(avx)) {
-    return d % 2 == 0;
-  } else {
-    return false;
-  }
+  return MayIUse(avx);
 }

 void VMulJitCode::generate() {
@@ -54,16 +50,19 @@ void VMulJitCode::generate() {
    rest -= 4;
  }
  if (rest >= 2) {
-    mov(tmp, qword[param1 + offset]);
-    vmovq(xmm_src1, tmp);
-    mov(tmp, qword[param2 + offset]);
-    vmovq(xmm_src2, tmp);
+    vmovq(xmm_src1, ptr[param1 + offset]);
+    vmovq(xmm_src2, ptr[param2 + offset]);
    vmulps(xmm_dst, xmm_src1, xmm_src2);
-    vmovq(tmp, xmm_dst);
-    mov(ptr[param3 + offset], tmp);
+    vmovq(ptr[param3 + offset], xmm_dst);
    offset += sizeof(float) * 2;
    rest -= 2;
  }
+  if (rest > 0) {
+    vmovss(xmm_src1, ptr[param1 + offset]);
+    vmovss(xmm_src2, ptr[param2 + offset]);
+    vmulss(xmm_dst, xmm_src1, xmm_src2);
+    vmovss(ptr[param3 + offset], xmm_dst);
+  }
  ret();
 }

--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -43,7 +43,6 @@ class VMulJitCode : public JitCode {
  reg64_t param1{abi_param1};
  reg64_t param2{abi_param2};
  reg64_t param3{abi_param3};
-  reg64_t tmp = rax;

  xmm_t xmm_src1 = xmm_t(0);
  xmm_t xmm_src2 = xmm_t(1);