cache fc kernel

test=develop
revert-15296-async_double_buffered_py_reader
tensor-tang 6 years ago
parent 6e1ee7fb57
commit a18c0d4242

@@ -30,15 +30,17 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
       return;
     }
     if (relu) {
-      auto compute =
-          jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(N);
+      auto compute = jit::KernelFuncs<jit::kVAddRelu, jit::XYZNTuples<T>,
+                                      platform::CPUPlace>::Cache()
+                         .At(N);
       for (int i = 0; i < M; i++) {
         T* dst = Y + i * N;
         compute(B, dst, dst, N);
       }
     } else {
-      auto compute =
-          jit::Get<jit::kVAdd, jit::XYZNTuples<T>, platform::CPUPlace>(N);
+      auto compute = jit::KernelFuncs<jit::kVAdd, jit::XYZNTuples<T>,
+                                      platform::CPUPlace>::Cache()
+                         .At(N);
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif

Loading…
Cancel
Save