Optimize bilinear interpolate op with OpenMP (#17800)

Refactor the code to be OpenMP friendly test=develop
6 years ago · 7cfddf22c8
parent d6d33fd748
commit 7cfddf22c8
1 changed files with 56 additions and 16 deletions
--- a/paddle/fluid/operators/interpolate_op.h
+++ b/paddle/fluid/operators/interpolate_op.h
@ -11,6 +11,7 @@

 #pragma once
 #include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"

@ -57,7 +58,17 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output,
  auto input_t = EigenTensor<T, 4>::From(input);
  auto output_t = EigenTensor<T, 4>::From(*output);
  bool align_flag = (align_mode == 0 && !align_corners);
-  for (int k = 0; k < out_h; k++) {  // loop for images
+
+  std::vector<int> vy_n, vy_s;
+  std::vector<float> vd_n, vd_s;
+  vy_n.reserve(out_h);
+  vy_s.reserve(out_h);
+  vd_n.reserve(out_h);
+  vd_s.reserve(out_h);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int k = 0; k < out_h; k++) {
    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
                         : static_cast<int>(ratio_h * k);
    y_n = (y_n > 0) ? y_n : 0;
@ -65,7 +76,23 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output,
    float d_n =
        align_flag ? ratio_h * (k + 0.5) - 0.5 - y_n : ratio_h * k - y_n;
    float d_s = 1.f - d_n;
+    {
+      vy_n[k] = y_n;
+      vy_s[k] = y_s;
+      vd_n[k] = d_n;
+      vd_s[k] = d_s;
+    }
+  }

+  std::vector<int> vx_w, vx_e;
+  std::vector<float> vd_w, vd_e;
+  vx_w.reserve(out_w);
+  vx_e.reserve(out_w);
+  vd_w.reserve(out_w);
+  vd_e.reserve(out_w);
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
  for (int l = 0; l < out_w; l++) {
    int x_w = (align_mode == 0 && !align_corners)
                  ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
@ -75,14 +102,27 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output,
    float d_w =
        align_flag ? ratio_w * (l + 0.5) - 0.5 - x_w : ratio_w * l - x_w;
    float d_e = 1.f - d_w;
+    {
+      vx_w[l] = x_w;
+      vx_e[l] = x_e;
+      vd_w[l] = d_w;
+      vd_e[l] = d_e;
+    }
+  }

+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(4)
+#endif
  for (int i = 0; i < n; i++) {          // loop for batches
    for (int j = 0; j < c; j++) {        // loop for channels
+      for (int k = 0; k < out_h; k++) {  // loop for images
+        for (int l = 0; l < out_w; l++) {
          // bilinear interpolation
-          output_t(i, j, k, l) = input_t(i, j, y_n, x_w) * d_s * d_e +
-                                 input_t(i, j, y_s, x_w) * d_n * d_e +
-                                 input_t(i, j, y_n, x_e) * d_s * d_w +
-                                 input_t(i, j, y_s, x_e) * d_n * d_w;
+          T out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] +
+                    input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] +
+                    input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] +
+                    input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l];
+          output_t(i, j, k, l) = out_t;
        }
      }
    }