Merge pull request #14080 from tensor-tang/refine/jit/crf2

Refine/jit/crf decoding
7 years ago · 3c957af139
parent b3b329255f 64d5b4385e
commit 3c957af139
5 changed files with 310 additions and 219 deletions
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@ -301,6 +301,7 @@ op_library(flatten_op DEPS reshape_op)
 op_library(sequence_pad_op DEPS sequence_padding)
 op_library(unstack_op DEPS stack_op)
 op_library(fake_quantize_op DEPS memory)
 op_library(crf_decoding_op DEPS jit_kernel)
 op_library(fusion_lstm_op DEPS jit_kernel)
 if (WITH_GPU)
    op_library(conv_op DEPS vol2col depthwise_conv im2col)
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@ -16,6 +16,7 @@ limitations under the License. */
 #include <limits>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/jit_kernel.h"
 #include "paddle/fluid/operators/math/math_function.h"
 namespace paddle {
@ -69,9 +70,6 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
    auto emission_dims = emission_weights.dims();
    const size_t seq_len = emission_dims[0];
    const size_t tag_num = emission_dims[1];
    const size_t state_trans_base_idx = 2;
    const T* x = emission_weights.data<T>();
    const T* w = transition_weights.data<T>();
    int64_t* path = decoded_path->data<int64_t>();
@ -84,221 +82,10 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
    Tensor track;
    int* track_value =
        track.mutable_data<int>(emission_dims, platform::CPUPlace());
-
+    const auto& ker = math::jitkernel::KernelPool::Instance()
-#ifdef __AVX__
+                          .template Get<math::jitkernel::CRFDecodeKernel<T>>(
-// It use the AVX or AVX512 instruction to deal the data as the vector of 8 or
+                              static_cast<int>(tag_num));
-// 16 elements per iteration. Then it can implement the parallel processing.
+    ker->Compute(static_cast<int>(seq_len), x, w, alpha_value, track_value);
 // Only optimize for float type.
 #ifdef __AVX512F__
    size_t step_size = 16;
 #else
    size_t step_size = 8;
 #endif
    if (std::is_same<T, float>::value && (tag_num >= step_size)) {
      size_t steps = tag_num / step_size;
      size_t remain = tag_num % step_size;
      int last_offset = static_cast<int>(remain) - static_cast<int>(step_size);
      // Setup the alpha initial value.
      size_t i_offset = 0;
      for (size_t i = 0; i <= steps; ++i) {
 #ifdef __AVX512F__
        // Declare the variable for the content of weights, input and alpha
        // values.
        __m512 w_content, x_content, alpha_content;
        // Load the relevant data into the variables from un-aligned address.
        w_content = _mm512_loadu_ps((const float*)(w + i_offset));
        x_content = _mm512_loadu_ps((const float*)(x + i_offset));
        alpha_content = _mm512_add_ps(w_content, x_content);
        // Save the alpha value.
        _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
                         alpha_content);
 #else
        // Declare the variable for the content of weights, input and alpha
        // values.
        __m256 w_content, x_content, alpha_content;
        // Load the relevant data into the variables from un-aligned address.
        w_content = _mm256_loadu_ps((const float*)(w + i_offset));
        x_content = _mm256_loadu_ps((const float*)(x + i_offset));
        alpha_content = _mm256_add_ps(w_content, x_content);
        // Save the alpha value.
        _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
                         alpha_content);
 #endif
        i_offset += step_size;
        if (i == steps - 1) {
          if (remain > 0) {
            i_offset += last_offset;
          } else {
            break;
          }
        }
      }
      // Use the column-major strategy to get the location of maximum score.
      size_t seq_offset = 0;
      for (size_t k = 1; k < seq_len; ++k) {
        size_t j_offset = 0;
        for (size_t j = 0; j <= steps; ++j) {
 #ifdef __AVX512F__
          // Initialize the variables of maximum score and location.
          __m512 max_score = _mm512_set1_ps(-std::numeric_limits<T>::max());
          __m512i max_j = _mm512_setzero_si512();
 #else
          // Initialize the variables of maximum score and location.
          __m256 max_score = _mm256_set1_ps(-std::numeric_limits<T>::max());
          __m256i max_j = _mm256_set1_epi32(0);
 #endif
          // Calculate the offset of transition_weights.
          size_t trans_offset = state_trans_base_idx * tag_num + j_offset;
          for (size_t i = 0; i < tag_num; ++i) {
 #ifdef __AVX512F__
            // Initalize the content of alpha variable with related offset.
            __m512 alpha_content =
                _mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i));
            // Obtain the content of weights from un-aligned address.
            __m512 w_content =
                _mm512_loadu_ps((const float*)(w + trans_offset));
            __m512 score_v = _mm512_add_ps(alpha_content, w_content);
            __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
            // According to the mask value, it update the index of the max_score
            // location.
            max_j = _mm512_mask_set1_epi32(max_j, mask, i);
            // Update the max_score value.
            max_score = _mm512_max_ps(max_score, score_v);
 #else
            // Initalize the content of alpha variable with related offset.
            __m256 alpha_content = _mm256_broadcast_ss(
                (const float*)(alpha_value + seq_offset + i));
            // Obtain the content of weights from un-aligned address.
            __m256 w_content =
                _mm256_loadu_ps((const float*)(w + trans_offset));
            __m256 score_v = _mm256_add_ps(alpha_content, w_content);
            __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
 #ifdef __AVX2__
            // According to the mask value, it update the index of the max_score
            // location.
            max_j = _mm256_or_si256(
                _mm256_andnot_si256((__m256i)mask, max_j),
                _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
 #else
            __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
            __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
            __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);
            __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);
            lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
            hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
            lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
            hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
            lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
            hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
            // According to the mask value, it update the index of the max_score
            // location.
            max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
            max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
 #endif
            // Update the max_score value.
            max_score = _mm256_max_ps(max_score, score_v);
 #endif
            trans_offset += tag_num;
          }
 #ifdef __AVX512F__
          // Update the alpha and track values.
          __m512 x_content = _mm512_loadu_ps(
              (const float*)(x + seq_offset + tag_num + j_offset));
          max_score = _mm512_add_ps(max_score, x_content);
          _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
                                                    tag_num + j_offset),
                           max_score);
          _mm512_storeu_si512(
              reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num +
                                         j_offset),
              max_j);
 #else
          // Update the alpha and track values.
          __m256 x_content = _mm256_loadu_ps(
              (const float*)(x + seq_offset + tag_num + j_offset));
          max_score = _mm256_add_ps(max_score, x_content);
          _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
                                                    tag_num + j_offset),
                           max_score);
          _mm256_storeu_si256(
              reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num +
                                         j_offset),
              max_j);
 #endif
          // Calculate the offset of next step
          j_offset += step_size;
          if (j == steps - 1) {
            if (remain > 0) {
              j_offset += last_offset;
            } else {
              break;
            }
          }
        }
        seq_offset += tag_num;
      }
    } else {
      for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
      for (size_t k = 1; k < seq_len; ++k) {
        for (size_t i = 0; i < tag_num; ++i) {
          T max_score = -std::numeric_limits<T>::max();
          int max_j = 0;
          for (size_t j = 0; j < tag_num; ++j) {
            T score = alpha_value[(k - 1) * tag_num + j] +
                      w[(j + state_trans_base_idx) * tag_num + i];
            if (score > max_score) {
              max_score = score;
              max_j = j;
            }
          }
          alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
          track_value[k * tag_num + i] = max_j;
        }
      }
    }
 #else
    for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
    for (size_t k = 1; k < seq_len; ++k) {
      for (size_t i = 0; i < tag_num; ++i) {
        T max_score = -std::numeric_limits<T>::max();
        int max_j = 0;
        for (size_t j = 0; j < tag_num; ++j) {
          T score = alpha_value[(k - 1) * tag_num + j] +
                    w[(j + state_trans_base_idx) * tag_num + i];
          if (score > max_score) {
            max_score = score;
            max_j = j;
          }
        }
        alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
        track_value[k * tag_num + i] = max_j;
      }
    }
 #endif
    T max_score = -std::numeric_limits<T>::max();
    int max_i = 0;
    for (size_t i = 0; i < tag_num; ++i) {
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@ -76,6 +76,6 @@ endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
 cc_library(jit_kernel 
-    SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc
+    SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
    DEPS cpu_info cblas)
 cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
@ -151,6 +151,13 @@ class GRUKernel : public Kernel {
  virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0;
 };
 template <typename T>
 class CRFDecodeKernel : public Kernel {
 public:
  virtual void Compute(const int seq_len, const T *x, const T *w, T *alpha,
                       int *track) const = 0;
 };
 }  // namespace jitkernel
 }  // namespace math
 }  // namespace operators
--- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc