|
|
@ -16,6 +16,7 @@ limitations under the License. */
|
|
|
|
#include <limits>
|
|
|
|
#include <limits>
|
|
|
|
#include "paddle/fluid/framework/eigen.h"
|
|
|
|
#include "paddle/fluid/framework/eigen.h"
|
|
|
|
#include "paddle/fluid/framework/op_registry.h"
|
|
|
|
#include "paddle/fluid/framework/op_registry.h"
|
|
|
|
|
|
|
|
#include "paddle/fluid/operators/math/jit_kernel.h"
|
|
|
|
#include "paddle/fluid/operators/math/math_function.h"
|
|
|
|
#include "paddle/fluid/operators/math/math_function.h"
|
|
|
|
|
|
|
|
|
|
|
|
namespace paddle {
|
|
|
|
namespace paddle {
|
|
|
@ -69,9 +70,6 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
|
|
|
|
auto emission_dims = emission_weights.dims();
|
|
|
|
auto emission_dims = emission_weights.dims();
|
|
|
|
const size_t seq_len = emission_dims[0];
|
|
|
|
const size_t seq_len = emission_dims[0];
|
|
|
|
const size_t tag_num = emission_dims[1];
|
|
|
|
const size_t tag_num = emission_dims[1];
|
|
|
|
|
|
|
|
|
|
|
|
const size_t state_trans_base_idx = 2;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const T* x = emission_weights.data<T>();
|
|
|
|
const T* x = emission_weights.data<T>();
|
|
|
|
const T* w = transition_weights.data<T>();
|
|
|
|
const T* w = transition_weights.data<T>();
|
|
|
|
int64_t* path = decoded_path->data<int64_t>();
|
|
|
|
int64_t* path = decoded_path->data<int64_t>();
|
|
|
@ -84,221 +82,10 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
|
|
|
|
Tensor track;
|
|
|
|
Tensor track;
|
|
|
|
int* track_value =
|
|
|
|
int* track_value =
|
|
|
|
track.mutable_data<int>(emission_dims, platform::CPUPlace());
|
|
|
|
track.mutable_data<int>(emission_dims, platform::CPUPlace());
|
|
|
|
|
|
|
|
const auto& ker = math::jitkernel::KernelPool::Instance()
|
|
|
|
#ifdef __AVX__
|
|
|
|
.template Get<math::jitkernel::CRFDecodeKernel<T>>(
|
|
|
|
// It use the AVX or AVX512 instruction to deal the data as the vector of 8 or
|
|
|
|
static_cast<int>(tag_num));
|
|
|
|
// 16 elements per iteration. Then it can implement the parallel processing.
|
|
|
|
ker->Compute(static_cast<int>(seq_len), x, w, alpha_value, track_value);
|
|
|
|
// Only optimize for float type.
|
|
|
|
|
|
|
|
#ifdef __AVX512F__
|
|
|
|
|
|
|
|
size_t step_size = 16;
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
size_t step_size = 8;
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
if (std::is_same<T, float>::value && (tag_num >= step_size)) {
|
|
|
|
|
|
|
|
size_t steps = tag_num / step_size;
|
|
|
|
|
|
|
|
size_t remain = tag_num % step_size;
|
|
|
|
|
|
|
|
int last_offset = static_cast<int>(remain) - static_cast<int>(step_size);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Setup the alpha initial value.
|
|
|
|
|
|
|
|
size_t i_offset = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i <= steps; ++i) {
|
|
|
|
|
|
|
|
#ifdef __AVX512F__
|
|
|
|
|
|
|
|
// Declare the variable for the content of weights, input and alpha
|
|
|
|
|
|
|
|
// values.
|
|
|
|
|
|
|
|
__m512 w_content, x_content, alpha_content;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Load the relevant data into the variables from un-aligned address.
|
|
|
|
|
|
|
|
w_content = _mm512_loadu_ps((const float*)(w + i_offset));
|
|
|
|
|
|
|
|
x_content = _mm512_loadu_ps((const float*)(x + i_offset));
|
|
|
|
|
|
|
|
alpha_content = _mm512_add_ps(w_content, x_content);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Save the alpha value.
|
|
|
|
|
|
|
|
_mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
|
|
|
|
|
|
|
|
alpha_content);
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
// Declare the variable for the content of weights, input and alpha
|
|
|
|
|
|
|
|
// values.
|
|
|
|
|
|
|
|
__m256 w_content, x_content, alpha_content;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Load the relevant data into the variables from un-aligned address.
|
|
|
|
|
|
|
|
w_content = _mm256_loadu_ps((const float*)(w + i_offset));
|
|
|
|
|
|
|
|
x_content = _mm256_loadu_ps((const float*)(x + i_offset));
|
|
|
|
|
|
|
|
alpha_content = _mm256_add_ps(w_content, x_content);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Save the alpha value.
|
|
|
|
|
|
|
|
_mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
|
|
|
|
|
|
|
|
alpha_content);
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
i_offset += step_size;
|
|
|
|
|
|
|
|
if (i == steps - 1) {
|
|
|
|
|
|
|
|
if (remain > 0) {
|
|
|
|
|
|
|
|
i_offset += last_offset;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Use the column-major strategy to get the location of maximum score.
|
|
|
|
|
|
|
|
size_t seq_offset = 0;
|
|
|
|
|
|
|
|
for (size_t k = 1; k < seq_len; ++k) {
|
|
|
|
|
|
|
|
size_t j_offset = 0;
|
|
|
|
|
|
|
|
for (size_t j = 0; j <= steps; ++j) {
|
|
|
|
|
|
|
|
#ifdef __AVX512F__
|
|
|
|
|
|
|
|
// Initialize the variables of maximum score and location.
|
|
|
|
|
|
|
|
__m512 max_score = _mm512_set1_ps(-std::numeric_limits<T>::max());
|
|
|
|
|
|
|
|
__m512i max_j = _mm512_setzero_si512();
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
// Initialize the variables of maximum score and location.
|
|
|
|
|
|
|
|
__m256 max_score = _mm256_set1_ps(-std::numeric_limits<T>::max());
|
|
|
|
|
|
|
|
__m256i max_j = _mm256_set1_epi32(0);
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// Calculate the offset of transition_weights.
|
|
|
|
|
|
|
|
size_t trans_offset = state_trans_base_idx * tag_num + j_offset;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < tag_num; ++i) {
|
|
|
|
|
|
|
|
#ifdef __AVX512F__
|
|
|
|
|
|
|
|
// Initalize the content of alpha variable with related offset.
|
|
|
|
|
|
|
|
__m512 alpha_content =
|
|
|
|
|
|
|
|
_mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i));
|
|
|
|
|
|
|
|
// Obtain the content of weights from un-aligned address.
|
|
|
|
|
|
|
|
__m512 w_content =
|
|
|
|
|
|
|
|
_mm512_loadu_ps((const float*)(w + trans_offset));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__m512 score_v = _mm512_add_ps(alpha_content, w_content);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// According to the mask value, it update the index of the max_score
|
|
|
|
|
|
|
|
// location.
|
|
|
|
|
|
|
|
max_j = _mm512_mask_set1_epi32(max_j, mask, i);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Update the max_score value.
|
|
|
|
|
|
|
|
max_score = _mm512_max_ps(max_score, score_v);
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
// Initalize the content of alpha variable with related offset.
|
|
|
|
|
|
|
|
__m256 alpha_content = _mm256_broadcast_ss(
|
|
|
|
|
|
|
|
(const float*)(alpha_value + seq_offset + i));
|
|
|
|
|
|
|
|
// Obtain the content of weights from un-aligned address.
|
|
|
|
|
|
|
|
__m256 w_content =
|
|
|
|
|
|
|
|
_mm256_loadu_ps((const float*)(w + trans_offset));
|
|
|
|
|
|
|
|
__m256 score_v = _mm256_add_ps(alpha_content, w_content);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef __AVX2__
|
|
|
|
|
|
|
|
// According to the mask value, it update the index of the max_score
|
|
|
|
|
|
|
|
// location.
|
|
|
|
|
|
|
|
max_j = _mm256_or_si256(
|
|
|
|
|
|
|
|
_mm256_andnot_si256((__m256i)mask, max_j),
|
|
|
|
|
|
|
|
_mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
__m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
|
|
|
|
|
|
|
|
__m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
|
|
|
|
|
|
|
|
__m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);
|
|
|
|
|
|
|
|
__m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
|
|
|
|
|
|
|
|
hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
|
|
|
|
|
|
|
|
lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
|
|
|
|
|
|
|
|
hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
|
|
|
|
|
|
|
|
hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// According to the mask value, it update the index of the max_score
|
|
|
|
|
|
|
|
// location.
|
|
|
|
|
|
|
|
max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
|
|
|
|
|
|
|
|
max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Update the max_score value.
|
|
|
|
|
|
|
|
max_score = _mm256_max_ps(max_score, score_v);
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
trans_offset += tag_num;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef __AVX512F__
|
|
|
|
|
|
|
|
// Update the alpha and track values.
|
|
|
|
|
|
|
|
__m512 x_content = _mm512_loadu_ps(
|
|
|
|
|
|
|
|
(const float*)(x + seq_offset + tag_num + j_offset));
|
|
|
|
|
|
|
|
max_score = _mm512_add_ps(max_score, x_content);
|
|
|
|
|
|
|
|
_mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
|
|
|
|
|
|
|
|
tag_num + j_offset),
|
|
|
|
|
|
|
|
max_score);
|
|
|
|
|
|
|
|
_mm512_storeu_si512(
|
|
|
|
|
|
|
|
reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num +
|
|
|
|
|
|
|
|
j_offset),
|
|
|
|
|
|
|
|
max_j);
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
// Update the alpha and track values.
|
|
|
|
|
|
|
|
__m256 x_content = _mm256_loadu_ps(
|
|
|
|
|
|
|
|
(const float*)(x + seq_offset + tag_num + j_offset));
|
|
|
|
|
|
|
|
max_score = _mm256_add_ps(max_score, x_content);
|
|
|
|
|
|
|
|
_mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
|
|
|
|
|
|
|
|
tag_num + j_offset),
|
|
|
|
|
|
|
|
max_score);
|
|
|
|
|
|
|
|
_mm256_storeu_si256(
|
|
|
|
|
|
|
|
reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num +
|
|
|
|
|
|
|
|
j_offset),
|
|
|
|
|
|
|
|
max_j);
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Calculate the offset of next step
|
|
|
|
|
|
|
|
j_offset += step_size;
|
|
|
|
|
|
|
|
if (j == steps - 1) {
|
|
|
|
|
|
|
|
if (remain > 0) {
|
|
|
|
|
|
|
|
j_offset += last_offset;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
seq_offset += tag_num;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (size_t k = 1; k < seq_len; ++k) {
|
|
|
|
|
|
|
|
for (size_t i = 0; i < tag_num; ++i) {
|
|
|
|
|
|
|
|
T max_score = -std::numeric_limits<T>::max();
|
|
|
|
|
|
|
|
int max_j = 0;
|
|
|
|
|
|
|
|
for (size_t j = 0; j < tag_num; ++j) {
|
|
|
|
|
|
|
|
T score = alpha_value[(k - 1) * tag_num + j] +
|
|
|
|
|
|
|
|
w[(j + state_trans_base_idx) * tag_num + i];
|
|
|
|
|
|
|
|
if (score > max_score) {
|
|
|
|
|
|
|
|
max_score = score;
|
|
|
|
|
|
|
|
max_j = j;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
|
|
|
|
|
|
|
|
track_value[k * tag_num + i] = max_j;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (size_t k = 1; k < seq_len; ++k) {
|
|
|
|
|
|
|
|
for (size_t i = 0; i < tag_num; ++i) {
|
|
|
|
|
|
|
|
T max_score = -std::numeric_limits<T>::max();
|
|
|
|
|
|
|
|
int max_j = 0;
|
|
|
|
|
|
|
|
for (size_t j = 0; j < tag_num; ++j) {
|
|
|
|
|
|
|
|
T score = alpha_value[(k - 1) * tag_num + j] +
|
|
|
|
|
|
|
|
w[(j + state_trans_base_idx) * tag_num + i];
|
|
|
|
|
|
|
|
if (score > max_score) {
|
|
|
|
|
|
|
|
max_score = score;
|
|
|
|
|
|
|
|
max_j = j;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
|
|
|
|
|
|
|
|
track_value[k * tag_num + i] = max_j;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
T max_score = -std::numeric_limits<T>::max();
|
|
|
|
T max_score = -std::numeric_limits<T>::max();
|
|
|
|
int max_i = 0;
|
|
|
|
int max_i = 0;
|
|
|
|
for (size_t i = 0; i < tag_num; ++i) {
|
|
|
|
for (size_t i = 0; i < tag_num; ++i) {
|
|
|
|