Merge pull request #14080 from tensor-tang/refine/jit/crf2

Refine/jit/crf decoding
fix_recordio_link
tensor-tang 6 years ago committed by GitHub
commit 3c957af139
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -301,6 +301,7 @@ op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op)
op_library(fake_quantize_op DEPS memory)
op_library(crf_decoding_op DEPS jit_kernel)
op_library(fusion_lstm_op DEPS jit_kernel)
if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col)

@ -16,6 +16,7 @@ limitations under the License. */
#include <limits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
@ -69,9 +70,6 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
auto emission_dims = emission_weights.dims();
const size_t seq_len = emission_dims[0];
const size_t tag_num = emission_dims[1];
const size_t state_trans_base_idx = 2;
const T* x = emission_weights.data<T>();
const T* w = transition_weights.data<T>();
int64_t* path = decoded_path->data<int64_t>();
@ -84,221 +82,10 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
Tensor track;
int* track_value =
track.mutable_data<int>(emission_dims, platform::CPUPlace());
#ifdef __AVX__
// It use the AVX or AVX512 instruction to deal the data as the vector of 8 or
// 16 elements per iteration. Then it can implement the parallel processing.
// Only optimize for float type.
#ifdef __AVX512F__
size_t step_size = 16;
#else
size_t step_size = 8;
#endif
if (std::is_same<T, float>::value && (tag_num >= step_size)) {
size_t steps = tag_num / step_size;
size_t remain = tag_num % step_size;
int last_offset = static_cast<int>(remain) - static_cast<int>(step_size);
// Setup the alpha initial value.
size_t i_offset = 0;
for (size_t i = 0; i <= steps; ++i) {
#ifdef __AVX512F__
// Declare the variable for the content of weights, input and alpha
// values.
__m512 w_content, x_content, alpha_content;
// Load the relevant data into the variables from un-aligned address.
w_content = _mm512_loadu_ps((const float*)(w + i_offset));
x_content = _mm512_loadu_ps((const float*)(x + i_offset));
alpha_content = _mm512_add_ps(w_content, x_content);
// Save the alpha value.
_mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
alpha_content);
#else
// Declare the variable for the content of weights, input and alpha
// values.
__m256 w_content, x_content, alpha_content;
// Load the relevant data into the variables from un-aligned address.
w_content = _mm256_loadu_ps((const float*)(w + i_offset));
x_content = _mm256_loadu_ps((const float*)(x + i_offset));
alpha_content = _mm256_add_ps(w_content, x_content);
// Save the alpha value.
_mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
alpha_content);
#endif
i_offset += step_size;
if (i == steps - 1) {
if (remain > 0) {
i_offset += last_offset;
} else {
break;
}
}
}
// Use the column-major strategy to get the location of maximum score.
size_t seq_offset = 0;
for (size_t k = 1; k < seq_len; ++k) {
size_t j_offset = 0;
for (size_t j = 0; j <= steps; ++j) {
#ifdef __AVX512F__
// Initialize the variables of maximum score and location.
__m512 max_score = _mm512_set1_ps(-std::numeric_limits<T>::max());
__m512i max_j = _mm512_setzero_si512();
#else
// Initialize the variables of maximum score and location.
__m256 max_score = _mm256_set1_ps(-std::numeric_limits<T>::max());
__m256i max_j = _mm256_set1_epi32(0);
#endif
// Calculate the offset of transition_weights.
size_t trans_offset = state_trans_base_idx * tag_num + j_offset;
for (size_t i = 0; i < tag_num; ++i) {
#ifdef __AVX512F__
// Initalize the content of alpha variable with related offset.
__m512 alpha_content =
_mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i));
// Obtain the content of weights from un-aligned address.
__m512 w_content =
_mm512_loadu_ps((const float*)(w + trans_offset));
__m512 score_v = _mm512_add_ps(alpha_content, w_content);
__mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
// According to the mask value, it update the index of the max_score
// location.
max_j = _mm512_mask_set1_epi32(max_j, mask, i);
// Update the max_score value.
max_score = _mm512_max_ps(max_score, score_v);
#else
// Initalize the content of alpha variable with related offset.
__m256 alpha_content = _mm256_broadcast_ss(
(const float*)(alpha_value + seq_offset + i));
// Obtain the content of weights from un-aligned address.
__m256 w_content =
_mm256_loadu_ps((const float*)(w + trans_offset));
__m256 score_v = _mm256_add_ps(alpha_content, w_content);
__m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
#ifdef __AVX2__
// According to the mask value, it update the index of the max_score
// location.
max_j = _mm256_or_si256(
_mm256_andnot_si256((__m256i)mask, max_j),
_mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
#else
__m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
__m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
__m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);
__m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);
lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
// According to the mask value, it update the index of the max_score
// location.
max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
#endif
// Update the max_score value.
max_score = _mm256_max_ps(max_score, score_v);
#endif
trans_offset += tag_num;
}
#ifdef __AVX512F__
// Update the alpha and track values.
__m512 x_content = _mm512_loadu_ps(
(const float*)(x + seq_offset + tag_num + j_offset));
max_score = _mm512_add_ps(max_score, x_content);
_mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
tag_num + j_offset),
max_score);
_mm512_storeu_si512(
reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num +
j_offset),
max_j);
#else
// Update the alpha and track values.
__m256 x_content = _mm256_loadu_ps(
(const float*)(x + seq_offset + tag_num + j_offset));
max_score = _mm256_add_ps(max_score, x_content);
_mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
tag_num + j_offset),
max_score);
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num +
j_offset),
max_j);
#endif
// Calculate the offset of next step
j_offset += step_size;
if (j == steps - 1) {
if (remain > 0) {
j_offset += last_offset;
} else {
break;
}
}
}
seq_offset += tag_num;
}
} else {
for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
for (size_t k = 1; k < seq_len; ++k) {
for (size_t i = 0; i < tag_num; ++i) {
T max_score = -std::numeric_limits<T>::max();
int max_j = 0;
for (size_t j = 0; j < tag_num; ++j) {
T score = alpha_value[(k - 1) * tag_num + j] +
w[(j + state_trans_base_idx) * tag_num + i];
if (score > max_score) {
max_score = score;
max_j = j;
}
}
alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
track_value[k * tag_num + i] = max_j;
}
}
}
#else
for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
for (size_t k = 1; k < seq_len; ++k) {
for (size_t i = 0; i < tag_num; ++i) {
T max_score = -std::numeric_limits<T>::max();
int max_j = 0;
for (size_t j = 0; j < tag_num; ++j) {
T score = alpha_value[(k - 1) * tag_num + j] +
w[(j + state_trans_base_idx) * tag_num + i];
if (score > max_score) {
max_score = score;
max_j = j;
}
}
alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
track_value[k * tag_num + i] = max_j;
}
}
#endif
const auto& ker = math::jitkernel::KernelPool::Instance()
.template Get<math::jitkernel::CRFDecodeKernel<T>>(
static_cast<int>(tag_num));
ker->Compute(static_cast<int>(seq_len), x, w, alpha_value, track_value);
T max_score = -std::numeric_limits<T>::max();
int max_i = 0;
for (size_t i = 0; i < tag_num; ++i) {

@ -76,6 +76,6 @@ endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
cc_library(jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
DEPS cpu_info cblas)
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)

@ -151,6 +151,13 @@ class GRUKernel : public Kernel {
virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0;
};
template <typename T>
class CRFDecodeKernel : public Kernel {
public:
virtual void Compute(const int seq_len, const T *x, const T *w, T *alpha,
int *track) const = 0;
};
} // namespace jitkernel
} // namespace math
} // namespace operators

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save