@@ -14,6 +14,8 @@ limitations under the License. */

 #pragma once
 #include <type_traits>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/gru_compute.h"

@@ -21,6 +23,10 @@ namespace paddle {
 namespace operators {
 namespace math {
 namespace detail {
+using Array1 = Eigen::DSizes<int64_t, 1>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

 #ifndef __NVCC__
@@ -242,23 +248,46 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
 #endif
 }

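+// forward_reset_outputV2: Eigen path for the first GRU stage. With r and u the
+// reset/update gate pre-activations stored in gate_value, and b_r = reset_bias:
+//   r = sigmoid(r), u = sigmoid(u), reset_output = (reset_output + b_r) * r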
+template <typename T>
+inline void forward_reset_outputV2(const platform::CPUDeviceContext &context,
+                                   GRUMetaValue<T> value, int frame_size) {
+  auto &place = *context.eigen_device();
+  auto value_reset_gate =
+      typename EigenVector<T>::Type(value.gate_value, Array1(frame_size));
+  auto value_update_gate = typename EigenVector<T>::Type(
+      value.gate_value + frame_size, Array1(frame_size));
+  auto value_reset_output = typename EigenVector<T>::Type(
+      value.reset_output_value, Array1(frame_size));
+  auto value_reset_bias =
+      typename EigenVector<T>::ConstType(value.reset_bias, Array1(frame_size));
+  SigmoidFunctor<T>()(place, value_reset_gate, value_reset_gate);
+  SigmoidFunctor<T>()(place, value_update_gate, value_update_gate);
+  value_reset_output.device(place) =
+      (value_reset_output + value_reset_bias) * value_reset_gate;
+}
+
 template <class OpResetOutput, typename T>
-inline void forward_reset_output(OpResetOutput op_reset_output,
-                                 GRUMetaValue<T> value, int frame_size,
-                                 int batch_size, ActivationType active_gate,
-                                 bool old_version = true) {
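+// `context` supplies the Eigen CPU device for the old_version == false path;
+// it may remain null when the original (old_version) kernels are used.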
+inline void forward_reset_output(
+    OpResetOutput op_reset_output, GRUMetaValue<T> value, int frame_size,
+    int batch_size, ActivationType active_gate, bool old_version = true,
+    const platform::CPUDeviceContext *context = nullptr) {
   for (int b = 0; b < batch_size; b++) {
-    if (OpResetOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
-        (sizeof(T) == 4)) {
-      hl_avx_gru_forward_reset_output(
-          op_reset_output, value.gate_value, value.reset_output_value,
-          value.prev_out_value, frame_size, active_gate, old_version,
-          value.reset_bias);
+    if (!old_version) {
+      // use eigen
+      forward_reset_outputV2(*context, value, frame_size);
     } else {
-      hl_naive_gru_forward_reset_output(
-          op_reset_output, value.gate_value, value.reset_output_value,
-          value.prev_out_value, frame_size, active_gate, old_version,
-          value.reset_bias);
+      if (OpResetOutput::avx && (frame_size & static_cast<int>(8 - 1)) &&
+          (sizeof(T) == 4)) {
+        hl_avx_gru_forward_reset_output(
+            op_reset_output, value.gate_value, value.reset_output_value,
+            value.prev_out_value, frame_size, active_gate, old_version,
+            value.reset_bias);
+      } else {
+        hl_naive_gru_forward_reset_output(
+            op_reset_output, value.gate_value, value.reset_output_value,
+            value.prev_out_value, frame_size, active_gate, old_version,
+            value.reset_bias);
+      }
     }
     value.gate_value += frame_size * 3;
     value.reset_output_value += frame_size;
@@ -268,25 +297,51 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
   }
 }

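+// forward_final_outputV2: Eigen path for the second GRU stage. With u the
+// activated update gate and c the candidate state stored in gate_value:
+//   c = tanh(c), output = u * prev_out + (1 - u) * c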
+template <typename T>
+inline void forward_final_outputV2(const platform::CPUDeviceContext &context,
+                                   GRUMetaValue<T> value, int frame_size) {
+  auto &place = *context.eigen_device();
+  auto value_update_gate = typename EigenVector<T>::Type(
+      value.gate_value + frame_size, Array1(frame_size));
+  auto value_frame_state = typename EigenVector<T>::Type(
+      value.gate_value + 2 * frame_size, Array1(frame_size));
+  auto value_output =
+      typename EigenVector<T>::Type(value.output_value, Array1(frame_size));
+  TanhFunctor<T>()(place, value_frame_state, value_frame_state);
+  value_output.device(place) =
+      (static_cast<T>(1.0) - value_update_gate) * value_frame_state;
+  if (value.prev_out_value) {
+    auto value_prev_out = typename EigenVector<T>::ConstType(
+        value.prev_out_value, Array1(frame_size));
+    value_output.device(place) =
+        value_output + value_update_gate * value_prev_out;
+  }
+}
+
 template <class OpFinalOutput, typename T>
-inline void forward_final_output(OpFinalOutput op_final_output,
-                                 GRUMetaValue<T> value, int frame_size,
-                                 int batch_size, ActivationType active_node,
-                                 bool origin_mode, bool old_version = true) {
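+// As in forward_reset_output, `context` is only dereferenced when
+// old_version == false.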
+inline void forward_final_output(
+    OpFinalOutput op_final_output, GRUMetaValue<T> value, int frame_size,
+    int batch_size, ActivationType active_node, bool origin_mode,
+    bool old_version = true,
+    const platform::CPUDeviceContext *context = nullptr) {
   for (int b = 0; b < batch_size; b++) {
-    if (OpFinalOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
-        (sizeof(T) == 4)) {
-      hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
-                                      value.prev_out_value, value.output_value,
-                                      frame_size, active_node, origin_mode,
-                                      old_version);
+    if (!old_version) {
+      // eigen
+      forward_final_outputV2(*context, value, frame_size);
     } else {
-      hl_naive_gru_forward_final_output(op_final_output, value.gate_value,
+      if (OpFinalOutput::avx && (frame_size & static_cast<int>(8 - 1)) &&
+          (sizeof(T) == 4)) {
+        hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
                                         value.prev_out_value,
                                         value.output_value, frame_size,
                                         active_node, origin_mode, old_version);
+      } else {
+        hl_naive_gru_forward_final_output(
+            op_final_output, value.gate_value, value.prev_out_value,
+            value.output_value, frame_size, active_node, origin_mode,
+            old_version);
+      }
     }

     value.gate_value += frame_size * 3;
     value.output_value += frame_size;
     if (value.prev_out_value) {
@@ -664,23 +719,70 @@ inline void backward_reset_grad(OpResetGrad op_reset_grad,
   }
 }

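+// gru_backward: Eigen path for the per-step GRU backward. With u, r, c the
+// activated update gate, reset gate and candidate state, dh = grad_output and
+// r_out = reset_output_value, this computes
+//   du     = sigmoid'(u) * (prev_out - c) * dh   (prev_out treated as 0 if absent)
+//   dprev  += u * dh
+//   dc     = tanh'(c) * (1 - u) * dh
+//   dr     = sigmoid'(r) * (r_out / r) * dc
+//   dr_out = r * dc   (only when both prev_out and its gradient exist)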
+template <typename T>
+inline void gru_backward(const platform::CPUDeviceContext &context,
+                         GRUMetaValue<T> value, GRUMetaGrad<T> grad,
+                         int frame_size) {
+  auto &place = *context.eigen_device();
+
+  auto value_reset_gate =
+      typename EigenVector<T>::Type(value.gate_value, Array1(frame_size));
+  auto grad_reset_gate =
+      typename EigenVector<T>::Type(grad.gate_grad, Array1(frame_size));
+  auto value_update_gate = typename EigenVector<T>::Type(
+      value.gate_value + frame_size, Array1(frame_size));
+  auto grad_update_gate = typename EigenVector<T>::Type(
+      grad.gate_grad + frame_size, Array1(frame_size));
+  auto value_frame_state = typename EigenVector<T>::Type(
+      value.gate_value + frame_size * 2, Array1(frame_size));
+  auto grad_frame_state = typename EigenVector<T>::Type(
+      grad.gate_grad + frame_size * 2, Array1(frame_size));
+
+  auto grad_output =
+      typename EigenVector<T>::Type(grad.output_grad, Array1(frame_size));
+  auto value_reset_output = typename EigenVector<T>::Type(
+      value.reset_output_value, Array1(frame_size));
+  auto grad_reset_output =
+      typename EigenVector<T>::Type(grad.reset_output_grad, Array1(frame_size));
+
+  if (value.prev_out_value) {
+    auto value_prev_out = typename EigenVector<T>::ConstType(
+        value.prev_out_value, Array1(frame_size));
+    SigmoidGradFunctor<T>()(place, 1 /*useless*/, value_update_gate,
+                            (value_prev_out - value_frame_state) * grad_output,
+                            grad_update_gate);
+  } else {
+    SigmoidGradFunctor<T>()(
+        place, 1 /*useless*/, value_update_gate,
+        static_cast<T>(-1) * value_frame_state * grad_output, grad_update_gate);
+  }
+  if (grad.prev_out_grad) {
+    auto grad_prev_out =
+        typename EigenVector<T>::Type(grad.prev_out_grad, Array1(frame_size));
+    grad_prev_out.device(place) =
+        grad_prev_out + grad_output * value_update_gate;
+  }
+  TanhGradFunctor<T>()(place, 1 /*useless*/, value_frame_state,
+                       grad_output * (static_cast<T>(1.0) - value_update_gate),
+                       grad_frame_state);
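+  // r comes from a sigmoid, so it is strictly positive; dividing reset_output
+  // by r recovers the pre-gating value (reset_output + reset_bias).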
+  SigmoidGradFunctor<T>()(
+      place, 1 /*useless*/, value_reset_gate,
+      value_reset_output / value_reset_gate * grad_frame_state,
+      grad_reset_gate);
+  if (value.prev_out_value && grad.prev_out_grad) {
+    grad_reset_output.device(place) = value_reset_gate * grad_frame_state;
+  }
+}
+
 template <class OpGruGrad, typename T>
-inline void cpu_gru_backward(OpGruGrad op_gru_grad, GRUMetaValue<T> value,
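+// Runs the Eigen-based gru_backward above for every batch row; `context`
+// supplies the Eigen CPU device.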
+inline void cpu_gru_backward(const platform::CPUDeviceContext &context,
+                             OpGruGrad op_gru_grad, GRUMetaValue<T> value,
                              GRUMetaGrad<T> grad, int frame_size,
                              int batch_size, ActivationType active_node,
                              ActivationType active_gate) {
   for (int b = 0; b < batch_size; ++b) {
-    if (OpGruGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
-      hl_avx_gru_backward(
-          op_gru_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
-          grad.prev_out_grad, value.reset_output_value, grad.reset_output_grad,
-          grad.output_grad, frame_size, active_node, active_gate);
-    } else {
-      hl_naive_gru_backward(
-          op_gru_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
-          grad.prev_out_grad, value.reset_output_value, grad.reset_output_grad,
-          grad.output_grad, frame_size, active_node, active_gate);
-    }
+    // eigen
+    gru_backward(context, value, grad, frame_size);

     value.gate_value += frame_size * 3;
     value.reset_output_value += frame_size;