Add LSTM, Simple RNN and GRU CPU kernel (#28577)

* add lstm, simple rnn op kernel

* fix the test_lstm for the rnn op

* change func name

* fix forward postprocess bug

* add gru forward, backward code

* remove unittest.skipIf; use a big rnn op instead of combination op

* fix input doesn't have gradient bug

* add eigen lstm forward, backward

Co-authored-by: wawltor <fangzeyang0904@hotmail.com>
musl/fix_failed_unittests_in_musl
Jack Zhou 5 years ago committed by GitHub
parent 30ef3815b3
commit 9362d85e0e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -30,18 +30,24 @@ namespace detail {
enum ActivationType { enum ActivationType {
kSigmoid, kSigmoid,
KSigmoidV2,
kReLU, kReLU,
kTanh, kTanh,
kTanhV2,
kIdentity, kIdentity,
}; };
inline ActivationType GetActivationType(const std::string &type) { inline ActivationType GetActivationType(const std::string &type) {
if (type == "sigmoid") { if (type == "sigmoid") {
return ActivationType::kSigmoid; return ActivationType::kSigmoid;
} else if (type == "sigmoid_v2") {
return ActivationType::KSigmoidV2;
} else if (type == "relu") { } else if (type == "relu") {
return ActivationType::kReLU; return ActivationType::kReLU;
} else if (type == "tanh") { } else if (type == "tanh") {
return ActivationType::kTanh; return ActivationType::kTanh;
} else if (type == "tanh_v2") {
return ActivationType::kTanhV2;
} else if (type == "identity" || type == "") { } else if (type == "identity" || type == "") {
return ActivationType::kIdentity; return ActivationType::kIdentity;
} }
@ -68,6 +74,14 @@ DEVICE T Sigmoid(const T a) {
return static_cast<T>(1.0) / (static_cast<T>(1.0) + exp(-tmp)); return static_cast<T>(1.0) / (static_cast<T>(1.0) + exp(-tmp));
} }
/*
* Don't limit input in a threshold range.
*/
template <typename T>
DEVICE T SigmoidV2(const T a) {
return static_cast<T>(1.0) / (static_cast<T>(1.0) + exp(-a));
}
template <typename T> template <typename T>
DEVICE T Tanh(const T a) { DEVICE T Tanh(const T a) {
T tmp = -2.0 * a; T tmp = -2.0 * a;
@ -75,6 +89,15 @@ DEVICE T Tanh(const T a) {
return (2.0 / (1.0 + exp(tmp))) - 1.0; return (2.0 / (1.0 + exp(tmp))) - 1.0;
} }
/*
* Don't limit input in a threshold range.
*/
template <typename T>
DEVICE T TanhV2(const T a) {
T tmp = -2.0 * a;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
}
} // namespace forward } // namespace forward
namespace backward { namespace backward {
@ -108,20 +131,24 @@ struct Active {
}; };
static DEVICE Active<float>::Act kActFloat[] = { static DEVICE Active<float>::Act kActFloat[] = {
&forward::Sigmoid<float>, &forward::Relu<float>, &forward::Tanh<float>, &forward::Sigmoid<float>, &forward::SigmoidV2<float>,
&forward::Identity<float>}; &forward::Relu<float>, &forward::Tanh<float>,
&forward::TanhV2<float>, &forward::Identity<float>};
static DEVICE Active<float>::ActGrad kActGradFloat[] = { static DEVICE Active<float>::ActGrad kActGradFloat[] = {
&backward::Sigmoid<float>, &backward::Relu<float>, &backward::Tanh<float>, &backward::Sigmoid<float>, &backward::Sigmoid<float>,
&backward::Identity<float>}; &backward::Relu<float>, &backward::Tanh<float>,
&backward::Tanh<float>, &backward::Identity<float>};
static DEVICE Active<double>::Act kActDouble[] = { static DEVICE Active<double>::Act kActDouble[] = {
&forward::Sigmoid<double>, &forward::Relu<double>, &forward::Tanh<double>, &forward::Sigmoid<double>, &forward::SigmoidV2<double>,
&forward::Identity<double>}; &forward::Relu<double>, &forward::Tanh<double>,
&forward::TanhV2<double>, &forward::Identity<double>};
static DEVICE Active<double>::ActGrad kActGradDouble[] = { static DEVICE Active<double>::ActGrad kActGradDouble[] = {
&backward::Sigmoid<double>, &backward::Relu<double>, &backward::Sigmoid<double>, &backward::Sigmoid<double>,
&backward::Tanh<double>, &backward::Identity<double>}; &backward::Relu<double>, &backward::Tanh<double>,
&backward::Tanh<double>, &backward::Identity<double>};
namespace forward { namespace forward {
inline DEVICE float activation(float a, int index) { inline DEVICE float activation(float a, int index) {
@ -149,7 +176,9 @@ namespace forward {
namespace avx { namespace avx {
__m256 Relu(const __m256 a); __m256 Relu(const __m256 a);
__m256 Sigmoid(const __m256 a); __m256 Sigmoid(const __m256 a);
__m256 SigmoidV2(const __m256 a);
__m256 Tanh(const __m256 a); __m256 Tanh(const __m256 a);
__m256 TanhV2(const __m256 a);
__m256 Identity(const __m256 a); __m256 Identity(const __m256 a);
} // namespace avx } // namespace avx
} // namespace forward } // namespace forward
@ -164,12 +193,12 @@ __m256 Identity(const __m256 a, const __m256 b);
} // namespace backward } // namespace backward
static Active<__m256>::Act kActAvx[] = { static Active<__m256>::Act kActAvx[] = {
&forward::avx::Sigmoid, &forward::avx::Relu, &forward::avx::Tanh, &forward::avx::Sigmoid, &forward::avx::SigmoidV2, &forward::avx::Relu,
&forward::avx::Identity}; &forward::avx::Tanh, &forward::avx::TanhV2, &forward::avx::Identity};
static Active<__m256>::ActGrad kActGradAvx[] = { static Active<__m256>::ActGrad kActGradAvx[] = {
&backward::avx::Sigmoid, &backward::avx::Relu, &backward::avx::Tanh, &backward::avx::Sigmoid, &backward::avx::Sigmoid, &backward::avx::Relu,
&backward::avx::Identity}; &backward::avx::Tanh, &backward::avx::Tanh, &backward::avx::Identity};
namespace forward { namespace forward {
inline __m256 activation(__m256 a, int index) { return kActAvx[index](a); } inline __m256 activation(__m256 a, int index) { return kActAvx[index](a); }

@ -43,6 +43,13 @@ __m256 Sigmoid(const __m256 a) {
return tmp; return tmp;
} }
__m256 SigmoidV2(const __m256 a) {
__m256 tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), a);
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), exp256_ps(tmp));
tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
return tmp;
}
__m256 Tanh(const __m256 a) { __m256 Tanh(const __m256 a) {
__m256 max = _mm256_set1_ps(EXP_MAX_INPUT); __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
__m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
@ -53,6 +60,14 @@ __m256 Tanh(const __m256 a) {
_mm256_set1_ps(1.0f)); _mm256_set1_ps(1.0f));
} }
__m256 TanhV2(const __m256 a) {
__m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
return _mm256_sub_ps(
_mm256_div_ps(_mm256_set1_ps(2.0f),
_mm256_add_ps(_mm256_set1_ps(1.0f), exp256_ps(tmp))),
_mm256_set1_ps(1.0f));
}
__m256 Identity(const __m256 a) { return a; } __m256 Identity(const __m256 a) { return a; }
} // namespace avx } // namespace avx

File diff suppressed because it is too large Load Diff

@ -31,8 +31,8 @@ namespace detail {
template <class OpResetOutput, bool is_batch, typename T> template <class OpResetOutput, bool is_batch, typename T>
__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
T *gate_value, T *reset_output_value, T *gate_value, T *reset_output_value,
T *prev_output_value, int frame_size, const T *prev_output_value,
int batch_size, int frame_size, int batch_size,
ActivationType active_gate) { ActivationType active_gate) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return; if (frame_idx >= frame_size) return;
@ -68,12 +68,10 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
* grid(frame_blocks, batch_blocks) * grid(frame_blocks, batch_blocks)
*/ */
template <class OpFinalOutput, bool is_batch, typename T> template <class OpFinalOutput, bool is_batch, typename T>
__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, __global__ void KeGruForwardFinalOutput(
T *gate_value, T *prev_output_value, OpFinalOutput op_final_output, T *gate_value, const T *prev_output_value,
T *output_value, int frame_size, T *output_value, int frame_size, int batch_size, ActivationType active_node,
int batch_size, bool origin_mode) {
ActivationType active_node,
bool origin_mode) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return; if (frame_idx >= frame_size) return;
int batch_idx = 0; int batch_idx = 0;
@ -106,8 +104,9 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
* grid(frame_blocks, 1) * grid(frame_blocks, 1)
*/ */
template <class T, int Tiled_size> template <class T, int Tiled_size>
__global__ void KeFastCollectiveGruGate(T *gate_value, T *prev_output_value, __global__ void KeFastCollectiveGruGate(T *gate_value,
T *gate_weight, T *reset_output, const T *prev_output_value,
const T *gate_weight, T *reset_output,
int frame_size, int frame_size,
ActivationType active_node) { ActivationType active_node) {
T xt_0 = 0.0f; T xt_0 = 0.0f;
@ -164,10 +163,10 @@ __global__ void KeFastCollectiveGruGate(T *gate_value, T *prev_output_value,
* grid(frame_blocks, 1) * grid(frame_blocks, 1)
*/ */
template <class T, int Tiled_size> template <class T, int Tiled_size>
__global__ void KeFastCollectiveGruOut(T *gate_weight, T *prev_out_value, __global__ void KeFastCollectiveGruOut(const T *gate_weight,
T *output_value, T *gate_value, const T *prev_out_value, T *output_value,
T *reset_value, int frame_size, T *gate_value, T *reset_value,
ActivationType act_node, int frame_size, ActivationType act_node,
bool origin_mode) { bool origin_mode) {
int COL = blockIdx.x * blockDim.x + threadIdx.x; int COL = blockIdx.x * blockDim.x + threadIdx.x;
@ -223,7 +222,7 @@ __global__ void KeFastCollectiveGruOut(T *gate_weight, T *prev_out_value,
*/ */
template <class OpStateGrad, bool is_batch, typename T> template <class OpStateGrad, bool is_batch, typename T>
__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
T *gate_grad, T *prev_out_value, T *gate_grad, const T *prev_out_value,
T *prev_out_grad, T *output_grad, T *prev_out_grad, T *output_grad,
int frame_size, int batch_size, int frame_size, int batch_size,
ActivationType active_node, ActivationType active_node,
@ -272,7 +271,7 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
*/ */
template <class OpResetGrad, bool is_batch, typename T> template <class OpResetGrad, bool is_batch, typename T>
__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
T *gate_grad, T *prev_out_value, T *gate_grad, const T *prev_out_value,
T *prev_out_grad, T *reset_output_grad, T *prev_out_grad, T *reset_output_grad,
int frame_size, int batch_size, int frame_size, int batch_size,
ActivationType active_gate) { ActivationType active_gate) {

@ -30,10 +30,17 @@ class gru_resetOutput {
public: public:
HOSTDEVICE void operator()(T *value_update_gate, T *value_reset_gate, HOSTDEVICE void operator()(T *value_update_gate, T *value_reset_gate,
T *prev_out, T *value_reset_output, T *prev_out, T *value_reset_output,
ActivationType act_gate) { ActivationType act_gate,
T *value_reset_bias = nullptr,
bool old_version = true) {
*value_update_gate = activation(*value_update_gate, act_gate); *value_update_gate = activation(*value_update_gate, act_gate);
*value_reset_gate = activation(*value_reset_gate, act_gate); *value_reset_gate = activation(*value_reset_gate, act_gate);
*value_reset_output = (*prev_out) * (*value_reset_gate); if (old_version) {
*value_reset_output = (*prev_out) * (*value_reset_gate);
} else {
*value_reset_output =
(*value_reset_output + *value_reset_bias) * (*value_reset_gate);
}
} }
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ #ifndef __AVX__
@ -43,10 +50,19 @@ class gru_resetOutput {
HOSTDEVICE void operator()(__m256 *value_update_gate, HOSTDEVICE void operator()(__m256 *value_update_gate,
__m256 *value_reset_gate, __m256 *prev_out, __m256 *value_reset_gate, __m256 *prev_out,
__m256 *value_reset_output, __m256 *value_reset_output,
ActivationType act_gate) { ActivationType act_gate,
__m256 *value_reset_bias = nullptr,
bool old_version = true) {
*value_update_gate = activation(*value_update_gate, act_gate); *value_update_gate = activation(*value_update_gate, act_gate);
*value_reset_gate = activation(*value_reset_gate, act_gate); *value_reset_gate = activation(*value_reset_gate, act_gate);
*value_reset_output = _mm256_mul_ps(*prev_out, *value_reset_gate); if (old_version) {
*value_reset_output = _mm256_mul_ps(*prev_out, *value_reset_gate);
} else {
*value_reset_output =
_mm256_add_ps(*value_reset_output, *value_reset_bias);
*value_reset_output =
_mm256_mul_ps(*value_reset_output, *value_reset_gate);
}
} }
#endif #endif
#endif #endif
@ -192,6 +208,61 @@ class gru_resetGrad {
#endif #endif
#endif #endif
}; };
template <typename T>
class gru {
public:
HOSTDEVICE void operator()(T *value_reset_gate, T *grad_reset_gate,
T *value_update_gate, T *grad_update_gate,
T *value_frame_state, T *grad_frame_state,
T *value_prev_out, T *grad_prev_out,
T *grad_output, T *value_reset_output,
T *grad_reset_output, ActivationType act_node,
ActivationType act_gate) {
*grad_update_gate =
activation((*grad_output) * ((*value_prev_out) - (*value_frame_state)),
(*value_update_gate), act_gate);
*grad_prev_out += (*grad_output * (*value_update_gate));
*grad_frame_state =
activation(*grad_output * (static_cast<T>(1.0) - (*value_update_gate)),
*value_frame_state, act_node);
T reset_output = (*value_reset_output) / (*value_reset_gate);
*grad_reset_gate = activation(reset_output * (*grad_frame_state),
*value_reset_gate, act_gate);
*grad_reset_output = (*value_reset_gate) * (*grad_frame_state);
}
#ifndef __NVCC__
#ifndef __AVX__
static const bool avx = false;
#else
static const bool avx = true;
HOSTDEVICE void operator()(__m256 *value_reset_gate, __m256 *grad_reset_gate,
__m256 *value_update_gate,
__m256 *grad_update_gate,
__m256 *value_frame_state,
__m256 *grad_frame_state, __m256 *value_prev_out,
__m256 *grad_prev_out, __m256 *grad_output,
__m256 *value_reset_output,
__m256 *grad_reset_output, ActivationType act_node,
ActivationType act_gate) {
*grad_update_gate = activation(
_mm256_mul_ps(*grad_output,
_mm256_sub_ps(*value_prev_out, *value_frame_state)),
*value_update_gate, act_gate);
*grad_prev_out = _mm256_add_ps(
*grad_prev_out, _mm256_mul_ps(*grad_output, *value_update_gate));
*grad_frame_state = activation(
_mm256_mul_ps(*grad_output,
_mm256_sub_ps(_mm256_set1_ps(1.0f), *value_update_gate)),
*value_frame_state, act_node);
__m256 reset_output = _mm256_div_ps(*value_reset_output, *value_reset_gate);
*grad_reset_gate =
activation(_mm256_mul_ps(reset_output, *grad_frame_state),
*value_reset_gate, act_gate);
*grad_reset_output = _mm256_mul_ps(*value_reset_gate, *grad_frame_state);
}
#endif
#endif
};
} // namespace backward } // namespace backward

File diff suppressed because it is too large Load Diff

@ -11,6 +11,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/gru_compute.h" #include "paddle/fluid/operators/math/gru_compute.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" #include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/fluid/operators/math/detail/gru_kernel.h"
@ -101,11 +102,64 @@ struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
} }
}; };
template <typename T>
struct GRUUnitFunctorV2<platform::CPUDeviceContext, T> {
static void compute(const platform::CPUDeviceContext &context,
GRUMetaValue<T> value, int frame_size, int batch_size,
const detail::ActivationType active_node,
const detail::ActivationType active_gate) {
#ifndef __NVCC__
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
if (value.prev_out_value) {
blas.GEMM(CblasNoTrans, CblasTrans, batch_size, frame_size, frame_size, 1,
value.prev_out_value, value.state_weight, 0,
value.reset_output_value);
}
detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
frame_size, batch_size, active_gate, false);
T *cell_state_value = value.gate_value + 2 * frame_size;
T *reset_output_value = value.reset_output_value;
for (int b = 0; b < batch_size; ++b) {
blas.VADD(frame_size, cell_state_value, reset_output_value,
cell_state_value);
cell_state_value += frame_size * 3;
reset_output_value += frame_size;
}
detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
frame_size, batch_size, active_node, true,
false);
#endif
}
};
template <typename T>
struct GRUUnitGradFunctorV2<platform::CPUDeviceContext, T> {
static void compute(const platform::CPUDeviceContext &context,
GRUMetaValue<T> value, GRUMetaGrad<T> grad,
int frame_size, int batch_size,
const detail::ActivationType active_node,
const detail::ActivationType active_gate) {
#ifndef __NVCC__
// calculate grad_update_gate, grad_frame_state,
// grad_reset_output, grad_reset_gate
detail::cpu_gru_backward(detail::backward::gru<T>(), value, grad,
frame_size, batch_size, active_node, active_gate);
#endif
}
};
template struct GRUUnitFunctor<platform::CPUDeviceContext, float>; template struct GRUUnitFunctor<platform::CPUDeviceContext, float>;
template struct GRUUnitFunctor<platform::CPUDeviceContext, double>; template struct GRUUnitFunctor<platform::CPUDeviceContext, double>;
template struct GRUUnitGradFunctor<platform::CPUDeviceContext, float>; template struct GRUUnitGradFunctor<platform::CPUDeviceContext, float>;
template struct GRUUnitGradFunctor<platform::CPUDeviceContext, double>; template struct GRUUnitGradFunctor<platform::CPUDeviceContext, double>;
template struct GRUUnitFunctorV2<platform::CPUDeviceContext, float>;
template struct GRUUnitFunctorV2<platform::CPUDeviceContext, double>;
template struct GRUUnitGradFunctorV2<platform::CPUDeviceContext, float>;
template struct GRUUnitGradFunctorV2<platform::CPUDeviceContext, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle

@ -21,12 +21,13 @@ namespace math {
template <typename T> template <typename T>
struct GRUMetaValue { struct GRUMetaValue {
T *gate_weight; const T *gate_weight;
T *state_weight; const T *state_weight;
const T *reset_bias;
T *gate_value; T *gate_value;
T *reset_output_value; T *reset_output_value;
T *output_value; T *output_value;
T *prev_out_value; const T *prev_out_value;
}; };
template <typename T> template <typename T>
@ -37,6 +38,7 @@ struct GRUMetaGrad {
T *reset_output_grad; T *reset_output_grad;
T *output_grad; T *output_grad;
T *prev_out_grad; T *prev_out_grad;
T *state_bias_grad;
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
@ -57,6 +59,22 @@ struct GRUUnitGradFunctor {
bool origin_mode); bool origin_mode);
}; };
template <typename DeviceContext, typename T>
struct GRUUnitFunctorV2 {
static void compute(const DeviceContext &context, GRUMetaValue<T> value,
int frame_size, int batch_size,
const detail::ActivationType active_node,
const detail::ActivationType active_gate);
};
template <typename DeviceContext, typename T>
struct GRUUnitGradFunctorV2 {
static void compute(const DeviceContext &context, GRUMetaValue<T> value,
GRUMetaGrad<T> grad, int frame_size, int batch_size,
const detail::ActivationType active_node,
const detail::ActivationType active_gate);
};
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle

@ -33,10 +33,12 @@ struct LstmUnitFunctor<platform::CPUDeviceContext, T> {
LstmMetaValue<T> value, int frame_size, int batch_size, LstmMetaValue<T> value, int frame_size, int batch_size,
T cell_clip, const detail::ActivationType& gate_act, T cell_clip, const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act, const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) { const detail::ActivationType& cand_act,
bool old_api_version = true) {
for (int b = 0; b < batch_size; b++) { for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size, detail::cpu_lstm_forward(context, detail::forward::lstm<T>(), value,
cell_clip, cand_act, gate_act, cell_act); frame_size, cell_clip, cand_act, gate_act,
cell_act, old_api_version);
value.gate_value += frame_size * 4; value.gate_value += frame_size * 4;
value.state_value += frame_size; value.state_value += frame_size;
value.state_active_value += frame_size; value.state_active_value += frame_size;
@ -55,11 +57,12 @@ struct LstmUnitGradFunctor<platform::CPUDeviceContext, T> {
int frame_size, int batch_size, T cell_clip, int frame_size, int batch_size, T cell_clip,
const detail::ActivationType& gate_act, const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act, const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) { const detail::ActivationType& cand_act,
bool old_api_version = true) {
for (int b = 0; b < batch_size; b++) { for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad, detail::cpu_lstm_backward(context, detail::backward::lstm<T>(), value,
frame_size, cell_clip, cand_act, gate_act, grad, frame_size, cell_clip, cand_act, gate_act,
cell_act); cell_act, old_api_version);
value.gate_value += frame_size * 4; value.gate_value += frame_size * 4;
value.state_value += frame_size; value.state_value += frame_size;

@ -26,7 +26,8 @@ struct LstmUnitFunctor<platform::CUDADeviceContext, T> {
LstmMetaValue<T> value, int frame_size, int batch_size, LstmMetaValue<T> value, int frame_size, int batch_size,
T cell_clip, const detail::ActivationType& gate_act, T cell_clip, const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act, const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) { const detail::ActivationType& cand_act,
bool old_api_version = true) {
detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value, detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
frame_size, batch_size, cell_clip, cand_act, frame_size, batch_size, cell_clip, cand_act,
gate_act, cell_act); gate_act, cell_act);
@ -40,7 +41,8 @@ struct LstmUnitGradFunctor<platform::CUDADeviceContext, T> {
int frame_size, int batch_size, T cell_clip, int frame_size, int batch_size, T cell_clip,
const detail::ActivationType& gate_act, const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act, const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) { const detail::ActivationType& cand_act,
bool old_api_version = true) {
detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad, detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad,
frame_size, batch_size, cell_clip, cand_act, frame_size, batch_size, cell_clip, cand_act,
gate_act, cell_act); gate_act, cell_act);

@ -25,7 +25,7 @@ namespace math {
template <class T> template <class T>
struct LstmMetaValue { struct LstmMetaValue {
T *gate_value; T *gate_value;
T *prev_state_value; const T *prev_state_value;
T *state_value; T *state_value;
T *state_active_value; T *state_active_value;
T *output_value; T *output_value;
@ -53,7 +53,8 @@ class LstmUnitFunctor {
int frame_size, int batch_size, T cell_clip, int frame_size, int batch_size, T cell_clip,
const detail::ActivationType &gate_act, const detail::ActivationType &gate_act,
const detail::ActivationType &cell_act, const detail::ActivationType &cell_act,
const detail::ActivationType &cand_act); const detail::ActivationType &cand_act,
bool old_api_version = true);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
@ -63,7 +64,8 @@ class LstmUnitGradFunctor {
LstmMetaGrad<T> grad, int frame_size, int batch_size, LstmMetaGrad<T> grad, int frame_size, int batch_size,
T cell_clip, const detail::ActivationType &gate_act, T cell_clip, const detail::ActivationType &gate_act,
const detail::ActivationType &cell_act, const detail::ActivationType &cell_act,
const detail::ActivationType &cand_act); const detail::ActivationType &cand_act,
bool old_api_version = true);
}; };
} // namespace math } // namespace math

@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/rnn_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
@ -251,5 +252,10 @@ REGISTER_OPERATOR(rnn, ops::RNNOp, ops::RNNOpMaker,
ops::RNNGradOpMaker<paddle::imperative::OpBase>); ops::RNNGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(rnn_grad, ops::RNNGradOp); REGISTER_OPERATOR(rnn_grad, ops::RNNGradOp);
REGISTER_OP_CPU_KERNEL(rnn, ops::NotImpleKernel<float>); REGISTER_OP_CPU_KERNEL(
REGISTER_OP_CPU_KERNEL(rnn_grad, ops::NotImpleKernel<float>); rnn, ops::RNNCPUKernel<paddle::platform::CPUDeviceContext, float>,
ops::RNNCPUKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
rnn_grad, ops::RNNCPUGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::RNNCPUGradKernel<paddle::platform::CPUDeviceContext, double>);

@ -524,6 +524,12 @@ class RNNGradCudnnKernel : public framework::OpKernel<T> {
offset += len; offset += len;
} }
Tensor input_grad_value;
if (!in_grad) {
in_grad = &input_grad_value;
in_grad->Resize(input->dims());
}
auto *init_h_data = pre_state[0]->data<T>(); auto *init_h_data = pre_state[0]->data<T>();
// auto *last_h_data = state[0]->data<T>(); // auto *last_h_data = state[0]->data<T>();
auto *last_h_grad_data = state_grad[0]->data<T>(); auto *last_h_grad_data = state_grad[0]->data<T>();

File diff suppressed because it is too large Load Diff

@ -65,10 +65,18 @@ class TestLstm(unittest.TestCase):
paddle.jit.ProgramTranslator().enable(True) paddle.jit.ProgramTranslator().enable(True)
net = Net(12, 2) net = Net(12, 2)
x = paddle.randn((2, 10, 12)) x = paddle.randn((2, 10, 12))
x.stop_gradient = False
dygraph_out = net(x) dygraph_out = net(x)
loss = paddle.mean(dygraph_out)
sgd = paddle.optimizer.SGD(learning_rate=0.001,
parameters=net.parameters())
loss.backward()
sgd.step()
# switch eval mode firstly # switch eval mode firstly
net.eval() net.eval()
x = paddle.randn((2, 10, 12))
dygraph_out = net(x)
dropout_out = net(x)
net = paddle.jit.to_static( net = paddle.jit.to_static(
net, input_spec=[paddle.static.InputSpec(shape=[-1, 10, 12])]) net, input_spec=[paddle.static.InputSpec(shape=[-1, 10, 12])])
paddle.jit.save(net, 'simple_lstm') paddle.jit.save(net, 'simple_lstm')
@ -106,6 +114,14 @@ class TestSaveInEvalMode(unittest.TestCase):
def test_save_in_eval(self): def test_save_in_eval(self):
paddle.jit.ProgramTranslator().enable(True) paddle.jit.ProgramTranslator().enable(True)
net = LinearNet() net = LinearNet()
x = paddle.randn((2, 10))
x.stop_gradient = False
dygraph_out = net(x)
loss = paddle.mean(dygraph_out)
sgd = paddle.optimizer.SGD(learning_rate=0.001,
parameters=net.parameters())
loss.backward()
sgd.step()
# switch eval mode firstly # switch eval mode firstly
net.eval() net.eval()
# save directly # save directly
@ -129,6 +145,14 @@ class TestEvalAfterSave(unittest.TestCase):
def test_eval_after_save(self): def test_eval_after_save(self):
x = paddle.randn((2, 10, 12)).astype('float32') x = paddle.randn((2, 10, 12)).astype('float32')
net = Net(12, 2) net = Net(12, 2)
x.stop_gradient = False
dy_out = net(x)
loss = paddle.mean(dy_out)
sgd = paddle.optimizer.SGD(learning_rate=0.001,
parameters=net.parameters())
loss.backward()
sgd.step()
x = paddle.randn((2, 10, 12)).astype('float32')
dy_out = net(x) dy_out = net(x)
# save model # save model
paddle.jit.save(net, 'jit.save/lstm', input_spec=[x]) paddle.jit.save(net, 'jit.save/lstm', input_spec=[x])

@ -49,3 +49,34 @@ def convert_params_for_net_static(np_net, paddle_net, place):
paddle_layer.cell_fw, place) paddle_layer.cell_fw, place)
convert_params_for_cell_static(np_layer.cell_bw, convert_params_for_cell_static(np_layer.cell_bw,
paddle_layer.cell_bw, place) paddle_layer.cell_bw, place)
def get_params_for_cell(np_cell, num_layers, idx):
state = np_cell.parameters
weight_list = [
('{}.weight_{}'.format(num_layers, idx), state['weight_ih']),
('{}.weight_{}'.format(num_layers, idx + 1), state['weight_hh'])
]
bias_list = [('{}.bias_{}'.format(num_layers, idx), state['bias_ih']),
('{}.bias_{}'.format(num_layers, idx + 1), state['bias_hh'])]
return weight_list, bias_list
def get_params_for_net(np_net):
weight_list = []
bias_list = []
for layer_idx, np_layer in enumerate(np_net):
if hasattr(np_layer, "cell"):
weight, bias = get_params_for_cell(np_layer.cell, layer_idx, 0)
for w, b in zip(weight, bias):
weight_list.append(w)
bias_list.append(b)
else:
for count, cell in enumerate([np_layer.cell_fw, np_layer.cell_bw]):
weight, bias = get_params_for_cell(cell, layer_idx, count * 2)
for w, b in zip(weight, bias):
weight_list.append(w)
bias_list.append(b)
weight_list.extend(bias_list)
return weight_list

@ -33,11 +33,16 @@ class LayerListMixin(LayerMixin):
class SimpleRNNCell(LayerMixin): class SimpleRNNCell(LayerMixin):
def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh"): def __init__(self,
input_size,
hidden_size,
bias=True,
nonlinearity="RNN_TANH",
dtype="float64"):
self.input_size = input_size self.input_size = input_size
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.bias = bias self.bias = bias
if nonlinearity == 'tanh': if nonlinearity == 'RNN_TANH':
self.nonlinearity = np.tanh self.nonlinearity = np.tanh
else: else:
self.nonlinearity = lambda x: np.maximum(x, 0.) self.nonlinearity = lambda x: np.maximum(x, 0.)
@ -45,16 +50,16 @@ class SimpleRNNCell(LayerMixin):
self.parameters = dict() self.parameters = dict()
std = 1.0 / math.sqrt(hidden_size) std = 1.0 / math.sqrt(hidden_size)
self.weight_ih = np.random.uniform(-std, std, ( self.weight_ih = np.random.uniform(-std, std, (
hidden_size, input_size)).astype('float64') hidden_size, input_size)).astype(dtype)
self.weight_hh = np.random.uniform(-std, std, ( self.weight_hh = np.random.uniform(-std, std, (
hidden_size, hidden_size)).astype('float64') hidden_size, hidden_size)).astype(dtype)
self.parameters['weight_ih'] = self.weight_ih self.parameters['weight_ih'] = self.weight_ih
self.parameters['weight_hh'] = self.weight_hh self.parameters['weight_hh'] = self.weight_hh
if bias: if bias:
self.bias_ih = np.random.uniform(-std, std, self.bias_ih = np.random.uniform(-std, std,
(hidden_size, )).astype('float64') (hidden_size, )).astype(dtype)
self.bias_hh = np.random.uniform(-std, std, self.bias_hh = np.random.uniform(-std, std,
(hidden_size, )).astype('float64') (hidden_size, )).astype(dtype)
self.parameters['bias_ih'] = self.bias_ih self.parameters['bias_ih'] = self.bias_ih
self.parameters['bias_hh'] = self.bias_hh self.parameters['bias_hh'] = self.bias_hh
else: else:
@ -80,23 +85,23 @@ class SimpleRNNCell(LayerMixin):
class GRUCell(LayerMixin): class GRUCell(LayerMixin):
def __init__(self, input_size, hidden_size, bias=True): def __init__(self, input_size, hidden_size, bias=True, dtype="float64"):
self.input_size = input_size self.input_size = input_size
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.bias = bias self.bias = bias
self.parameters = dict() self.parameters = dict()
std = 1.0 / math.sqrt(hidden_size) std = 1.0 / math.sqrt(hidden_size)
self.weight_ih = np.random.uniform(-std, std, ( self.weight_ih = np.random.uniform(-std, std, (
3 * hidden_size, input_size)).astype('float64') 3 * hidden_size, input_size)).astype(dtype)
self.weight_hh = np.random.uniform(-std, std, ( self.weight_hh = np.random.uniform(-std, std, (
3 * hidden_size, hidden_size)).astype('float64') 3 * hidden_size, hidden_size)).astype(dtype)
self.parameters['weight_ih'] = self.weight_ih self.parameters['weight_ih'] = self.weight_ih
self.parameters['weight_hh'] = self.weight_hh self.parameters['weight_hh'] = self.weight_hh
if bias: if bias:
self.bias_ih = np.random.uniform(-std, std, ( self.bias_ih = np.random.uniform(-std, std,
3 * hidden_size)).astype('float64') (3 * hidden_size)).astype(dtype)
self.bias_hh = np.random.uniform(-std, std, ( self.bias_hh = np.random.uniform(-std, std,
3 * hidden_size)).astype('float64') (3 * hidden_size)).astype(dtype)
self.parameters['bias_ih'] = self.bias_ih self.parameters['bias_ih'] = self.bias_ih
self.parameters['bias_hh'] = self.bias_hh self.parameters['bias_hh'] = self.bias_hh
else: else:
@ -128,23 +133,23 @@ class GRUCell(LayerMixin):
class LSTMCell(LayerMixin): class LSTMCell(LayerMixin):
def __init__(self, input_size, hidden_size, bias=True): def __init__(self, input_size, hidden_size, bias=True, dtype="float64"):
self.input_size = input_size self.input_size = input_size
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.bias = bias self.bias = bias
self.parameters = dict() self.parameters = dict()
std = 1.0 / math.sqrt(hidden_size) std = 1.0 / math.sqrt(hidden_size)
self.weight_ih = np.random.uniform(-std, std, ( self.weight_ih = np.random.uniform(-std, std, (
4 * hidden_size, input_size)).astype('float64') 4 * hidden_size, input_size)).astype(dtype)
self.weight_hh = np.random.uniform(-std, std, ( self.weight_hh = np.random.uniform(-std, std, (
4 * hidden_size, hidden_size)).astype('float64') 4 * hidden_size, hidden_size)).astype(dtype)
self.parameters['weight_ih'] = self.weight_ih self.parameters['weight_ih'] = self.weight_ih
self.parameters['weight_hh'] = self.weight_hh self.parameters['weight_hh'] = self.weight_hh
if bias: if bias:
self.bias_ih = np.random.uniform(-std, std, ( self.bias_ih = np.random.uniform(-std, std,
4 * hidden_size)).astype('float64') (4 * hidden_size)).astype(dtype)
self.bias_hh = np.random.uniform(-std, std, ( self.bias_hh = np.random.uniform(-std, std,
4 * hidden_size)).astype('float64') (4 * hidden_size)).astype(dtype)
self.parameters['bias_ih'] = self.bias_ih self.parameters['bias_ih'] = self.bias_ih
self.parameters['bias_hh'] = self.bias_hh self.parameters['bias_hh'] = self.bias_hh
else: else:
@ -403,28 +408,36 @@ class SimpleRNN(RNNMixin):
input_size, input_size,
hidden_size, hidden_size,
num_layers=1, num_layers=1,
nonlinearity="tanh", nonlinearity="RNN_TANH",
direction="forward", direction="forward",
dropout=0., dropout=0.,
time_major=False): time_major=False,
dtype="float64"):
super(SimpleRNN, self).__init__() super(SimpleRNN, self).__init__()
if direction in ["forward", "backward"]: if direction in ["forward", "backward"]:
is_reverse = direction == "backward" is_reverse = direction == "backward"
cell = SimpleRNNCell(input_size, hidden_size, nonlinearity) cell = SimpleRNNCell(
input_size, hidden_size, nonlinearity=nonlinearity, dtype=dtype)
self.append(RNN(cell, is_reverse, time_major)) self.append(RNN(cell, is_reverse, time_major))
for i in range(1, num_layers): for i in range(1, num_layers):
cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity) cell = SimpleRNNCell(
hidden_size,
hidden_size,
nonlinearity=nonlinearity,
dtype=dtype)
self.append(RNN(cell, is_reverse, time_major)) self.append(RNN(cell, is_reverse, time_major))
elif direction == "bidirectional": elif direction == "bidirectional":
cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity) cell_fw = SimpleRNNCell(
cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity) input_size, hidden_size, nonlinearity=nonlinearity, dtype=dtype)
cell_bw = SimpleRNNCell(
input_size, hidden_size, nonlinearity=nonlinearity, dtype=dtype)
self.append(BiRNN(cell_fw, cell_bw, time_major)) self.append(BiRNN(cell_fw, cell_bw, time_major))
for i in range(1, num_layers): for i in range(1, num_layers):
cell_fw = SimpleRNNCell(2 * hidden_size, hidden_size, cell_fw = SimpleRNNCell(
nonlinearity) 2 * hidden_size, hidden_size, nonlinearity, dtype=dtype)
cell_bw = SimpleRNNCell(2 * hidden_size, hidden_size, cell_bw = SimpleRNNCell(
nonlinearity) 2 * hidden_size, hidden_size, nonlinearity, dtype=dtype)
self.append(BiRNN(cell_fw, cell_bw, time_major)) self.append(BiRNN(cell_fw, cell_bw, time_major))
else: else:
raise ValueError( raise ValueError(
@ -447,23 +460,24 @@ class LSTM(RNNMixin):
num_layers=1, num_layers=1,
direction="forward", direction="forward",
dropout=0., dropout=0.,
time_major=False): time_major=False,
dtype="float64"):
super(LSTM, self).__init__() super(LSTM, self).__init__()
if direction in ["forward", "backward"]: if direction in ["forward", "backward"]:
is_reverse = direction == "backward" is_reverse = direction == "backward"
cell = LSTMCell(input_size, hidden_size) cell = LSTMCell(input_size, hidden_size, dtype=dtype)
self.append(RNN(cell, is_reverse, time_major)) self.append(RNN(cell, is_reverse, time_major))
for i in range(1, num_layers): for i in range(1, num_layers):
cell = LSTMCell(hidden_size, hidden_size) cell = LSTMCell(hidden_size, hidden_size, dtype=dtype)
self.append(RNN(cell, is_reverse, time_major)) self.append(RNN(cell, is_reverse, time_major))
elif direction == "bidirectional": elif direction == "bidirectional":
cell_fw = LSTMCell(input_size, hidden_size) cell_fw = LSTMCell(input_size, hidden_size, dtype=dtype)
cell_bw = LSTMCell(input_size, hidden_size) cell_bw = LSTMCell(input_size, hidden_size, dtype=dtype)
self.append(BiRNN(cell_fw, cell_bw, time_major)) self.append(BiRNN(cell_fw, cell_bw, time_major))
for i in range(1, num_layers): for i in range(1, num_layers):
cell_fw = LSTMCell(2 * hidden_size, hidden_size) cell_fw = LSTMCell(2 * hidden_size, hidden_size, dtype=dtype)
cell_bw = LSTMCell(2 * hidden_size, hidden_size) cell_bw = LSTMCell(2 * hidden_size, hidden_size, dtype=dtype)
self.append(BiRNN(cell_fw, cell_bw, time_major)) self.append(BiRNN(cell_fw, cell_bw, time_major))
else: else:
raise ValueError( raise ValueError(
@ -486,23 +500,24 @@ class GRU(RNNMixin):
num_layers=1, num_layers=1,
direction="forward", direction="forward",
dropout=0., dropout=0.,
time_major=False): time_major=False,
dtype="float64"):
super(GRU, self).__init__() super(GRU, self).__init__()
if direction in ["forward", "backward"]: if direction in ["forward", "backward"]:
is_reverse = direction == "backward" is_reverse = direction == "backward"
cell = GRUCell(input_size, hidden_size) cell = GRUCell(input_size, hidden_size, dtype=dtype)
self.append(RNN(cell, is_reverse, time_major)) self.append(RNN(cell, is_reverse, time_major))
for i in range(1, num_layers): for i in range(1, num_layers):
cell = GRUCell(hidden_size, hidden_size) cell = GRUCell(hidden_size, hidden_size, dtype=dtype)
self.append(RNN(cell, is_reverse, time_major)) self.append(RNN(cell, is_reverse, time_major))
elif direction == "bidirectional": elif direction == "bidirectional":
cell_fw = GRUCell(input_size, hidden_size) cell_fw = GRUCell(input_size, hidden_size, dtype=dtype)
cell_bw = GRUCell(input_size, hidden_size) cell_bw = GRUCell(input_size, hidden_size, dtype=dtype)
self.append(BiRNN(cell_fw, cell_bw, time_major)) self.append(BiRNN(cell_fw, cell_bw, time_major))
for i in range(1, num_layers): for i in range(1, num_layers):
cell_fw = GRUCell(2 * hidden_size, hidden_size) cell_fw = GRUCell(2 * hidden_size, hidden_size, dtype=dtype)
cell_bw = GRUCell(2 * hidden_size, hidden_size) cell_bw = GRUCell(2 * hidden_size, hidden_size, dtype=dtype)
self.append(BiRNN(cell_fw, cell_bw, time_major)) self.append(BiRNN(cell_fw, cell_bw, time_major))
else: else:
raise ValueError( raise ValueError(

@ -0,0 +1,164 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import math
from op_test import OpTest
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import random
import sys
sys.path.append("./rnn")
from rnn_numpy import GRU
from convert import get_params_for_net
random.seed(2)
np.set_printoptions(threshold=np.inf)
paddle.enable_static()
class TestGRUOp(OpTest):
    """Checks the fused `rnn` operator in GRU mode against a NumPy reference.

    A numpy GRU (rnn_numpy.GRU) is built with random weights; the same flat
    weight list is fed to the `rnn` op, and forward output plus gradients are
    compared against the numpy forward pass.
    """

    def get_weight_names(self):
        # Parameter variable names follow the "{layer}.weight_{k}" /
        # "{layer}.bias_{k}" layout: two weights (ih, hh) and two biases
        # per direction, per layer.
        weight_names = []
        for i in range(self.num_layers):
            for j in range(0, 2 * self.direction_num):
                weight_names.append("{}.weight_{}".format(i, j))
        for i in range(self.num_layers):
            for j in range(0, 2 * self.direction_num):
                weight_names.append("{}.bias_{}".format(i, j))
        return weight_names

    def setUp(self):
        self.op_type = "rnn"
        self.dtype = "float64"
        self.sequence_length = np.array(
            [12, 11, 10, 9, 8, 7, 6, 5], dtype=np.int32)
        self.num_layers = 1
        self.is_bidirec = False
        self.is_test = False
        self.mode = "GRU"
        self.dropout = 0.
        seq_length = 12
        batch_size = 8
        input_size = 4
        self.hidden_size = 2
        # Subclasses override set_attrs() to vary the configuration above
        # before the test data is built.
        self.set_attrs()
        self.direction_num = 2 if self.is_bidirec else 1
        direction = "bidirectional" if self.is_bidirec else "forward"
        # Time-major input: (seq_length, batch_size, input_size).
        input = np.random.uniform(
            low=-0.1, high=0.1,
            size=(seq_length, batch_size, input_size)).astype(self.dtype)
        if self.sequence_length is not None:
            # Zero out padded positions of the variable-length batch.
            # NOTE(review): these masked rows (time steps 1-4) do not
            # obviously line up with sequence_length [12, 11, ..., 5] —
            # confirm the intended padding pattern (compare TestRNNOp,
            # which masks the trailing time steps 8-11).
            input[3][1:][:] = 0
            input[4][2:][:] = 0
            input[2][3:][:] = 0
            input[1][4:][:] = 0
        rnn1 = GRU(input_size,
                   self.hidden_size,
                   num_layers=self.num_layers,
                   time_major=True,
                   direction=direction,
                   dropout=self.dropout,
                   dtype=self.dtype)
        # Flatten the numpy model's parameters into the op's WeightList.
        flat_w = get_params_for_net(rnn1)
        # Reference forward pass used as the expected output.
        output, last_hidden = rnn1(input, sequence_length=self.sequence_length)
        init_h = np.zeros((self.num_layers * self.direction_num, batch_size,
                           self.hidden_size)).astype(self.dtype)
        # Opaque dropout-state buffer; contents are not checked.
        state_out = np.ndarray((300)).astype("uint8")
        self.inputs = {
            'Input': input,
            'WeightList': flat_w,
            'PreState': [('init_h', init_h)],
            'SequenceLength': self.sequence_length
        }
        if self.sequence_length is None:
            # Without SequenceLength every sequence runs full length.
            self.inputs = {
                'Input': input,
                'WeightList': flat_w,
                'PreState': [('init_h', init_h)],
            }
        self.attrs = {
            'dropout_prob': self.dropout,
            'is_bidirec': self.is_bidirec,
            'input_size': input_size,
            'hidden_size': self.hidden_size,
            'num_layers': self.num_layers,
            'is_test': self.is_test,
            'mode': self.mode
        }
        self.outputs = {
            'Out': output,
            'State': [('last_hidden', last_hidden)],
            # Reserve/DropoutState are opaque workspace buffers; they are
            # excluded from output comparison in test_output().
            'Reserve': np.ndarray((400)).astype("uint8"),
            'DropoutState': state_out
        }

    def set_attrs(self):
        # Hook for subclasses to tweak test attributes; base class uses
        # the defaults assigned in setUp().
        pass

    def test_output(self):
        self.check_output(no_check_set=['Reserve', 'DropoutState'])

    def test_grad(self):
        # Gradient check is skipped in inference-only (is_test) configs.
        if not self.is_test:
            var_name_list = self.get_weight_names()
            grad_check_list = ['Input', 'init_h']
            grad_check_list.extend(var_name_list)
            self.check_grad(set(grad_check_list), ['Out', 'last_hidden'])
class TestGRUOp1(TestGRUOp):
    # GRU without SequenceLength input (all sequences run full length).
    def set_attrs(self):
        self.sequence_length = None
class TestGRUOp2(TestGRUOp):
    # Bidirectional GRU, no SequenceLength input.
    def set_attrs(self):
        self.sequence_length = None
        self.is_bidirec = True
class TestGRUOp3(TestGRUOp):
    # Inference-only GRU (is_test=True skips the gradient check).
    def set_attrs(self):
        self.sequence_length = None
        self.is_test = True
class TestGRUOp4(TestGRUOp):
    # Bidirectional, inference-only GRU.
    def set_attrs(self):
        self.sequence_length = None
        self.is_bidirec = True
        self.is_test = True
class TestGRUOpAvx(TestGRUOp):
    # float32 with hidden_size 8 — presumably sized to exercise the AVX
    # code path of the CPU kernel (TODO confirm against the kernel's
    # vectorization width).
    def set_attrs(self):
        self.dtype = "float32"
        self.hidden_size = 8
if __name__ == '__main__':
unittest.main()

@ -0,0 +1,159 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import math
import paddle.fluid.core as core
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import random
import sys
from op_test import OpTest
sys.path.append("./rnn")
from rnn_numpy import SimpleRNN, LSTM, GRU
from convert import get_params_for_net
random.seed(2)
np.set_printoptions(threshold=np.inf)
paddle.enable_static()
class TestRNNOp(OpTest):
    """Checks the fused `rnn` operator in LSTM mode against a NumPy reference.

    A numpy LSTM (rnn_numpy.LSTM) is built with random weights; the same
    flat weight list is fed to the `rnn` op, and forward output plus
    gradients are compared against the numpy forward pass.
    """

    def get_weight_names(self):
        # Parameter variable names follow the "{layer}.weight_{k}" /
        # "{layer}.bias_{k}" layout: two weights (ih, hh) and two biases
        # per direction, per layer.
        weight_names = []
        for i in range(self.num_layers):
            for j in range(0, 2 * self.direction_num):
                weight_names.append("{}.weight_{}".format(i, j))
        for i in range(self.num_layers):
            for j in range(0, 2 * self.direction_num):
                weight_names.append("{}.bias_{}".format(i, j))
        return weight_names

    def setUp(self):
        self.op_type = "rnn"
        self.dtype = np.float64
        self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32)
        self.num_layers = 1
        self.is_bidirec = False
        self.mode = "LSTM"
        self.is_test = False
        # Subclasses override set_attrs() to vary the configuration above.
        self.set_attrs()
        self.direction_num = 2 if self.is_bidirec else 1
        direction = "bidirectional" if self.is_bidirec else "forward"
        seq_length = 12
        batch_size = 5
        input_size = 3
        hidden_size = 2
        # Time-major input: (seq_length, batch_size, input_size).
        input = np.random.uniform(
            low=-0.1, high=0.1,
            size=(seq_length, batch_size, input_size)).astype(self.dtype)
        if self.sequence_length is not None:
            # Zero the trailing time steps of shorter sequences so they
            # match sequence_length [12, 11, 10, 9, 8]: e.g. batch item 1
            # (length 11) has time step 11 zeroed.
            input[11][1:][:] = 0
            input[10][2:][:] = 0
            input[9][3:][:] = 0
            input[8][4:][:] = 0
        rnn1 = LSTM(
            input_size,
            hidden_size,
            num_layers=self.num_layers,
            time_major=True,
            direction=direction)
        # Flatten the numpy model's parameters into the op's WeightList.
        flat_w = get_params_for_net(rnn1)
        # Reference forward pass; LSTM also returns the cell state.
        output, (last_hidden, last_cell) = rnn1(
            input, sequence_length=self.sequence_length)
        init_h = np.zeros((self.num_layers * self.direction_num, batch_size,
                           hidden_size)).astype(self.dtype)
        init_c = np.zeros((self.num_layers * self.direction_num, batch_size,
                           hidden_size)).astype(self.dtype)
        # Opaque dropout-state buffer; contents are not checked.
        state_out = np.ndarray((300)).astype("uint8")
        self.inputs = {
            'Input': input,
            'WeightList': flat_w,
            'PreState': [('init_h', init_h), ('init_c', init_c)],
            'SequenceLength': self.sequence_length
        }
        if self.sequence_length is None:
            # Without SequenceLength every sequence runs full length.
            self.inputs = {
                'Input': input,
                'WeightList': flat_w,
                'PreState': [('init_h', init_h), ('init_c', init_c)],
            }
        self.attrs = {
            'dropout_prob': 0.0,
            'is_bidirec': self.is_bidirec,
            'input_size': input_size,
            'hidden_size': hidden_size,
            'num_layers': self.num_layers,
            'mode': self.mode,
            'is_test': self.is_test
        }
        self.outputs = {
            'Out': output,
            # Reserve/DropoutState are opaque workspace buffers; they are
            # excluded from output comparison in test_output().
            "State": [('last_hidden', last_hidden), ('last_cell', last_cell)],
            'Reserve': np.ndarray((400)).astype("uint8"),
            'DropoutState': state_out
        }

    def test_output(self):
        self.check_output(no_check_set=['Reserve', 'DropoutState'])

    def set_attrs(self):
        # Hook for subclasses to tweak test attributes; base class uses
        # the defaults assigned in setUp().
        pass

    def test_grad(self):
        # Gradient check is skipped in inference-only (is_test) configs.
        if not self.is_test:
            var_name_list = self.get_weight_names()
            grad_check_list = ['Input', 'init_h', 'init_c']
            grad_check_list.extend(var_name_list)
            self.check_grad(
                set(grad_check_list), ['Out', 'last_hidden', 'last_cell'])
class TestRNNOp1(TestRNNOp):
    # LSTM without SequenceLength input (all sequences run full length).
    def set_attrs(self):
        self.sequence_length = None
class TestRNNOp2(TestRNNOp):
    # Bidirectional LSTM, no SequenceLength input.
    def set_attrs(self):
        self.sequence_length = None
        self.is_bidirec = True
class TestRNNOp3(TestRNNOp):
    # Inference-only LSTM (is_test=True skips the gradient check).
    def set_attrs(self):
        self.is_test = True
        self.sequence_length = None
class TestRNNOp4(TestRNNOp):
    # Bidirectional, inference-only LSTM.
    def set_attrs(self):
        self.is_test = True
        self.sequence_length = None
        self.is_bidirec = True
if __name__ == '__main__':
unittest.main()

@ -0,0 +1,162 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import math
from op_test import OpTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import random
import sys
sys.path.append("./rnn")
from rnn_numpy import SimpleRNN
from convert import get_params_for_net
random.seed(2)
np.set_printoptions(threshold=np.inf)
paddle.enable_static()
class TestSimpleRNNOp(OpTest):
    """Checks the fused `rnn` operator in simple-RNN mode against NumPy.

    A numpy SimpleRNN (rnn_numpy.SimpleRNN) is built with random weights;
    the same flat weight list is fed to the `rnn` op, and forward output
    plus gradients are compared against the numpy forward pass. `mode`
    selects the nonlinearity: "RNN_TANH" (default) or "RNN_RELU".
    """

    def get_weight_names(self):
        # Parameter variable names follow the "{layer}.weight_{k}" /
        # "{layer}.bias_{k}" layout: two weights (ih, hh) and two biases
        # per direction, per layer.
        weight_names = []
        for i in range(self.num_layers):
            for j in range(0, 2 * self.direction_num):
                weight_names.append("{}.weight_{}".format(i, j))
        for i in range(self.num_layers):
            for j in range(0, 2 * self.direction_num):
                weight_names.append("{}.bias_{}".format(i, j))
        return weight_names

    def setUp(self):
        self.op_type = "rnn"
        self.dtype = np.float64
        self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32)
        self.num_layers = 1
        self.is_bidirec = False
        self.is_test = False
        self.mode = "RNN_TANH"
        self.dropout = 0.
        # Subclasses override set_attrs() to vary the configuration above.
        self.set_attrs()
        self.direction_num = 2 if self.is_bidirec else 1
        direction = "bidirectional" if self.is_bidirec else "forward"
        seq_length = 12
        batch_size = 5
        input_size = 3
        hidden_size = 2
        # Time-major input: (seq_length, batch_size, input_size).
        input = np.random.uniform(
            low=-0.1, high=0.1,
            size=(seq_length, batch_size, input_size)).astype(self.dtype)
        if self.sequence_length is not None:
            # Zero the trailing time steps of shorter sequences so they
            # match sequence_length [12, 11, 10, 9, 8].
            input[11][1:][:] = 0
            input[10][2:][:] = 0
            input[9][3:][:] = 0
            input[8][4:][:] = 0
        rnn1 = SimpleRNN(
            input_size,
            hidden_size,
            num_layers=self.num_layers,
            time_major=True,
            direction=direction,
            dropout=self.dropout,
            nonlinearity=self.mode)
        # Flatten the numpy model's parameters into the op's WeightList.
        flat_w = get_params_for_net(rnn1)
        # Reference forward pass used as the expected output.
        output, last_hidden = rnn1(input, sequence_length=self.sequence_length)
        init_h = np.zeros((self.num_layers * self.direction_num, batch_size,
                           hidden_size)).astype(self.dtype)
        # Opaque dropout-state buffer; contents are not checked.
        state_out = np.ndarray((300)).astype("uint8")
        self.inputs = {
            'Input': input,
            'WeightList': flat_w,
            'PreState': [('init_h', init_h)],
            'SequenceLength': self.sequence_length
        }
        if self.sequence_length is None:
            # Without SequenceLength every sequence runs full length.
            self.inputs = {
                'Input': input,
                'WeightList': flat_w,
                'PreState': [('init_h', init_h)]
            }
        self.attrs = {
            'dropout_prob': self.dropout,
            'is_bidirec': self.is_bidirec,
            'input_size': input_size,
            'hidden_size': hidden_size,
            'num_layers': self.num_layers,
            'is_test': self.is_test,
            'mode': self.mode
        }
        self.outputs = {
            'Out': output,
            'State': [('last_hidden', last_hidden)],
            # Reserve/DropoutState are opaque workspace buffers; they are
            # excluded from output comparison in test_output().
            'Reserve': np.ndarray((400)).astype("uint8"),
            'DropoutState': state_out
        }

    def set_attrs(self):
        # Hook for subclasses to tweak test attributes; base class uses
        # the defaults assigned in setUp().
        pass

    def test_output(self):
        self.check_output(no_check_set=['Reserve', 'DropoutState'])

    def test_grad(self):
        # Gradient check is skipped in inference-only (is_test) configs.
        if not self.is_test:
            var_name_list = self.get_weight_names()
            grad_check_list = ['Input', 'init_h']
            grad_check_list.extend(var_name_list)
            self.check_grad(set(grad_check_list), ['Out', 'last_hidden'])
class TestSimpleRNNOp1(TestSimpleRNNOp):
    # Simple RNN without SequenceLength input (full-length sequences).
    def set_attrs(self):
        self.sequence_length = None
class TestSimpleRNNOp2(TestSimpleRNNOp):
    # Bidirectional simple RNN, no SequenceLength input.
    def set_attrs(self):
        self.sequence_length = None
        self.is_bidirec = True
class TestSimpleRNNOp3(TestSimpleRNNOp):
    # Inference-only simple RNN (is_test=True skips the gradient check).
    def set_attrs(self):
        self.sequence_length = None
        self.is_test = True
class TestSimpleRNNOp4(TestSimpleRNNOp):
    # Bidirectional, inference-only simple RNN.
    def set_attrs(self):
        self.sequence_length = None
        self.is_bidirec = True
        self.is_test = True
class TestSimpleRNNOp5(TestSimpleRNNOp):
    # Simple RNN with the ReLU nonlinearity instead of tanh.
    def set_attrs(self):
        self.mode = "RNN_RELU"
if __name__ == '__main__':
unittest.main()

@ -27,4 +27,5 @@ NEED_TO_FIX_OP_LIST = [
'tree_conv', 'tree_conv',
'cvm', 'cvm',
'cudnn_lstm', 'cudnn_lstm',
'rnn',
] ]

@ -28,4 +28,5 @@ no_check_set_white_list = [
'check_finite_and_unscale', 'check_finite_and_unscale',
'update_loss_scaling', 'update_loss_scaling',
'cudnn_lstm', 'cudnn_lstm',
'rnn',
] ]

@ -43,7 +43,8 @@ NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [
'yolov3_loss', \ 'yolov3_loss', \
'inverse', \ 'inverse', \
'bilateral_slice',\ 'bilateral_slice',\
'cudnn_lstm' 'cudnn_lstm', \
'rnn', \
] ]
NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp',\ NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp',\

@ -985,8 +985,7 @@ class RNNBase(LayerList):
"direction should be forward, backward or bidirectional, " "direction should be forward, backward or bidirectional, "
"received direction = {}".format(direction)) "received direction = {}".format(direction))
self.could_use_cudnn = get_device().startswith( self.could_use_cudnn = True
"gpu:") and get_cudnn_version()
self.could_use_cudnn &= direction != "backward" self.could_use_cudnn &= direction != "backward"
self.could_use_cudnn &= len(self.parameters()) == num_layers * 4 * ( self.could_use_cudnn &= len(self.parameters()) == num_layers * 4 * (
2 if direction == "bidirectional" else 1) 2 if direction == "bidirectional" else 1)

Loading…
Cancel
Save