Add LSTM, Simple RNN and GRU CPU kernel (#28577)
* add lstm, simple rnn op kernels
* fix test_lstm for the rnn op
* change func name
* fix forward postprocess bug
* add gru forward, backward code
* remove unittest.skipIf; use one big rnn op instead of a combination of ops
* fix bug where the input had no gradient
* add eigen lstm forward, backward

Co-authored-by: wawltor <fangzeyang0904@hotmail.com>
parent 30ef3815b3
commit 9362d85e0e
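For context: these tests exercise the single consolidated "rnn" operator, which backs Paddle's high-level recurrent layers on CPU. A minimal usage sketch, assuming the public paddle.nn layer names (the tests in this commit instead drive the raw op against a numpy reference):

    import paddle

    x = paddle.randn([4, 23, 16])                # [batch, seq_len, input_size]

    lstm = paddle.nn.LSTM(16, 32, num_layers=2)  # LSTM CPU kernel
    y, (h, c) = lstm(x)                          # y: [4, 23, 32]

    gru = paddle.nn.GRU(16, 32)                  # GRU CPU kernel
    y, h = gru(x)

    rnn = paddle.nn.SimpleRNN(16, 32)            # simple RNN kernel (tanh)
    y, h = rnn(x)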
3 file diffs suppressed because they are too large
@@ -0,0 +1,164 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import numpy as np
import math

from op_test import OpTest
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import random
import sys
sys.path.append("./rnn")
from rnn_numpy import GRU
from convert import get_params_for_net

random.seed(2)
np.set_printoptions(threshold=np.inf)
paddle.enable_static()


class TestGRUOp(OpTest):
    def get_weight_names(self):
        weight_names = []
        for i in range(self.num_layers):
            for j in range(0, 2 * self.direction_num):
                weight_names.append("{}.weight_{}".format(i, j))
        for i in range(self.num_layers):
            for j in range(0, 2 * self.direction_num):
                weight_names.append("{}.bias_{}".format(i, j))
        return weight_names

    def setUp(self):
        self.op_type = "rnn"
        self.dtype = "float64"
        self.sequence_length = np.array(
            [12, 11, 10, 9, 8, 7, 6, 5], dtype=np.int32)
        self.num_layers = 1
        self.is_bidirec = False
        self.is_test = False
        self.mode = "GRU"
        self.dropout = 0.
        seq_length = 12
        batch_size = 8
        input_size = 4
        self.hidden_size = 2
        self.set_attrs()

        self.direction_num = 2 if self.is_bidirec else 1
        direction = "bidirectional" if self.is_bidirec else "forward"

        input = np.random.uniform(
            low=-0.1, high=0.1,
            size=(seq_length, batch_size, input_size)).astype(self.dtype)

        if self.sequence_length is not None:
            input[3][1:][:] = 0
            input[4][2:][:] = 0
            input[2][3:][:] = 0
            input[1][4:][:] = 0

        rnn1 = GRU(input_size,
                   self.hidden_size,
                   num_layers=self.num_layers,
                   time_major=True,
                   direction=direction,
                   dropout=self.dropout,
                   dtype=self.dtype)

        flat_w = get_params_for_net(rnn1)

        output, last_hidden = rnn1(input, sequence_length=self.sequence_length)

        init_h = np.zeros((self.num_layers * self.direction_num, batch_size,
                           self.hidden_size)).astype(self.dtype)

        # opaque scratch buffer; its values are not checked (see no_check_set)
        state_out = np.ndarray((300)).astype("uint8")

        self.inputs = {
            'Input': input,
            'WeightList': flat_w,
            'PreState': [('init_h', init_h)],
            'SequenceLength': self.sequence_length
        }
        if self.sequence_length is None:
            self.inputs = {
                'Input': input,
                'WeightList': flat_w,
                'PreState': [('init_h', init_h)],
            }
        self.attrs = {
            'dropout_prob': self.dropout,
            'is_bidirec': self.is_bidirec,
            'input_size': input_size,
            'hidden_size': self.hidden_size,
            'num_layers': self.num_layers,
            'is_test': self.is_test,
            'mode': self.mode
        }
        self.outputs = {
            'Out': output,
            'State': [('last_hidden', last_hidden)],
            'Reserve': np.ndarray((400)).astype("uint8"),
            'DropoutState': state_out
        }

    def set_attrs(self):
        pass

    def test_output(self):
        self.check_output(no_check_set=['Reserve', 'DropoutState'])

    def test_grad(self):
        if not self.is_test:
            var_name_list = self.get_weight_names()
            grad_check_list = ['Input', 'init_h']
            grad_check_list.extend(var_name_list)
            self.check_grad(set(grad_check_list), ['Out', 'last_hidden'])


class TestGRUOp1(TestGRUOp):
    def set_attrs(self):
        self.sequence_length = None


class TestGRUOp2(TestGRUOp):
    def set_attrs(self):
        self.sequence_length = None
        self.is_bidirec = True


class TestGRUOp3(TestGRUOp):
    def set_attrs(self):
        self.sequence_length = None
        self.is_test = True


class TestGRUOp4(TestGRUOp):
    def set_attrs(self):
        self.sequence_length = None
        self.is_bidirec = True
        self.is_test = True


class TestGRUOpAvx(TestGRUOp):
    # float32 with a larger hidden size, exercising the AVX kernel path
    def set_attrs(self):
        self.dtype = "float32"
        self.hidden_size = 8


if __name__ == '__main__':
    unittest.main()
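The test above checks the op output against the numpy reference rnn_numpy.GRU. For reference, a minimal sketch of a single GRU step under the standard three-gate layout; the gate order and weight shapes here are assumptions, the real reference lives in rnn_numpy.py:

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def gru_step(x, h, w_ih, w_hh, b_ih, b_hh):
        # x: [batch, input_size], h: [batch, hidden_size]
        # w_ih: [3 * hidden_size, input_size], w_hh: [3 * hidden_size, hidden_size]
        gi = x @ w_ih.T + b_ih
        gh = h @ w_hh.T + b_hh
        i_r, i_z, i_n = np.split(gi, 3, axis=-1)
        h_r, h_z, h_n = np.split(gh, 3, axis=-1)
        r = sigmoid(i_r + h_r)        # reset gate
        z = sigmoid(i_z + h_z)        # update gate
        n = np.tanh(i_n + r * h_n)    # candidate state
        return (1.0 - z) * n + z * h  # blend candidate with previous state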
@@ -0,0 +1,159 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
import math
import paddle.fluid.core as core
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import random
import sys

from op_test import OpTest
sys.path.append("./rnn")
from rnn_numpy import SimpleRNN, LSTM, GRU
from convert import get_params_for_net

random.seed(2)
np.set_printoptions(threshold=np.inf)
paddle.enable_static()


class TestRNNOp(OpTest):
    def get_weight_names(self):
        weight_names = []
        for i in range(self.num_layers):
            for j in range(0, 2 * self.direction_num):
                weight_names.append("{}.weight_{}".format(i, j))
        for i in range(self.num_layers):
            for j in range(0, 2 * self.direction_num):
                weight_names.append("{}.bias_{}".format(i, j))
        return weight_names

    def setUp(self):
        self.op_type = "rnn"
        self.dtype = np.float64
        self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32)
        self.num_layers = 1
        self.is_bidirec = False
        self.mode = "LSTM"
        self.is_test = False
        self.set_attrs()

        self.direction_num = 2 if self.is_bidirec else 1
        direction = "bidirectional" if self.is_bidirec else "forward"
        seq_length = 12
        batch_size = 5
        input_size = 3
        hidden_size = 2

        input = np.random.uniform(
            low=-0.1, high=0.1,
            size=(seq_length, batch_size, input_size)).astype(self.dtype)
        # zero the padding region past each sample's sequence length
        if self.sequence_length is not None:
            input[11][1:][:] = 0
            input[10][2:][:] = 0
            input[9][3:][:] = 0
            input[8][4:][:] = 0

        rnn1 = LSTM(
            input_size,
            hidden_size,
            num_layers=self.num_layers,
            time_major=True,
            direction=direction)

        flat_w = get_params_for_net(rnn1)
        output, (last_hidden, last_cell) = rnn1(
            input, sequence_length=self.sequence_length)

        init_h = np.zeros((self.num_layers * self.direction_num, batch_size,
                           hidden_size)).astype(self.dtype)
        init_c = np.zeros((self.num_layers * self.direction_num, batch_size,
                           hidden_size)).astype(self.dtype)
        # opaque scratch buffer; its values are not checked (see no_check_set)
        state_out = np.ndarray((300)).astype("uint8")

        self.inputs = {
            'Input': input,
            'WeightList': flat_w,
            'PreState': [('init_h', init_h), ('init_c', init_c)],
            'SequenceLength': self.sequence_length
        }
        if self.sequence_length is None:
            self.inputs = {
                'Input': input,
                'WeightList': flat_w,
                'PreState': [('init_h', init_h), ('init_c', init_c)],
            }
        self.attrs = {
            'dropout_prob': 0.0,
            'is_bidirec': self.is_bidirec,
            'input_size': input_size,
            'hidden_size': hidden_size,
            'num_layers': self.num_layers,
            'mode': self.mode,
            'is_test': self.is_test
        }
        self.outputs = {
            'Out': output,
            "State": [('last_hidden', last_hidden), ('last_cell', last_cell)],
            'Reserve': np.ndarray((400)).astype("uint8"),
            'DropoutState': state_out
        }

    def test_output(self):
        self.check_output(no_check_set=['Reserve', 'DropoutState'])

    def set_attrs(self):
        pass

    def test_grad(self):
        if not self.is_test:
            var_name_list = self.get_weight_names()
            grad_check_list = ['Input', 'init_h', 'init_c']
            grad_check_list.extend(var_name_list)
            self.check_grad(
                set(grad_check_list), ['Out', 'last_hidden', 'last_cell'])


class TestRNNOp1(TestRNNOp):
    def set_attrs(self):
        self.sequence_length = None


class TestRNNOp2(TestRNNOp):
    def set_attrs(self):
        self.sequence_length = None
        self.is_bidirec = True


class TestRNNOp3(TestRNNOp):
    def set_attrs(self):
        self.is_test = True
        self.sequence_length = None


class TestRNNOp4(TestRNNOp):
    def set_attrs(self):
        self.is_test = True
        self.sequence_length = None
        self.is_bidirec = True


if __name__ == '__main__':
    unittest.main()
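TestRNNOp validates the LSTM mode of the op against rnn_numpy.LSTM, including gradients through last_cell. A minimal numpy sketch of one LSTM step; the i/f/g/o gate order is an assumption (cuDNN-style layout), not read from this diff:

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_step(x, h, c, w_ih, w_hh, b_ih, b_hh):
        # w_ih: [4 * hidden_size, input_size], w_hh: [4 * hidden_size, hidden_size]
        gates = x @ w_ih.T + b_ih + h @ w_hh.T + b_hh
        i, f, g, o = np.split(gates, 4, axis=-1)
        i, f, o = sigmoid(i), sigmoid(f), sigmoid(o)  # input/forget/output gates
        g = np.tanh(g)                                # cell candidate
        c_new = f * c + i * g                         # new cell state
        h_new = o * np.tanh(c_new)                    # new hidden state
        return h_new, c_new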
@@ -0,0 +1,162 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import numpy as np
import math

from op_test import OpTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import random
import sys
sys.path.append("./rnn")
from rnn_numpy import SimpleRNN
from convert import get_params_for_net

random.seed(2)
np.set_printoptions(threshold=np.inf)
paddle.enable_static()


class TestSimpleRNNOp(OpTest):
    def get_weight_names(self):
        weight_names = []
        for i in range(self.num_layers):
            for j in range(0, 2 * self.direction_num):
                weight_names.append("{}.weight_{}".format(i, j))
        for i in range(self.num_layers):
            for j in range(0, 2 * self.direction_num):
                weight_names.append("{}.bias_{}".format(i, j))
        return weight_names

    def setUp(self):
        self.op_type = "rnn"
        self.dtype = np.float64
        self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32)
        self.num_layers = 1
        self.is_bidirec = False
        self.is_test = False
        self.mode = "RNN_TANH"
        self.dropout = 0.
        self.set_attrs()

        self.direction_num = 2 if self.is_bidirec else 1
        direction = "bidirectional" if self.is_bidirec else "forward"
        seq_length = 12
        batch_size = 5
        input_size = 3
        hidden_size = 2

        input = np.random.uniform(
            low=-0.1, high=0.1,
            size=(seq_length, batch_size, input_size)).astype(self.dtype)
        # zero the padding region past each sample's sequence length
        if self.sequence_length is not None:
            input[11][1:][:] = 0
            input[10][2:][:] = 0
            input[9][3:][:] = 0
            input[8][4:][:] = 0

        rnn1 = SimpleRNN(
            input_size,
            hidden_size,
            num_layers=self.num_layers,
            time_major=True,
            direction=direction,
            dropout=self.dropout,
            nonlinearity=self.mode)

        flat_w = get_params_for_net(rnn1)

        output, last_hidden = rnn1(input, sequence_length=self.sequence_length)

        init_h = np.zeros((self.num_layers * self.direction_num, batch_size,
                           hidden_size)).astype(self.dtype)

        # opaque scratch buffer; its values are not checked (see no_check_set)
        state_out = np.ndarray((300)).astype("uint8")

        self.inputs = {
            'Input': input,
            'WeightList': flat_w,
            'PreState': [('init_h', init_h)],
            'SequenceLength': self.sequence_length
        }
        if self.sequence_length is None:
            self.inputs = {
                'Input': input,
                'WeightList': flat_w,
                'PreState': [('init_h', init_h)]
            }
        self.attrs = {
            'dropout_prob': self.dropout,
            'is_bidirec': self.is_bidirec,
            'input_size': input_size,
            'hidden_size': hidden_size,
            'num_layers': self.num_layers,
            'is_test': self.is_test,
            'mode': self.mode
        }
        self.outputs = {
            'Out': output,
            'State': [('last_hidden', last_hidden)],
            'Reserve': np.ndarray((400)).astype("uint8"),
            'DropoutState': state_out
        }

    def set_attrs(self):
        pass

    def test_output(self):
        self.check_output(no_check_set=['Reserve', 'DropoutState'])

    def test_grad(self):
        if not self.is_test:
            var_name_list = self.get_weight_names()
            grad_check_list = ['Input', 'init_h']
            grad_check_list.extend(var_name_list)
            self.check_grad(set(grad_check_list), ['Out', 'last_hidden'])


class TestSimpleRNNOp1(TestSimpleRNNOp):
    def set_attrs(self):
        self.sequence_length = None


class TestSimpleRNNOp2(TestSimpleRNNOp):
    def set_attrs(self):
        self.sequence_length = None
        self.is_bidirec = True


class TestSimpleRNNOp3(TestSimpleRNNOp):
    def set_attrs(self):
        self.sequence_length = None
        self.is_test = True


class TestSimpleRNNOp4(TestSimpleRNNOp):
    def set_attrs(self):
        self.sequence_length = None
        self.is_bidirec = True
        self.is_test = True


class TestSimpleRNNOp5(TestSimpleRNNOp):
    def set_attrs(self):
        self.mode = "RNN_RELU"


if __name__ == '__main__':
    unittest.main()
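The RNN_TANH / RNN_RELU modes above select the cell nonlinearity. A one-step numpy sketch of the simple RNN cell these tests target (weight layout assumed, as in the other sketches):

    import numpy as np

    def simple_rnn_step(x, h, w_ih, w_hh, b_ih, b_hh, mode="RNN_TANH"):
        # h_t = act(x W_ih^T + b_ih + h W_hh^T + b_hh), act = tanh or relu
        act = np.tanh if mode == "RNN_TANH" else (lambda v: np.maximum(v, 0.0))
        return act(x @ w_ih.T + b_ih + h @ w_hh.T + b_hh)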