commit
1d7954fc3f
@@ -0,0 +1,220 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/gru_op.h"

namespace paddle {
namespace operators {

using framework::Tensor;

class GRUOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Input"),
                   "Input(%s) of GRUOp should not be null.", "Input");
    PADDLE_ENFORCE(ctx->HasInput("Weight"),
                   "Input(%s) of GRUOp should not be null.", "Weight");
    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
                   "Output(%s) of GRUOp should not be null.", "BatchGate");
    PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
                   "Output(%s) of GRUOp should not be null.",
                   "BatchResetHiddenPrev");
    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
                   "Output(%s) of GRUOp should not be null.", "BatchHidden");
    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                   "Output(%s) of GRUOp should not be null.", "Hidden");
    auto input_dims = ctx->GetInputDim("Input");
    auto weight_dims = ctx->GetInputDim("Weight");
    int input_size = input_dims[1];
    int frame_size = weight_dims[0];
    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
                      "The input_size must be 3 times frame_size in GRUOp.");
    PADDLE_ENFORCE_EQ(
        weight_dims[1], frame_size * 3,
        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
    if (ctx->HasInput("H0")) {
      auto h0_dims = ctx->GetInputDim("H0");
      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
                        "The width of H0 must be equal to frame_size.");
    }
    if (ctx->HasInput("Bias")) {
      auto bias_dims = ctx->GetInputDim("Bias");
      int bias_height = bias_dims[0];
      int bias_width = bias_dims[1];
      PADDLE_ENFORCE_EQ(bias_height, 1,
                        "The shape of Bias must be [1, frame_size * 3].");
      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
                        "The shape of Bias must be [1, frame_size * 3].");
    }
    ctx->SetOutputDim("BatchGate", input_dims);
    ctx->SetOutputDim("BatchResetHiddenPrev", {input_dims[0], frame_size});
    ctx->SetOutputDim("BatchHidden", {input_dims[0], frame_size});
    ctx->SetOutputDim("Hidden", {input_dims[0], frame_size});
    ctx->ShareLoD("Input", "Hidden");
  }
};

class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  GRUOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Input",
             "(LoDTensor) The first input is a LoDTensor, which supports "
             "variable-length input sequences. The underlying tensor in "
             "this LoDTensor is a matrix with shape (T x 3D), where T is the "
             "total number of time steps in this mini-batch and D is the "
             "hidden size.");
    AddInput("H0",
             "(Tensor, optional) The initial hidden state is an optional "
             "input. This is a tensor with shape (N x D), where N is the "
             "batch size and D is the hidden size.")
        .AsDispensable();
    AddInput(
        "Weight",
        "(Tensor) The learnable hidden-hidden weight matrix with shape "
        "(D x 3D), where D is the hidden size. The elements continuous in "
        "memory can be divided into two parts. The first part are the "
        "weights of the update gate and reset gate with shape (D x 2D), and "
        "the second part are the weights of the output candidate with shape "
        "(D x D).");
    AddInput("Bias",
             "(Tensor, optional) Bias vector with shape (1 x 3D) "
             "concatenating the bias of the update gate, reset gate and "
             "output candidate.")
        .AsDispensable();
    AddOutput("BatchGate",
              "(LoDTensor) To compute with batches, sequence data will be "
              "reorganized into several successive batches, each containing "
              "data from the same time step. The LoDTensor BatchGate contains "
              "the update gate, reset gate and output candidate values "
              "organized in batches. The LoD size is 2. The first LoD contains "
              "the batch offsets and the second LoD contains the indexes in "
              "the raw sequence data.")
        .AsIntermediate();
    AddOutput(
        "BatchResetHiddenPrev",
        "(LoDTensor) The reset hidden state LoDTensor organized in batches. "
        "This LoDTensor is a matrix with shape (T x D) and has the same LoD "
        "as `BatchGate`.")
        .AsIntermediate();
    AddOutput(
        "BatchHidden",
        "(LoDTensor) The hidden state LoDTensor organized in batches. "
        "This LoDTensor is a matrix with shape (T x D) and has the same LoD "
        "as `BatchGate`.")
        .AsIntermediate();
    AddOutput(
        "Hidden",
        "(LoDTensor) The hidden state LoDTensor organized in sequences. "
        "This LoDTensor is a matrix with shape (T x D) and has the same LoD "
        "as `BatchGate`.");
    AddAttr<std::string>("activation",
                         "(string, default tanh) "
                         "The activation type used for the output candidate "
                         "{h}_t.")
        .SetDefault("tanh");
    AddAttr<std::string>(
        "gate_activation",
        "(string, default sigmoid) "
        "The activation type used in the update gate and reset gate.")
        .SetDefault("sigmoid");
    AddAttr<bool>("is_reverse",
                  "(bool, default: False) "
                  "whether to compute the reversed GRU.")
        .SetDefault(false);
    AddComment(R"DOC(
GRU Operator implements part of the calculations of the complete GRU as follows:

\f[
update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\
output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
\f]

@note To implement the complete GRU, a fully-connected operator must be used
beforehand to feed xu, xr and xc as the Input of the GRU operator.
)DOC");
  }
};

class GRUGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Input"),
                   "Input(%s) of GRUGradOp should not be null.", "Input");
    PADDLE_ENFORCE(ctx->HasInput("Weight"),
                   "Input(%s) of GRUGradOp should not be null.", "Weight");
    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
                   "Input(%s) of GRUGradOp should not be null.", "BatchGate");
    PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"),
                   "Input(%s) of GRUGradOp should not be null.",
                   "BatchResetHiddenPrev");
    PADDLE_ENFORCE(ctx->HasInput("BatchHidden"),
                   "Input(%s) of GRUGradOp should not be null.", "BatchHidden");
    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
                   "Input(%s) of GRUGradOp should not be null.", "Hidden");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
                   "Input(%s@GRAD) of GRUGradOp should not be null.", "Hidden");
    auto input_dims = ctx->GetInputDim("Input");
    auto weight_dims = ctx->GetInputDim("Weight");
    int input_size = input_dims[1];
    int frame_size = weight_dims[0];
    int weight_height = weight_dims[0];
    int weight_width = weight_dims[1];
    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
                      "The input_size must be 3 times frame_size in GRUOp.");
    PADDLE_ENFORCE_EQ(
        weight_height, frame_size,
        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
    PADDLE_ENFORCE_EQ(
        weight_width, frame_size * 3,
        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
    if (ctx->HasInput("H0")) {
      auto h0_dims = ctx->GetInputDim("H0");
      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
                        "The width of H0 must be equal to frame_size.");
      auto h0_grad_name = framework::GradVarName("H0");
      if (ctx->HasOutput(h0_grad_name))
        ctx->SetOutputDim(h0_grad_name, h0_dims);
    }
    if (ctx->HasInput("Bias")) {
      auto bias_dims = ctx->GetInputDim("Bias");
      int bias_height = bias_dims[0];
      int bias_width = bias_dims[1];
      PADDLE_ENFORCE_EQ(bias_height, 1,
                        "The shape of Bias must be [1, frame_size * 3].");
      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
                        "The shape of Bias must be [1, frame_size * 3].");
      auto bias_grad_name = framework::GradVarName("Bias");
      if (ctx->HasOutput(bias_grad_name))
        ctx->SetOutputDim(bias_grad_name, bias_dims);
    }
    auto input_grad_name = framework::GradVarName("Input");
    if (ctx->HasOutput(input_grad_name))
      ctx->SetOutputDim(input_grad_name, input_dims);
    auto weight_grad_name = framework::GradVarName("Weight");
    if (ctx->HasOutput(weight_grad_name))
      ctx->SetOutputDim(weight_grad_name, weight_dims);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
REGISTER_OP_CPU_KERNEL(gru, ops::GRUKernel<paddle::platform::CPUPlace, float>,
                       ops::GRUKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(gru_grad,
                       ops::GRUGradKernel<paddle::platform::CPUPlace, float>,
                       ops::GRUGradKernel<paddle::platform::CPUPlace, double>);
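For reference, a minimal NumPy sketch (not part of this commit) of the single step described by the \f[...\f] equations in the GRUOp comment above. It assumes the gate activation is sigmoid and the candidate activation is tanh, that x_t already holds the projected inputs xu|xr|xc produced by a preceding fully-connected layer, and that the (D x 3D) Weight is laid out as in the commit: the first 2*D*D contiguous elements form the update/reset block and the rest the candidate block.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x_t, h_prev, weight, bias):
    # x_t: (N, 3D) pre-projected input; h_prev: (N, D); weight: (D, 3D); bias: (1, 3D)
    D = h_prev.shape[1]
    w_ur = weight.flatten()[:2 * D * D].reshape(D, 2 * D)  # update/reset block
    w_c = weight.flatten()[2 * D * D:].reshape(D, D)       # candidate block
    g = x_t + bias
    ur = sigmoid(g[:, :2 * D] + h_prev.dot(w_ur))
    u, r = ur[:, :D], ur[:, D:]                            # update gate, reset gate
    c = np.tanh(g[:, 2 * D:] + (r * h_prev).dot(w_c))      # output candidate {h}_t
    return u * c + (1.0 - u) * h_prev                      # h_t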
@@ -0,0 +1,23 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#define EIGEN_USE_GPU
#include "paddle/operators/gru_op.h"

namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(gru, ops::GRUKernel<paddle::platform::GPUPlace, float>,
                       ops::GRUKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(gru_grad,
                       ops::GRUGradKernel<paddle::platform::GPUPlace, float>,
                       ops::GRUGradKernel<paddle::platform::GPUPlace, double>);
@@ -0,0 +1,231 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/operators/math/gru_compute.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence2batch.h"

#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;

template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;

template <typename Place, typename T>
class GRUKernel : public framework::OpKernel<T> {
 public:
  void BatchCompute(const framework::ExecutionContext& context) const {
    auto* input = context.Input<LoDTensor>("Input");
    auto* h0 = context.Input<Tensor>("H0");
    const T* h0_data = h0 ? h0->data<T>() : nullptr;
    auto* weight = context.Input<Tensor>("Weight");
    const T* weight_data = weight->data<T>();
    auto* bias = context.Input<Tensor>("Bias");
    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
    batch_gate->mutable_data<T>(context.GetPlace());
    auto* batch_reset_hidden_prev =
        context.Output<LoDTensor>("BatchResetHiddenPrev");
    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
    batch_hidden->mutable_data<T>(context.GetPlace());
    auto* hidden = context.Output<LoDTensor>("Hidden");
    hidden->mutable_data<T>(context.GetPlace());

    context.ShareLoD("Input", "Hidden");

    auto hidden_dims = hidden->dims();

    bool is_reverse = context.Attr<bool>("is_reverse");
    math::LoDTensor2BatchFunctor<Place, T> to_batch;
    to_batch(context.device_context(), *input, *batch_gate, true, is_reverse);

    int frame_size = hidden_dims[1];
    int batch_size = hidden_dims[0];
    auto g = EigenMatrix<T>::From(*batch_gate);
    auto place = context.GetEigenDevice<Place>();
    if (bias) {
      auto b = EigenMatrix<T>::From(*bias);
      g.device(place) = g +
                        b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
                            .broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
    }

    math::hl_gru_value<T> gru_value;
    gru_value.gateWeight = const_cast<T*>(weight_data);
    gru_value.stateWeight =
        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
    gru_value.prevOutValue = const_cast<T*>(h0_data);
    auto batch_starts = batch_gate->lod()[0];
    size_t num_batch = batch_starts.size() - 1;
    for (size_t n = 0; n < num_batch; n++) {
      int bstart = static_cast<int>(batch_starts[n]);
      int bend = static_cast<int>(batch_starts[n + 1]);
      int cur_batch_size = bend - bstart;

      Tensor gate_t = batch_gate->Slice(bstart, bend);
      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
      gru_value.outputValue = hidden_t.data<T>();
      gru_value.gateValue = gate_t.data<T>();
      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
      math::GRUUnitFunctor<Place, T>::compute(
          context.device_context(), gru_value, frame_size, cur_batch_size,
          math::ActiveType(context.Attr<std::string>("activation")),
          math::ActiveType(context.Attr<std::string>("gate_activation")));
      gru_value.prevOutValue = gru_value.outputValue;
    }

    math::Batch2LoDTensorFunctor<Place, T> to_seq;
    batch_hidden->set_lod(batch_gate->lod());
    to_seq(context.device_context(), *batch_hidden, *hidden);
  }

  void Compute(const framework::ExecutionContext& context) const override {
    BatchCompute(context);
  }
};

template <typename Place, typename T>
class GRUGradKernel : public framework::OpKernel<T> {
 public:
  void BatchCompute(const framework::ExecutionContext& context) const {
    auto* h0 = context.Input<Tensor>("H0");
    const T* h0_data = h0 ? h0->data<T>() : nullptr;
    auto* weight = context.Input<Tensor>("Weight");
    const T* weight_data = weight->data<T>();
    auto* batch_gate = context.Input<LoDTensor>("BatchGate");
    auto* batch_reset_hidden_prev =
        context.Input<LoDTensor>("BatchResetHiddenPrev");
    auto* batch_hidden = context.Input<LoDTensor>("BatchHidden");
    auto* hidden = context.Input<LoDTensor>("Hidden");
    auto* hidden_grad =
        context.Input<LoDTensor>(framework::GradVarName("Hidden"));
    auto* input_grad =
        context.Output<LoDTensor>(framework::GradVarName("Input"));
    auto* h0_grad = context.Output<Tensor>(framework::GradVarName("H0"));
    auto* weight_grad =
        context.Output<Tensor>(framework::GradVarName("Weight"));
    auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));

    auto gate_dims = batch_gate->dims();
    auto hidden_dims = hidden->dims();
    int frame_size = hidden_dims[1];

    math::LoDTensor2BatchFunctor<Place, T> to_batch;
    LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad;
    batch_hidden_grad.mutable_data<T>(hidden_dims, context.GetPlace());
    batch_gate_grad.mutable_data<T>(gate_dims, context.GetPlace());
    batch_reset_hidden_prev_grad.mutable_data<T>(hidden_dims,
                                                 context.GetPlace());
    math::SetConstant<Place, T> zero;
    zero(context.device_context(), &batch_hidden_grad, static_cast<T>(0.0));
    zero(context.device_context(), &batch_gate_grad, static_cast<T>(0.0));
    zero(context.device_context(), &batch_reset_hidden_prev_grad,
         static_cast<T>(0.0));

    bool is_reverse = context.Attr<bool>("is_reverse");
    batch_hidden_grad.set_lod(batch_hidden->lod());
    to_batch(context.device_context(), *hidden_grad, batch_hidden_grad, false,
             is_reverse);

    math::hl_gru_value<T> gru_value;
    gru_value.gateWeight = const_cast<T*>(weight_data);
    gru_value.stateWeight =
        const_cast<T*>(weight_data + 2 * frame_size * frame_size);

    math::hl_gru_grad<T> gru_grad;
    if (weight_grad) {
      gru_grad.gateWeightGrad =
          weight_grad->mutable_data<T>(context.GetPlace());
      zero(context.device_context(), weight_grad, static_cast<T>(0.0));
      gru_grad.stateWeightGrad =
          weight_grad->data<T>() + 2 * frame_size * frame_size;
    } else {
      gru_grad.gateWeightGrad = nullptr;
      gru_grad.stateWeightGrad = nullptr;
    }

    auto batch_starts = batch_hidden_grad.lod()[0];
    size_t num_batch = batch_starts.size() - 1;
    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
      int bstart = static_cast<int>(batch_starts[n]);
      int bend = static_cast<int>(batch_starts[n + 1]);
      int cur_batch_size = bend - bstart;

      Tensor gate_t = batch_gate->Slice(bstart, bend);
      gru_value.gateValue = gate_t.data<T>();
      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();

      Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
      gru_grad.outputGrad = hidden_grad_t.data<T>();
      Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
      gru_grad.gateGrad = gate_grad_t.data<T>();
      Tensor reset_hidden_prev_grad_t =
          batch_reset_hidden_prev_grad.Slice(bstart, bend);
      gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>();
      if (n == 0) {
        gru_value.prevOutValue = const_cast<T*>(h0_data);
        if (h0_grad) {
          T* h0_grad_data = h0_grad->mutable_data<T>(context.GetPlace());
          zero(context.device_context(), h0_grad, static_cast<T>(0.0));
          gru_grad.prevOutGrad = h0_grad_data;
        } else {
          gru_grad.prevOutGrad = nullptr;
        }
      } else {
        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
        Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
        gru_value.prevOutValue = hidden_prev_t.data<T>();
        Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
        gru_grad.prevOutGrad = hidden_prev_grad_t.data<T>();
      }

      math::GRUUnitGradFunctor<Place, T>::compute(
          context.device_context(), gru_value, gru_grad, frame_size,
          cur_batch_size,
          math::ActiveType(context.Attr<std::string>("activation")),
          math::ActiveType(context.Attr<std::string>("gate_activation")));
    }
    if (input_grad) {
      input_grad->mutable_data<T>(context.GetPlace());
      math::Batch2LoDTensorFunctor<Place, T> to_seq;
      batch_gate_grad.set_lod(batch_gate->lod());
      to_seq(context.device_context(), batch_gate_grad, *input_grad);
    }
    if (bias_grad) {
      bias_grad->mutable_data<T>(context.GetPlace());
      auto d_b = EigenMatrix<T>::From(*bias_grad);
      auto d_g = EigenMatrix<T>::From(batch_gate_grad);
      auto place = context.GetEigenDevice<Place>();
      d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
    }
  }

  void Compute(const framework::ExecutionContext& context) const override {
    BatchCompute(context);
  }
};

}  // namespace operators
}  // namespace paddle
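The GRUKernel and GRUGradKernel above lean on math::LoDTensor2BatchFunctor (defined in sequence2batch.h, not shown in this commit) to regroup the variable-length sequences into per-time-step batches before each GRUUnitFunctor call. As a rough illustration only, mirroring seq_to_batch in test_gru_op.py at the end of this commit: sequences are visited longest first, and batch k collects the k-th element of every sequence that still has one.

def seq_to_batch(lod, is_reverse=False):
    # lod[0] holds the sequence start offsets in the raw input, e.g. [0, 2, 6, 9].
    seq_starts = lod[0]
    seq_lens = [seq_starts[i + 1] - seq_starts[i] for i in range(len(seq_starts) - 1)]
    order = sorted(range(len(seq_lens)), key=lambda i: -seq_lens[i])  # longest first
    idx_in_seq_list = []
    for step in range(seq_lens[order[0]]):
        idx_in_seq = []
        for i in order:
            if seq_lens[i] <= step:
                break  # remaining sequences are shorter; this batch is complete
            idx = (seq_starts[i + 1] - 1 - step) if is_reverse else (seq_starts[i] + step)
            idx_in_seq.append(idx)
        idx_in_seq_list.append(idx_in_seq)
    return idx_in_seq_list  # idx_in_seq_list[k]: rows of the raw input used at step k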
File diff suppressed because it is too large
@@ -0,0 +1,203 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <type_traits>
#include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/operators/math/gru_compute.h"
#include "paddle/platform/cuda_helper.h"
#include "paddle/platform/device_context.h"

#include <glog/logging.h>

namespace paddle {
namespace operators {
namespace math {
namespace detail {

/*
 * threads(framePerBlock, batchPerBlock)
 * grid(frameBlocks, batchBlocks)
 */
template <class OpResetOutput, bool isBatch, typename T>
__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
                                        T *gateValue, T *resetOutputValue,
                                        T *prevOutputValue, int frameSize,
                                        int batchSize,
                                        activation_mode_t active_gate) {
  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frameIdx >= frameSize) return;

  int batchIdx = 0;
  if (isBatch) {
    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
    if (batchIdx >= batchSize) return;
    gateValue += batchIdx * 3 * frameSize;
    resetOutputValue += batchIdx * frameSize;
  }

  T rPrevOut = 0;
  T rValueResetOutput;
  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
  T rValueResetGate = gateValue[frameIdx + frameSize * 1];

  if (prevOutputValue) {
    if (isBatch) prevOutputValue += batchIdx * frameSize;
    rPrevOut = prevOutputValue[frameIdx];
  }

  opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput,
                active_gate);

  gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
  gateValue[frameIdx + frameSize * 1] = rValueResetGate;
  resetOutputValue[frameIdx] = rValueResetOutput;
}

/*
 * threads(framePerBlock, batchPerBlock)
 * grid(frameBlocks, batchBlocks)
 */
template <class OpFinalOutput, bool isBatch, typename T>
__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
                                        T *gateValue, T *prevOutputValue,
                                        T *outputValue, int frameSize,
                                        int batchSize,
                                        activation_mode_t active_node) {
  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frameIdx >= frameSize) return;
  int batchIdx = 0;
  if (isBatch) {
    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
    if (batchIdx >= batchSize) return;
    gateValue += batchIdx * 3 * frameSize;
    outputValue += batchIdx * frameSize;
  }

  T rOutput;
  T rPrevOut = 0;
  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
  T rValueFrameState = gateValue[frameIdx + frameSize * 2];

  if (prevOutputValue) {
    if (isBatch) prevOutputValue += batchIdx * frameSize;
    rPrevOut = prevOutputValue[frameIdx];
  }

  opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
                active_node);

  gateValue[frameIdx + frameSize * 2] = rValueFrameState;
  outputValue[frameIdx] = rOutput;
}

/*
 * threads(framePerBlock, batchPerBlock)
 * grid(frameBlocks, batchBlocks)
 */
template <class OpStateGrad, bool isBatch, typename T>
__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue,
                                       T *gateGrad, T *prevOutValue,
                                       T *prevOutGrad, T *outputGrad,
                                       int frameSize, int batchSize,
                                       activation_mode_t active_node) {
  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frameIdx >= frameSize) return;
  int batchIdx = 0;
  if (isBatch) {
    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
    if (batchIdx >= batchSize) return;
    gateValue += batchIdx * 3 * frameSize;
    gateGrad += batchIdx * 3 * frameSize;
    outputGrad += batchIdx * frameSize;
  }

  T rUpdateGateGrad;
  T rFrameStateGrad;
  T rPrevOutValue = 0;
  T rPrevOutGrad = 0;
  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
  T rFrameStateValue = gateValue[frameIdx + frameSize * 2];
  T rOutGrad = outputGrad[frameIdx];

  if (prevOutValue && prevOutGrad) {
    if (isBatch) prevOutValue += batchIdx * frameSize;
    rPrevOutValue = prevOutValue[frameIdx];

    if (isBatch) prevOutGrad += batchIdx * frameSize;
    rPrevOutGrad = prevOutGrad[frameIdx];
  }

  opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
              rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
              active_node);

  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
  gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
  if (prevOutGrad) {
    prevOutGrad[frameIdx] = rPrevOutGrad;
  }
}

/*
 * threads(framePerBlock, batchPerBlock)
 * grid(frameBlocks, batchBlocks)
 */
template <class OpResetGrad, bool isBatch, typename T>
__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue,
                                       T *gateGrad, T *prevOutValue,
                                       T *prevOutGrad, T *resetOutputGrad,
                                       int frameSize, int batchSize,
                                       activation_mode_t active_gate) {
  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frameIdx >= frameSize) return;
  int batchIdx = 0;
  if (isBatch) {
    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
    if (batchIdx >= batchSize) return;
    gateValue += batchIdx * 3 * frameSize;
    gateGrad += batchIdx * 3 * frameSize;
    resetOutputGrad += batchIdx * frameSize;
  }

  T rResetGateGrad;
  T rPrevOutValue = 0;
  T rPrevOutGrad = 0;
  T rResetOutputGrad = 0;
  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
  T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0];
  T rResetGateValue = gateValue[frameIdx + frameSize * 1];

  if (prevOutValue && prevOutGrad) {
    if (isBatch) prevOutValue += batchIdx * frameSize;
    if (isBatch) prevOutGrad += batchIdx * frameSize;
    rPrevOutValue = prevOutValue[frameIdx];
    rPrevOutGrad = prevOutGrad[frameIdx];
    rResetOutputGrad = resetOutputGrad[frameIdx];
  }

  opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
              rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
              active_gate);

  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
  gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
  if (prevOutGrad) {
    prevOutGrad[frameIdx] = rPrevOutGrad;
  }
}
}  // namespace detail
}  // namespace math
}  // namespace operators
}  // namespace paddle
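The threads(framePerBlock, batchPerBlock) / grid(frameBlocks, batchBlocks) comments above refer to the launch configuration chosen by GRUUnitFunctor<GPUPlace, T> in gru_compute.cu later in this commit: one thread handles one (frame element, batch row) pair. A small sketch of that choice, for illustration only:

def gru_launch_config(frame_size, batch_size):
    # Mirrors the dim3 threads/grid computed in gru_compute.cu.
    if batch_size == 1:
        threads = (min(frame_size, 1024), 1)          # framePerBlock, batchPerBlock
        grid = ((frame_size + 1024 - 1) // 1024, 1)   # frameBlocks, batchBlocks
    else:
        threads = (32, 32)
        grid = ((frame_size + 32 - 1) // 32, (batch_size + 32 - 1) // 32)
    return threads, grid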
@@ -0,0 +1,155 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/platform/hostdevice.h"

#include <type_traits>

// TODO(guosheng): refine code style in gru_kernel
namespace paddle {
namespace operators {
namespace math {
namespace detail {

namespace forward {

template <typename T>
class gru_resetOutput {
 public:
  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut,
                             T &valueResetOutput, activation_mode_t actGate) {
    valueUpdateGate = activation(valueUpdateGate, actGate);
    valueResetGate = activation(valueResetGate, actGate);
    valueResetOutput = prevOut * valueResetGate;
  }
#ifndef __NVCC__
#ifndef __AVX__
  static const bool avx = false;
#else
  static const bool avx = true;
  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate,
                             __m256 &prevOut, __m256 &valueResetOutput,
                             activation_mode_t actGate) {
    valueUpdateGate = activation(valueUpdateGate, actGate);
    valueResetGate = activation(valueResetGate, actGate);
    valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate);
  }
#endif
#endif
};

template <typename T>
class gru_finalOutput {
 public:
  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut,
                             T &valueOutput, activation_mode_t actInput) {
    valueFrameState = activation(valueFrameState, actInput);
    valueOutput = prevOut - (valueUpdateGate * prevOut) +
                  (valueUpdateGate * valueFrameState);
  }
#ifndef __NVCC__
#ifndef __AVX__
  static const bool avx = false;
#else
  static const bool avx = true;
  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState,
                             __m256 &prevOut, __m256 &valueOutput,
                             activation_mode_t actInput) {
    valueFrameState = activation(valueFrameState, actInput);
    valueOutput = _mm256_add_ps(
        _mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)),
        _mm256_mul_ps(valueUpdateGate, valueFrameState));
  }
#endif
#endif
};
}  // namespace forward

namespace backward {

template <typename T>
class gru_stateGrad {
 public:
  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
                             T &valueFrameState, T &gradFrameState,
                             T &valuePrevOut, T &gradPrevOut, T &gradOutput,
                             activation_mode_t actInput) {
    gradUpdateGate = (gradOutput * valueFrameState);
    gradUpdateGate -= (gradOutput * valuePrevOut);
    gradPrevOut -= (gradOutput * valueUpdateGate);
    gradPrevOut += gradOutput;
    gradFrameState =
        activation(gradOutput * valueUpdateGate, valueFrameState, actInput);
  }
#ifndef __NVCC__
#ifndef __AVX__
  static const bool avx = false;
#else
  static const bool avx = true;
  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
                             __m256 &valueFrameState, __m256 &gradFrameState,
                             __m256 &valuePrevOut, __m256 &gradPrevOut,
                             __m256 &gradOutput, activation_mode_t actInput) {
    gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState);
    gradUpdateGate =
        _mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut));
    gradPrevOut = _mm256_add_ps(
        _mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)),
        gradOutput);
    gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate),
                                valueFrameState, actInput);
  }
#endif
#endif
};

template <typename T>
class gru_resetGrad {
 public:
  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
                             T &valueResetGate, T &gradResetGate,
                             T &valuePrevOut, T &gradPrevOut,
                             T &gradResetOutput, activation_mode_t actGate) {
    gradResetGate = (gradResetOutput * valuePrevOut);
    gradPrevOut += (gradResetOutput * valueResetGate);
    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
  }
#ifndef __NVCC__
#ifndef __AVX__
  static const bool avx = false;
#else
  static const bool avx = true;
  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
                             __m256 &valueResetGate, __m256 &gradResetGate,
                             __m256 &valuePrevOut, __m256 &gradPrevOut,
                             __m256 &gradResetOutput,
                             activation_mode_t actGate) {
    gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut);
    gradPrevOut = _mm256_add_ps(
        gradPrevOut, _mm256_mul_ps(gradResetOutput, valueResetGate));
    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
  }
#endif
#endif
};

}  // namespace backward

}  // namespace detail
}  // namespace math
}  // namespace operators
}  // namespace paddle
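The elementwise gradients implemented by gru_stateGrad and gru_resetGrad above can be read off the forward equations in the GRUOp comment. Writing the output as h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t) and the reset output as s_t = dot(r_t, h_{t-1}), and with the activation(...) calls folding in the activation derivatives, the chain rule gives (informal note, same dot() notation as the operator doc):

\f[
grad\_u_t = dot(grad\_h_t, ({h}_t - h_{t-1})) \\
grad\_{h}_t = dot(grad\_h_t, u_t) \\
grad\_h_{t-1} += dot(grad\_h_t, (1 - u_t)) \\
grad\_r_t = dot(grad\_s_t, h_{t-1}) \\
grad\_h_{t-1} += dot(grad\_s_t, r_t)
\f]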
@@ -0,0 +1,102 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/math/gru_compute.h"
#include "paddle/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/operators/math/detail/gru_kernel.h"
#include "paddle/operators/math/math_function.h"

namespace paddle {
namespace operators {
namespace math {

template <typename T>
struct GRUUnitFunctor<platform::CPUPlace, T> {
  static void compute(const platform::DeviceContext &context,
                      hl_gru_value<T> value, int frameSize, int batchSize,
                      activation_mode_t active_node,
                      activation_mode_t active_gate) {
#ifndef __NVCC__
    if (value.prevOutValue) {
      math::gemm<platform::CPUPlace, T>(
          context, false, false, batchSize, frameSize * 2, frameSize, 1,
          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
          value.gateValue, frameSize * 3);
    }

    detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
                                 frameSize, batchSize, active_gate);

    if (value.prevOutValue) {
      math::gemm<platform::CPUPlace, T>(
          context, false, false, batchSize, frameSize, frameSize, 1,
          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
          value.gateValue + frameSize * 2, frameSize * 3);
    }

    detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
                                 frameSize, batchSize, active_node);
#endif
  }
};

template <typename T>
struct GRUUnitGradFunctor<platform::CPUPlace, T> {
  static void compute(const platform::DeviceContext &context,
                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
                      int batchSize, activation_mode_t active_node,
                      activation_mode_t active_gate) {
#ifndef __NVCC__
    detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
                                grad, frameSize, batchSize, active_node);

    if (value.prevOutValue && grad.prevOutGrad) {
      math::gemm<platform::CPUPlace, T>(
          context, false, true, batchSize, frameSize, frameSize, 1,
          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
          frameSize, 0, grad.resetOutputGrad, frameSize);

      if (grad.stateWeightGrad) {
        math::gemm<platform::CPUPlace, T>(
            context, true, false, frameSize, frameSize, batchSize, 1,
            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
      }
    }

    detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
                                grad, frameSize, batchSize, active_gate);

    if (grad.prevOutGrad && value.prevOutValue) {
      math::gemm<platform::CPUPlace, T>(
          context, false, true, batchSize, frameSize, frameSize * 2, 1,
          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
          grad.prevOutGrad, frameSize);

      if (grad.gateWeightGrad) {
        math::gemm<platform::CPUPlace, T>(
            context, true, false, frameSize, frameSize * 2, batchSize, 1,
            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
            grad.gateWeightGrad, frameSize * 2);
      }
    }
#endif
  }
};

template struct GRUUnitFunctor<platform::CPUPlace, float>;
template struct GRUUnitFunctor<platform::CPUPlace, double>;
template struct GRUUnitGradFunctor<platform::CPUPlace, float>;
template struct GRUUnitGradFunctor<platform::CPUPlace, double>;

}  // namespace math
}  // namespace operators
}  // namespace paddle
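Put differently, the CPU forward path above is two small GEMMs interleaved with the elementwise kernels: h_{t-1} * W_{u,r} is accumulated into the first 2*frameSize gate columns before forward_reset_output runs, and dot(r_t, h_{t-1}) * W_c is accumulated into the candidate column before forward_final_output. A NumPy sketch of the same decomposition (illustration only, not part of the commit, assuming sigmoid gates and a tanh candidate):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_unit_forward(gate_value, prev_out, gate_weight, state_weight):
    # gate_value: (N, 3D) already holding xu|xr|xc (+ bias); prev_out: (N, D)
    # gate_weight: (D, 2D) update/reset block; state_weight: (D, D) candidate block
    D = prev_out.shape[1]
    gate_value[:, :2 * D] += prev_out.dot(gate_weight)       # GEMM 1
    u = sigmoid(gate_value[:, :D])                           # update gate
    r = sigmoid(gate_value[:, D:2 * D])                      # reset gate
    reset_output = r * prev_out                              # resetOutputValue
    gate_value[:, 2 * D:] += reset_output.dot(state_weight)  # GEMM 2
    c = np.tanh(gate_value[:, 2 * D:])                       # output candidate
    return reset_output, u * c + (1.0 - u) * prev_out        # h_t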
@@ -0,0 +1,178 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/math/detail/gru_gpu_kernel.h"
#include "paddle/operators/math/detail/gru_kernel.h"
#include "paddle/operators/math/gru_compute.h"
#include "paddle/operators/math/math_function.h"

namespace paddle {
namespace operators {
namespace math {

template <typename T>
struct GRUUnitFunctor<platform::GPUPlace, T> {
  static void compute(const platform::DeviceContext &context,
                      hl_gru_value<T> value, int frameSize, int batchSize,
                      activation_mode_t active_node,
                      activation_mode_t active_gate) {
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
    dim3 threads;
    dim3 grid;
    if (batchSize == 1) {
      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
      int frameBlocks = (frameSize + 1024 - 1) / 1024;
      threads = dim3(framePerBlock, 1);
      grid = dim3(frameBlocks, 1);
    } else {
      threads = dim3(32, 32);
      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
    }

    if (value.prevOutValue) {
      math::gemm<platform::GPUPlace, T>(
          context, false, false, batchSize, frameSize * 2, frameSize, 1,
          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
          value.gateValue, frameSize * 3);
    }

    if (batchSize == 1) {
      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
                                      /* isBatch= */ false,
                                      T><<<grid, threads, 0, stream>>>(
          detail::forward::gru_resetOutput<T>(), value.gateValue,
          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
          active_gate);
    } else {
      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
                                      /* isBatch= */ true,
                                      T><<<grid, threads, 0, stream>>>(
          detail::forward::gru_resetOutput<T>(), value.gateValue,
          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
          active_gate);
    }

    if (value.prevOutValue) {
      math::gemm<platform::GPUPlace, T>(
          context, false, false, batchSize, frameSize, frameSize, 1,
          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
          value.gateValue + frameSize * 2, frameSize * 3);
    }

    if (batchSize == 1) {
      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
                                      /* isBatch= */ false,
                                      T><<<grid, threads, 0, stream>>>(
          detail::forward::gru_finalOutput<T>(), value.gateValue,
          value.prevOutValue, value.outputValue, frameSize, batchSize,
          active_node);
    } else {
      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
                                      /* isBatch= */ true,
                                      T><<<grid, threads, 0, stream>>>(
          detail::forward::gru_finalOutput<T>(), value.gateValue,
          value.prevOutValue, value.outputValue, frameSize, batchSize,
          active_node);
    }
  }
};

template <typename T>
struct GRUUnitGradFunctor<platform::GPUPlace, T> {
  static void compute(const platform::DeviceContext &context,
                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
                      int batchSize, activation_mode_t active_node,
                      activation_mode_t active_gate) {
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
    dim3 threads;
    dim3 grid;
    if (batchSize == 1) {
      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
      int frameBlocks = (frameSize + 1024 - 1) / 1024;
      threads = dim3(framePerBlock, 1);
      grid = dim3(frameBlocks, 1);
    } else {
      threads = dim3(32, 32);
      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
    }

    if (batchSize == 1) {
      detail::KeGruBackwardStateGrad<
          detail::backward::gru_stateGrad<T>,
          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
          batchSize, active_node);
    } else {
      detail::KeGruBackwardStateGrad<
          detail::backward::gru_stateGrad<T>,
          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
          batchSize, active_node);
    }

    if (value.prevOutValue && grad.prevOutGrad) {
      math::gemm<platform::GPUPlace, T>(
          context, false, true, batchSize, frameSize, frameSize, 1,
          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
          frameSize, 0, grad.resetOutputGrad, frameSize);

      if (grad.stateWeightGrad) {
        math::gemm<platform::GPUPlace, T>(
            context, true, false, frameSize, frameSize, batchSize, 1,
            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
      }
    }

    if (batchSize == 1) {
      detail::KeGruBackwardResetGrad<
          detail::backward::gru_resetGrad<T>,
          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
          batchSize, active_gate);
    } else {
      detail::KeGruBackwardResetGrad<
          detail::backward::gru_resetGrad<T>,
          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
          batchSize, active_gate);
    }

    if (grad.prevOutGrad && value.prevOutValue) {
      math::gemm<platform::GPUPlace, T>(
          context, false, true, batchSize, frameSize, frameSize * 2, 1,
          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
          grad.prevOutGrad, frameSize);

      if (grad.gateWeightGrad) {
        math::gemm<platform::GPUPlace, T>(
            context, true, false, frameSize, frameSize * 2, batchSize, 1,
            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
            grad.gateWeightGrad, frameSize * 2);
      }
    }
  }
};

template struct GRUUnitFunctor<platform::GPUPlace, float>;
template struct GRUUnitFunctor<platform::GPUPlace, double>;
template struct GRUUnitGradFunctor<platform::GPUPlace, float>;
template struct GRUUnitGradFunctor<platform::GPUPlace, double>;

}  // namespace math
}  // namespace operators
}  // namespace paddle
@@ -0,0 +1,61 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/operators/math/lstm_compute.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"

namespace paddle {
namespace operators {
namespace math {

// TODO(guosheng): refine code style in gru_compute
template <typename T>
struct hl_gru_value {
  T *gateWeight;
  T *stateWeight;
  T *gateValue;
  T *resetOutputValue;
  T *outputValue;
  T *prevOutValue;
};

template <typename T>
struct hl_gru_grad {
  T *gateWeightGrad;
  T *stateWeightGrad;
  T *gateGrad;
  T *resetOutputGrad;
  T *outputGrad;
  T *prevOutGrad;
};

template <typename Place, typename T>
struct GRUUnitFunctor {
  static void compute(const platform::DeviceContext &context,
                      hl_gru_value<T> value, int frameSize, int batchSize,
                      activation_mode_t active_node,
                      activation_mode_t active_gate);
};

template <typename Place, typename T>
struct GRUUnitGradFunctor {
  static void compute(const platform::DeviceContext &context,
                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
                      int batchSize, activation_mode_t active_node,
                      activation_mode_t active_gate);
};

}  // namespace math
}  // namespace operators
}  // namespace paddle
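The two weight pointers in hl_gru_value (and their counterparts in hl_gru_grad) are views into the single (D x 3D) Weight tensor: as gru_op.h sets them up, gateWeight points at the first 2*D*D elements (the update/reset block, used as a D x 2D matrix) and stateWeight at the remaining D*D elements (the candidate block). A small NumPy sketch of that split, for illustration only:

import numpy as np

def split_gru_weight(weight, frame_size):
    # weight: the (frame_size, 3 * frame_size) Weight tensor of the GRU op
    flat = np.asarray(weight).reshape(-1)
    gate_weight = flat[:2 * frame_size * frame_size].reshape(
        frame_size, 2 * frame_size)   # corresponds to gateWeight
    state_weight = flat[2 * frame_size * frame_size:].reshape(
        frame_size, frame_size)       # corresponds to stateWeight
    return gate_weight, state_weight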
@@ -0,0 +1,156 @@
import unittest
import numpy as np
import math
from op_test import OpTest
from test_lstm_op import identity, sigmoid, tanh, relu


class TestGRUOp(OpTest):
    batch_size = 9
    frame_size = 5
    activate = {
        'identity': identity,
        'sigmoid': sigmoid,
        'tanh': tanh,
        'relu': relu
    }

    @staticmethod
    def seq_to_batch(lod, is_reverse):
        idx_in_seq_list = []
        seq_starts = lod[0]
        seq_lens = []
        for i in range(len(seq_starts) - 1):
            seq_lens.append(seq_starts[i + 1] - seq_starts[i])
        sorted_seqs = sorted(
            range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x])
        num_batch = seq_lens[sorted_seqs[0]]
        for batch_idx in range(num_batch):
            idx_in_seq = []
            for i in range(len(seq_lens)):
                if seq_lens[sorted_seqs[i]] <= batch_idx:
                    break
                idx = (seq_starts[sorted_seqs[i] + 1] - 1 - batch_idx
                       ) if is_reverse else (
                           seq_starts[sorted_seqs[i]] + batch_idx)
                idx_in_seq.append(idx)
            idx_in_seq_list.append(idx_in_seq)
        return idx_in_seq_list

    def gru_step(self, x, h_p, w, b):
        batch_size = x.shape[0]
        frame_size = w.shape[0]
        g = x + np.tile(b, (batch_size, 1))
        w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
            (frame_size, frame_size * 2))
        u_r = self.activate[self.attrs['gate_activation']](np.dot(
            h_p, w_u_r) + g[:, :frame_size * 2])
        u = u_r[:, :frame_size]
        r = u_r[:, frame_size:frame_size * 2]
        r_h_p = r * h_p
        w_c = w.flatten()[frame_size * frame_size * 2:].reshape(
            (frame_size, frame_size))
        c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
                                                    g[:, frame_size * 2:])
        g = np.hstack((u_r, c))
        h = u * c + (1 - u) * h_p
        return g, r_h_p, h

    def gru(self):
        input, lod = self.inputs['Input']
        w = self.inputs['Weight']
        b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros(
            (1, self.frame_size * 3))
        batch_gate = self.outputs['BatchGate']
        batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev']
        batch_hidden = self.outputs['BatchHidden']
        hidden = self.outputs['Hidden']
        idx_in_seq_list = self.idx_in_seq_list
        h_p = self.inputs['H0'] if self.inputs.has_key('H0') else np.zeros(
            (len(idx_in_seq_list[0]), self.frame_size))
        num_batch = len(idx_in_seq_list)
        end_idx = 0
        for batch_idx in range(num_batch):
            x = input[idx_in_seq_list[batch_idx]]
            g, r_h_p, h = self.gru_step(x, h_p, w, b)
            if batch_idx < (num_batch - 1):
                h_p = h[:len(idx_in_seq_list[batch_idx + 1])]
            start_idx = end_idx
            end_idx = start_idx + len(idx_in_seq_list[batch_idx])
            batch_gate[start_idx:end_idx] = g
            batch_reset_hidden_prev[start_idx:end_idx] = r_h_p
            batch_hidden[start_idx:end_idx] = h
            hidden[idx_in_seq_list[batch_idx]] = h
        return batch_gate, batch_reset_hidden_prev, hidden

    def set_data(self):
        lod = [[0, 2, 6, self.batch_size]]
        self.idx_in_seq_list = self.seq_to_batch(lod, self.is_reverse)
        batch_size = self.batch_size
        frame_size = self.frame_size
        input = np.random.rand(batch_size, frame_size * 3).astype('float64')
        h0 = np.random.rand(len(self.idx_in_seq_list[0]),
                            frame_size).astype('float64')
        weight = np.random.rand(frame_size, frame_size * 3).astype('float64')
        bias = np.random.rand(1, frame_size * 3).astype('float64')

        self.inputs = {
            'Input': (input, lod),
            'H0': h0,
            'Weight': weight,
            'Bias': bias
        }

        self.outputs = {
            'BatchGate': np.zeros(
                (batch_size, frame_size * 3), dtype='float64'),
            'BatchResetHiddenPrev': np.zeros(
                (batch_size, frame_size), dtype='float64'),
            'BatchHidden': np.zeros(
                (batch_size, frame_size), dtype='float64'),
            'Hidden': np.zeros(
                (batch_size, frame_size), dtype='float64')
        }

    def set_confs(self):
        self.is_reverse = False
        self.attrs = {
            'activation': 'tanh',
            'gate_activation': 'sigmoid',
            'is_reverse': self.is_reverse
        }

    def setUp(self):
        self.op_type = "gru"
        self.set_confs()
        self.set_data()
        self.gru()

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])


class TestGRUOpNoInitial(TestGRUOp):
    def set_data(self):
        super(TestGRUOpNoInitial, self).set_data()
        self.inputs.pop('H0')

    def test_check_grad(self):
        self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden'])


class TestGRUOpReverse(TestGRUOp):
    def set_confs(self):
        self.is_reverse = True
        self.attrs = {
            'activation': 'identity',
            'gate_activation': 'sigmoid',
            'is_reverse': self.is_reverse
        }


if __name__ == "__main__":
    unittest.main()