parent 650a45b233
commit 9bcdf4cbdc
kernel/cpu/mkldnn/lstm_cpu_kernel.cc
@@ -0,0 +1,120 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "kernel/cpu/mkldnn/lstm_cpu_kernel.h"
#include <string>
#include "common/utils.h"
#include "kernel/cpu/mkldnn/mkl_kernel_engine.h"
#include "device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional");
  input_size_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "input_size");
  hidden_size_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "hidden_size");
  num_layers_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "num_layers");
  batch_size_ = SizeToInt(src_shape[1]);
  seq_len_ = SizeToInt(src_shape[0]);
  num_directions_ = 1;
  if (bidirectional_) {
    num_directions_ = 2;
  }
  int gate_size = 4 * hidden_size_;
  for (int i = 0; i < num_layers_; ++i) {
    weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
    weight_h_size_ += gate_size * hidden_size_;
  }
  weight_size_ = weight_size_ * num_directions_;
  weight_h_size_ = weight_h_size_ * num_directions_;
}

bool LstmCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                           const std::vector<kernel::AddressPtr> & /*workspace*/,
                           const std::vector<kernel::AddressPtr> &outputs) {
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  using dim = dnnl::memory::dims;
  auto eng = MKLKernelEngine::Get().engine();
  dnnl::stream s(eng);
  auto formatted_md = [](dim dimensions, tag layout) { return dnnl::memory::desc{{dimensions}, dt::f32, layout}; };
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }

  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim weights_dims = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  dim weights_h_dims = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  dim bias_dims = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, hidden_size_ * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc weights_desc = formatted_md(weights_dims, tag::ldigo);
  dnnl::memory::desc weights_h_desc = formatted_md(weights_h_dims, tag::ldigo);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
  dnnl::lstm_forward::desc desc =
    dnnl::lstm_forward::desc(dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc,
                             weights_desc, weights_h_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
  auto prim_desc = dnnl::lstm_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
  auto workspace_memory = dnnl::memory(prim_desc.workspace_desc(), eng);
  auto src_memory = dnnl::memory(formatted_md(src_dims, tag::tnc), eng);
  write_to_dnnl_memory(inputs[0]->addr, src_memory);

  auto src_h_memory = dnnl::memory(prim_desc.src_iter_desc(), eng);
  auto src_c_memory = dnnl::memory(prim_desc.src_iter_c_desc(), eng);
  write_to_dnnl_memory(inputs[1]->addr, src_h_memory);
  write_to_dnnl_memory(inputs[2]->addr, src_c_memory);

  auto weights_memory = dnnl::memory(formatted_md(weights_dims, tag::ldigo), eng);
  auto weights_h_memory = dnnl::memory(formatted_md(weights_h_dims, tag::ldigo), eng);
  auto bias_memory = dnnl::memory(formatted_md(bias_dims, tag::ldgo), eng);
  write_to_dnnl_memory(inputs[3]->addr, weights_memory);
  write_to_dnnl_memory(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_, weights_h_memory);
  write_to_dnnl_memory(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_, bias_memory);

  auto dst_memory = dnnl::memory(formatted_md(dst_dims, tag::tnc), eng);
  auto dst_h_memory = dnnl::memory(prim_desc.dst_iter_desc(), eng);
  auto dst_c_memory = dnnl::memory(prim_desc.dst_iter_c_desc(), eng);
  dnnl::lstm_forward fw_layer(prim_desc);
  workspace_memory.set_data_handle(outputs[3]->addr);
  dst_memory.set_data_handle(outputs[0]->addr);
  dst_h_memory.set_data_handle(outputs[1]->addr);
  dst_c_memory.set_data_handle(outputs[2]->addr);
  fw_layer.execute(s, {{DNNL_ARG_SRC_LAYER, src_memory},
                       {DNNL_ARG_SRC_ITER, src_h_memory},
                       {DNNL_ARG_SRC_ITER_C, src_c_memory},
                       {DNNL_ARG_WEIGHTS_LAYER, weights_memory},
                       {DNNL_ARG_WEIGHTS_ITER, weights_h_memory},
                       {DNNL_ARG_BIAS, bias_memory},
                       {DNNL_ARG_DST_LAYER, dst_memory},
                       {DNNL_ARG_DST_ITER, dst_h_memory},
                       {DNNL_ARG_DST_ITER_C, dst_c_memory},
                       {DNNL_ARG_WORKSPACE, workspace_memory}});
  return true;
}

} // namespace kernel
} // namespace mindspore
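Review note (not part of the commit): InitKernel above accumulates weight_size_ and weight_h_size_, which Launch then uses as float offsets into the single flat weight buffer inputs[3] (layer weights, then recurrent weights, then biases). A minimal standalone sketch of that offset arithmetic, with hypothetical layer sizes chosen only for illustration:

// Illustrative only: reproduce the flat-weight offsets for a hypothetical
// single-layer, unidirectional LSTM with input_size = 8, hidden_size = 16.
#include <cassert>

int main() {
  const int num_layers = 1, num_directions = 1;
  const int input_size = 8, hidden_size = 16;
  const int gate_size = 4 * hidden_size;  // an LSTM cell has four gates

  int weight_size = 0, weight_h_size = 0;
  for (int i = 0; i < num_layers; ++i) {
    weight_size += gate_size * (i == 0 ? input_size : hidden_size * num_directions);
    weight_h_size += gate_size * hidden_size;
  }
  weight_size *= num_directions;    // float offset of the recurrent weights in the flat buffer
  weight_h_size *= num_directions;  // length of the recurrent-weight block; biases follow it

  assert(weight_size == 512);     // 4 * 16 * 8 layer-weight floats
  assert(weight_h_size == 1024);  // 4 * 16 * 16 recurrent-weight floats
  return 0;
}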
kernel/cpu/mkldnn/lstm_cpu_kernel.h
@@ -0,0 +1,59 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_CPU_LSTM_CPU_KERNEL_H
#define MINDSPORE_CCSRC_KERNEL_CPU_LSTM_CPU_KERNEL_H
#include <vector>
#include <memory>
#include "kernel/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
class LstmCPUKernel : public MKLCPUKernel {
 public:
  LstmCPUKernel() = default;
  ~LstmCPUKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  int weight_size_ = 0;
  int weight_h_size_ = 0;
  int input_size_;
  int hidden_size_;
  int num_layers_;
  int batch_size_;
  int seq_len_;
  int num_directions_;
  bool bidirectional_;
};

MS_REG_CPU_KERNEL(LSTM,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LstmCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_CPU_LSTM_CPU_KERNEL_H
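Review note (not part of the commit): the registration above declares four float32 inputs and five float32 outputs for the LSTM op. The hypothetical slot names below only summarize how LstmCPUKernel::Launch indexes those buffers; the fifth registered output is not touched by Launch.

// Illustrative only: one hypothetical name per slot as consumed by LstmCPUKernel::Launch.
enum LstmForwardInput { kInputX = 0, kInputHx = 1, kInputCx = 2, kInputFlatWeights = 3 };
enum LstmForwardOutput { kOutputY = 0, kOutputHy = 1, kOutputCy = 2, kOutputReserve = 3 };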
kernel/cpu/mkldnn/lstm_grad_cpu_kernel.cc
@@ -0,0 +1,169 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "kernel/cpu/mkldnn/lstm_grad_cpu_kernel.h"
#include <cstring>
#include <cmath>
#include <numeric>
#include <string>
#include "common/utils.h"
#include "kernel/cpu/mkldnn/mkl_kernel_engine.h"
#include "device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {

void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional");
  input_size_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "input_size");
  hidden_size_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "hidden_size");
  num_layers_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "num_layers");
  batch_size_ = SizeToInt(src_shape[1]);
  seq_len_ = SizeToInt(src_shape[0]);
  num_directions_ = 1;
  if (bidirectional_) {
    num_directions_ = 2;
  }
  int gate_size = 4 * hidden_size_;
  for (int i = 0; i < num_layers_; ++i) {
    weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
    weight_h_size_ += gate_size * hidden_size_;
  }
  weight_size_ = weight_size_ * num_directions_;
  weight_h_size_ = weight_h_size_ * num_directions_;
}

bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                               const std::vector<kernel::AddressPtr> & /*workspace*/,
                               const std::vector<kernel::AddressPtr> &outputs) {
  using tag = dnnl::memory::format_tag;
  using dt = dnnl::memory::data_type;
  using dim = dnnl::memory::dims;
  auto eng = MKLKernelEngine::Get().engine();
  dnnl::stream s(eng);
  auto formatted_md = [](dim dimensions, tag layout) { return dnnl::memory::desc{{dimensions}, dt::f32, layout}; };
  auto generic_md = [](dim dimensions) { return dnnl::memory::desc{{dimensions}, dt::f32, tag::any}; };
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }
  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim weights_dims = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  dim weights_h_dims = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  dim bias_dims = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, hidden_size_ * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};

  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc weights_desc = formatted_md(weights_dims, tag::ldigo);
  dnnl::memory::desc weights_h_desc = formatted_md(weights_h_dims, tag::ldigo);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);

  dnnl::lstm_forward::desc forward_desc =
    dnnl::lstm_forward::desc(dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc,
                             weights_desc, weights_h_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
  auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(forward_desc, eng);

  dnnl::lstm_backward::desc backward_desc = dnnl::lstm_backward::desc(
    dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, generic_md(weights_dims),
    generic_md(weights_h_dims), generic_md(bias_dims), dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc,
    src_c_desc, weights_desc, weights_h_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
  auto prim_backward_desc = dnnl::lstm_backward::primitive_desc(backward_desc, eng, prim_forward_desc);
  // construct fw memory
  auto src_memory = dnnl::memory(formatted_md(src_dims, tag::tnc), eng);
  write_to_dnnl_memory(inputs[0]->addr, src_memory);

  auto src_h_memory = dnnl::memory(prim_forward_desc.src_iter_desc(), eng);
  auto src_c_memory = dnnl::memory(prim_forward_desc.src_iter_c_desc(), eng);
  write_to_dnnl_memory(inputs[1]->addr, src_h_memory);
  write_to_dnnl_memory(inputs[2]->addr, src_c_memory);

  auto user_weights_memory = dnnl::memory(formatted_md(weights_dims, tag::ldigo), eng);
  auto user_weights_h_memory = dnnl::memory(formatted_md(weights_h_dims, tag::ldigo), eng);
  auto user_bias_memory = dnnl::memory(formatted_md(bias_dims, tag::ldgo), eng);
  write_to_dnnl_memory(inputs[3]->addr, user_weights_memory);
  write_to_dnnl_memory(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_, user_weights_h_memory);
  write_to_dnnl_memory(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_, user_bias_memory);
  auto weights_memory = dnnl::memory(prim_backward_desc.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_backward_desc.weights_iter_desc(), eng);
  auto bias_memory = dnnl::memory(prim_forward_desc.bias_desc(), eng);
  dnnl::reorder(user_weights_memory, weights_memory).execute(s, user_weights_memory, weights_memory);
  dnnl::reorder(user_weights_h_memory, weights_h_memory).execute(s, user_weights_h_memory, weights_h_memory);
  dnnl::reorder(user_bias_memory, bias_memory).execute(s, user_bias_memory, bias_memory);

  auto dst_memory = dnnl::memory(formatted_md(dst_dims, tag::tnc), eng);
  write_to_dnnl_memory(reinterpret_cast<float *>(inputs[4]->addr), dst_memory);
  auto dst_h_memory = dnnl::memory(prim_backward_desc.dst_iter_desc(), eng);
  write_to_dnnl_memory(reinterpret_cast<float *>(inputs[5]->addr), dst_h_memory);
  auto dst_c_memory = dnnl::memory(prim_backward_desc.dst_iter_c_desc(), eng);
  write_to_dnnl_memory(reinterpret_cast<float *>(inputs[6]->addr), dst_c_memory);
  auto workspace_memory = dnnl::memory(prim_forward_desc.workspace_desc(), eng);
  write_to_dnnl_memory(inputs[10]->addr, workspace_memory);

  // construct diff memory
  auto diff_src_memory = dnnl::memory(formatted_md(src_dims, tag::tnc), eng);
  auto diff_src_h_memory = dnnl::memory(prim_backward_desc.diff_src_iter_desc(), eng);
  auto diff_src_c_memory = dnnl::memory(prim_backward_desc.diff_src_iter_c_desc(), eng);

  auto diff_weights_memory = dnnl::memory(prim_backward_desc.diff_weights_layer_desc(), eng);
  auto diff_weights_h_memory = dnnl::memory(prim_backward_desc.diff_weights_iter_desc(), eng);
  auto diff_bias_memory = dnnl::memory(prim_backward_desc.diff_bias_desc(), eng);
  auto diff_dst_memory = dnnl::memory(formatted_md(dst_dims, tag::tnc), eng);
  write_to_dnnl_memory(reinterpret_cast<float *>(inputs[7]->addr), diff_dst_memory);
  auto diff_dst_h_memory = dnnl::memory(prim_backward_desc.diff_dst_iter_desc(), eng);
  write_to_dnnl_memory(reinterpret_cast<float *>(inputs[8]->addr), diff_dst_h_memory);
  auto diff_dst_c_memory = dnnl::memory(prim_backward_desc.diff_dst_iter_c_desc(), eng);
  write_to_dnnl_memory(reinterpret_cast<float *>(inputs[9]->addr), diff_dst_c_memory);

  diff_src_memory.set_data_handle(outputs[0]->addr);
  diff_src_h_memory.set_data_handle(outputs[1]->addr);
  diff_src_c_memory.set_data_handle(outputs[2]->addr);
  diff_weights_memory.set_data_handle(outputs[3]->addr);
  diff_weights_h_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_);
  diff_bias_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_ + weight_h_size_);
  dnnl::lstm_backward bwd_layer(prim_backward_desc);
  bwd_layer.execute(s, {{DNNL_ARG_SRC_LAYER, src_memory},
                        {DNNL_ARG_SRC_ITER, src_h_memory},
                        {DNNL_ARG_SRC_ITER_C, src_c_memory},
                        {DNNL_ARG_WEIGHTS_LAYER, weights_memory},
                        {DNNL_ARG_WEIGHTS_ITER, weights_h_memory},
                        {DNNL_ARG_BIAS, bias_memory},
                        {DNNL_ARG_DST_LAYER, dst_memory},
                        {DNNL_ARG_DST_ITER, dst_h_memory},
                        {DNNL_ARG_DST_ITER_C, dst_c_memory},
                        {DNNL_ARG_DIFF_SRC_LAYER, diff_src_memory},
                        {DNNL_ARG_DIFF_SRC_ITER, diff_src_h_memory},
                        {DNNL_ARG_DIFF_SRC_ITER_C, diff_src_c_memory},
                        {DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory},
                        {DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory},
                        {DNNL_ARG_DIFF_BIAS, diff_bias_memory},
                        {DNNL_ARG_DIFF_DST_LAYER, diff_dst_memory},
                        {DNNL_ARG_DIFF_DST_ITER, diff_dst_h_memory},
                        {DNNL_ARG_DIFF_DST_ITER_C, diff_dst_c_memory},
                        {DNNL_ARG_WORKSPACE, workspace_memory}});
  return true;
}
} // namespace kernel
} // namespace mindspore
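Review note (not part of the commit): unlike the forward kernel, the backward kernel lets oneDNN pick its preferred weight layouts (generic_md with tag::any) and then reorders the user's plain ldigo/ldgo buffers into them. A minimal standalone sketch of that reorder pattern, assuming oneDNN 1.x headers are available; the tensor shape and layouts here are purely illustrative:

// Illustrative only: convert data between two layouts with the same
// dnnl::reorder(...).execute(...) call shape used for the LSTM weights above.
#include <dnnl.hpp>
#include <vector>

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream s(eng);

  // A 2x3 f32 tensor: row-major ("ab") on the user side, column-major ("ba") on the primitive side.
  dnnl::memory::desc user_md({2, 3}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);
  dnnl::memory::desc prim_md({2, 3}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ba);

  std::vector<float> user_data = {0, 1, 2, 3, 4, 5};
  dnnl::memory user_mem(user_md, eng, user_data.data());
  dnnl::memory prim_mem(prim_md, eng);  // library-allocated buffer in the target layout

  dnnl::reorder(user_mem, prim_mem).execute(s, user_mem, prim_mem);
  s.wait();
  return 0;
}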
kernel/cpu/mkldnn/lstm_grad_cpu_kernel.h
@@ -0,0 +1,67 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_KERNEL_CPU_LSTM_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_CPU_LSTM_GRAD_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include "kernel/cpu/mkldnn/mkl_cpu_kernel.h"

namespace mindspore {
namespace kernel {
class LSTMGradCPUKernel : public MKLCPUKernel {
 public:
  LSTMGradCPUKernel() = default;
  ~LSTMGradCPUKernel() override = default;

  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  int weight_size_ = 0;
  int weight_h_size_ = 0;
  int input_size_;
  int hidden_size_;
  int num_layers_;
  int batch_size_;
  int seq_len_;
  int num_directions_;
  bool bidirectional_;
};

MS_REG_CPU_KERNEL(LSTMGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LSTMGradCPUKernel);
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_CCSRC_KERNEL_CPU_LSTM_GRAD_CPU_KERNEL_H_
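Review note (not part of the commit): the LSTMGrad registration above takes eleven float32 inputs and produces four float32 outputs. Hypothetical slot names matching how LSTMGradCPUKernel::Launch indexes them:

// Illustrative only: one hypothetical name per registered slot, in the order Launch reads/writes them.
enum LstmGradInput {
  kGradInX = 0, kGradInHx, kGradInCx, kGradInFlatWeights,
  kGradInY, kGradInHy, kGradInCy, kGradInDy, kGradInDhy, kGradInDcy, kGradInReserve
};
enum LstmGradOutput { kGradOutDx = 0, kGradOutDhx, kGradOutDcx, kGradOutDFlatWeights };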