From dd7a79158b17f3613ff66b9c4db7691074fb6218 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Tue, 21 Aug 2018 19:54:10 +0800 Subject: [PATCH 01/21] add scope info in graphviz debug --- .../fluid/framework/details/multi_devices_graph_print_pass.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc index 69944a42b6..361c91dc78 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc @@ -54,7 +54,8 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_ << "\\n" << var_handle_ptr->place_ << "\\n" - << var_handle_ptr->version_ << "\"]" << std::endl; + << "scope: " << var_handle_ptr->scope_idx_ << "\\n" + << "v" << var_handle_ptr->version_ << "\"]" << std::endl; } else if (dummy_ptr) { sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl; } From f72ab8961e443c030a50f373f21eceac3800f528 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 11:43:20 +0800 Subject: [PATCH 02/21] refine blas gemm --- CMakeLists.txt | 6 -- paddle/fluid/operators/math/blas.h | 9 ++ paddle/fluid/operators/math/blas_impl.h | 116 +++++++++++------------ paddle/fluid/operators/math/fc_compute.h | 22 +++-- 4 files changed, 77 insertions(+), 76 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 48e52961a9..317f7f9eb4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,12 +138,6 @@ else() set(THIRD_PARTY_BUILD_TYPE Release) endif() -if(WITH_MKL) - option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF) - if (MKL_SPLIT_GEMM) - add_definitions(-DPADDLE_MKL_SPLIT_GEMM) - endif() -endif() set(WITH_MKLML ${WITH_MKL}) if (NOT DEFINED WITH_MKLDNN) if (WITH_MKL AND AVX2_FOUND) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 8dcf7c99f3..295431347a 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -90,6 +90,11 @@ class Blas { void GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T* A, int lda, const T* B, int ldb, T beta, T* C, int ldc) const; + template + void GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T* A, int lda, const T* B, int ldb, T beta, T* C, + int ldc) const; + #ifdef PADDLE_WITH_MKLML template T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, const int M, const int N, @@ -109,6 +114,10 @@ class Blas { void GEMM_FREE(T* data) const; #endif + template + void MatMul(const int M, const int N, const int K, const T* A, const T* B, + T* C) const; + template void MatMul(const framework::Tensor& mat_a, bool trans_a, const framework::Tensor& mat_b, bool trans_b, T alpha, diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index dc77b6d793..d39a3e7f6e 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -217,64 +217,6 @@ struct CBlas { #endif }; -template -inline bool UseXSMM(const int &m, const int &n, const int &k, bool transa, - bool transb, const T &alpha, const T &beta) { -#ifdef PADDLE_WITH_LIBXSMM - // Refer to https://github.com/hfp/libxsmm/blob/master/README.md - // But the threshold is custom - constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; - if (m * n * k > LIBXSMM_THRESHOLD || transa || transb || - std::abs(alpha 
- static_cast(1) > - std::numeric_limits::epsilon()) || - std::abs(beta) > std::numeric_limits::epsilon()) { - return false; - } else { - return true; - } -#endif - return false; -} - -template <> -inline bool UseXSMM(const int &m, const int &n, const int &k, - bool transa, bool transb, - const platform::float16 &alpha, - const platform::float16 &beta) { - return false; -} - -template -inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, - const T *A, int lda, const T *B, int ldb, T beta, T *C, - int ldc) { -#ifdef PADDLE_WITH_LIBXSMM - if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha, - beta)) { - // Note: SMM use ColMajor - const char transa = 'N'; - const char transb = 'N'; - CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda, - &beta, C, &ldc); - return; - } -#endif - -#ifdef PADDLE_MKL_SPLIT_GEMM - constexpr int bs = 2; - if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) { - for (int off = 0; off < M; off += bs) { - CBlas::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, bs, N, K, alpha, - A + off * lda, lda, B, ldb, beta, C + off * ldb, ldc); - } - return; - } -#endif - CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); -} - #ifdef PADDLE_WITH_MKLML template <> template @@ -319,8 +261,8 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; - GEMM_WARP(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); + CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); } template <> @@ -329,9 +271,20 @@ void Blas::GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T *A, int lda, const T *B, int ldb, T beta, T *C, int ldc) const { - GEMM_WARP(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, - transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, - lda, B, ldb, beta, C, ldc); + CBlas::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, + transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); +} + +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, + int N, int K, T alpha, const T *A, + int lda, const T *B, int ldb, + T beta, T *C, int ldc) const { + CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); } template @@ -440,6 +393,43 @@ void Blas::BatchedGEMM( #endif } +template +template +void Blas::MatMul(const int M, const int N, const int K, + const T *A, const T *B, T *C) const { + this->template GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, + static_cast(1), A, K, B, N, static_cast(0), C, + N); +} + +template <> +template +void Blas::MatMul(const int M, const int N, + const int K, const T *A, + const T *B, T *C) const { +#ifdef PADDLE_WITH_LIBXSMM + // Refer to https://github.com/hfp/libxsmm/blob/master/README.md + // But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; + + // Since the matrix is very small, + // so the unit of calculation is already very fast, + // and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead, + // use xsmm directly. 
+ // Note: SMM use ColMajor + const char transa = 'N'; + const char transb = 'N'; + const T alpha = static_cast(1); + const T beta = static_cast(0); + CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, + C, &N); + return; + +#endif + + CBlas::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, + static_cast(1), A, K, B, N, static_cast(0), C, N); +} + template template void Blas::MatMul(const framework::Tensor &mat_a, diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index 8600fa9e2c..1f5a49c0ab 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -25,17 +25,25 @@ namespace math { template inline void FCCompute(const BlasT& blas, const int M, const int N, const int K, const T* X, const T* W, T* Y, - const T* B = NULL) { - blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast(1), X, W, - static_cast(0), Y); - if (B) { + const T* B = NULL, bool relu = false) { + blas.MatMul(M, N, K, X, W, Y); + if (B == NULL) { + return; + } + #ifdef PADDLE_WITH_MKLML #pragma omp parallel for if (FLAGS_paddle_num_threads > 1) #endif - for (int i = 0; i < M; i++) { - blas.AXPY(N, static_cast(1), B, Y + i * N); - } + for (int i = 0; i < M; i++) { + blas.AXPY(N, static_cast(1), B, Y + i * N); } + + if (!relu) { + return; + } + + // TODO(TJ): fuse relu + LOG(FATAL) << "Not implemented!"; } } // namespace math From a2203d0466462fcde20bdd80d79a0f7964760eb8 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 12:08:31 +0800 Subject: [PATCH 03/21] add cblas dot --- paddle/fluid/operators/math/blas.h | 3 +++ paddle/fluid/operators/math/blas_impl.h | 27 ++++++++++++++++++++++++- paddle/fluid/platform/dynload/mklml.h | 2 ++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 295431347a..96d481f739 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -153,6 +153,9 @@ class Blas { void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta, T* C) const; + template + T DOT(int n, const T* x, const T* y) const; + template void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, const T* A, const T* B, T beta, T* C, diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index d39a3e7f6e..bbd9d4b60a 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -73,6 +73,11 @@ struct CBlas { platform::dynload::cblas_sgemv(args...); } + template + static float DOT(ARGS... args) { + return platform::dynload::cblas_sdot(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_sgemm_batch(args...); @@ -138,6 +143,11 @@ struct CBlas { platform::dynload::cblas_dgemv(args...); } + template + static double DOT(ARGS... args) { + return platform::dynload::cblas_ddot(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_dgemm_batch(args...); @@ -210,6 +220,7 @@ struct CBlas { PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); } static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } + static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) 
{ PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); @@ -352,6 +363,21 @@ void Blas::VMUL(int n, const T *x, const T *y, #endif } +template <> +template +T Blas::DOT(int n, const T *x, const T *y) const { +#ifdef PADDLE_WITH_MKLML + return CBlas::DOT(n, x, y); +#else + // try to find if openblas support cblas_dot + T sum = 0; + for (int i = 0; i < n; ++i) { + sum += x[i] * y[i]; + } + return sum; +#endif +} + template <> template void Blas::GEMV(bool trans_a, int M, int N, T alpha, @@ -423,7 +449,6 @@ void Blas::MatMul(const int M, const int N, CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, C, &N); return; - #endif CBlas::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index 15ad4a3b40..6efa160df0 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -66,6 +66,8 @@ extern void* mklml_dso_handle; __macro(cblas_dgemm_free); \ __macro(cblas_sgemm_batch); \ __macro(cblas_dgemm_batch); \ + __macro(cblas_sdot); \ + __macro(cblas_ddot); \ __macro(vsAdd); \ __macro(vdAdd); \ __macro(vsMul); \ From 0ec1f65cf110ee4e73a7bfa03456b52111426288 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 12:47:10 +0800 Subject: [PATCH 04/21] fix blas dot and add cblas scal --- paddle/fluid/operators/math/blas.h | 3 +++ paddle/fluid/operators/math/blas_impl.h | 27 ++++++++++++++++++++++++- paddle/fluid/platform/dynload/mklml.h | 2 ++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 96d481f739..fc02534a69 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -156,6 +156,9 @@ class Blas { template T DOT(int n, const T* x, const T* y) const; + template + void SCAL(int n, const T a, const T* x) const; + template void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, const T* A, const T* B, T beta, T* C, diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index bbd9d4b60a..b7c56e8df1 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -78,6 +78,11 @@ struct CBlas { return platform::dynload::cblas_sdot(args...); } + template + static void SCAL(ARGS... args) { + platform::dynload::cblas_sscal(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_sgemm_batch(args...); @@ -148,6 +153,11 @@ struct CBlas { return platform::dynload::cblas_ddot(args...); } + template + static void SCAL(ARGS... args) { + platform::dynload::cblas_dscal(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_dgemm_batch(args...); @@ -221,6 +231,7 @@ struct CBlas { } static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; + static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) 
{ PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); @@ -367,7 +378,7 @@ template <> template T Blas::DOT(int n, const T *x, const T *y) const { #ifdef PADDLE_WITH_MKLML - return CBlas::DOT(n, x, y); + return CBlas::DOT(n, x, 1, y, 1); #else // try to find if openblas support cblas_dot T sum = 0; @@ -378,6 +389,20 @@ T Blas::DOT(int n, const T *x, const T *y) const { #endif } +template <> +template +void Blas::SCAL(int n, const T a, + const T *x) const { +#ifdef PADDLE_WITH_MKLML + CBlas::SCAL(n, a, x, 1); +#else + // try to find if openblas support cblas_scal + for (int i = 0; i < n; ++i) { + x[i] = a * x[i]; + } +#endif +} + template <> template void Blas::GEMV(bool trans_a, int M, int N, T alpha, diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index 6efa160df0..e50ea6740a 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -68,6 +68,8 @@ extern void* mklml_dso_handle; __macro(cblas_dgemm_batch); \ __macro(cblas_sdot); \ __macro(cblas_ddot); \ + __macro(cblas_sscal); \ + __macro(cblas_dscal); \ __macro(vsAdd); \ __macro(vdAdd); \ __macro(vsMul); \ From 3dd66390b2702fe3083fee5e84f2ad6d5322b76b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 13:13:58 +0800 Subject: [PATCH 05/21] add blas vexp --- paddle/fluid/operators/math/blas.h | 3 +++ paddle/fluid/operators/math/blas_impl.h | 24 ++++++++++++++++++++++++ paddle/fluid/platform/dynload/mklml.h | 2 ++ 3 files changed, 29 insertions(+) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index fc02534a69..5aba170221 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -149,6 +149,9 @@ class Blas { template void VCOPY(int n, const T* x, T* y) const; + template + void VEXP(int n, const T* x, T* y) const; + template void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta, T* C) const; diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index b7c56e8df1..eaad83ba18 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -97,6 +97,11 @@ struct CBlas { static void VMUL(ARGS... args) { platform::dynload::vsMul(args...); } + + template + static void VEXP(ARGS... args) { + platform::dynload::vsExp(args...); + } }; template <> @@ -172,6 +177,11 @@ struct CBlas { static void VMUL(ARGS... args) { platform::dynload::vdMul(args...); } + + template + static void VEXP(ARGS... args) { + platform::dynload::vdExp(args...); + } }; #else @@ -230,6 +240,7 @@ struct CBlas { PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); } static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } + static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; static void SCAL(...) 
{ PADDLE_THROW("float16 SCAL not supported on CPU"); }; #ifdef PADDLE_WITH_MKLML @@ -374,6 +385,19 @@ void Blas::VMUL(int n, const T *x, const T *y, #endif } +template <> +template +void Blas::VEXP(int n, const T *x, T *y) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VEXP(n, x, y); +#else + // try to find if openblas support vexp + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +#endif +} + template <> template T Blas::DOT(int n, const T *x, const T *y) const { diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index e50ea6740a..aa20553cef 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -74,6 +74,8 @@ extern void* mklml_dso_handle; __macro(vdAdd); \ __macro(vsMul); \ __macro(vdMul); \ + __macro(vsExp); \ + __macro(vdExp); \ __macro(MKL_Set_Num_Threads) MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); From 9affc36c89c2df4e26d00b1a081db0eabfd8e4fe Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 20 Aug 2018 21:15:39 +0800 Subject: [PATCH 06/21] init attention lstm --- paddle/fluid/operators/attention_lstm_op.cc | 354 ++++++++++++++++++++ paddle/fluid/operators/attention_lstm_op.h | 42 +++ 2 files changed, 396 insertions(+) create mode 100644 paddle/fluid/operators/attention_lstm_op.cc create mode 100644 paddle/fluid/operators/attention_lstm_op.h diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc new file mode 100644 index 0000000000..087df06ad5 --- /dev/null +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -0,0 +1,354 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/attention_lstm_op.h" +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { + +void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightX"), + "Input(WeightX) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightH"), + "Input(WeightH) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("XX"), + "Output(XX) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(Hidden) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Output(Cell) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"), + "Output(BatchedGate) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), + "Output(BatchedGate) of LSTM should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(Cell) and Input(Hidden) of LSTM should not " + "be null at the same time."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + + auto wx_dims = ctx->GetInputDim("WeightX"); + PADDLE_ENFORCE_EQ(wx_dims.size(), 2, + "The rank of Input(WeightX) should be 2."); + PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], + "The first dimension of Input(WeightX) " + "should be %d.", + x_dims[1]); + + int frame_size = wx_dims[1] / 4; + auto wh_dims = ctx->GetInputDim("WeightH"); + PADDLE_ENFORCE_EQ(wh_dims.size(), 2, + "The rank of Input(WeightH) should be 2."); + PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, + "The first dimension of Input(WeightH) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size, + "The second dimension of Input(WeightH) " + "should be 4 * %d.", + frame_size); + + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + + PADDLE_ENFORCE(!ctx->Attrs().Get("use_peepholes"), + "Do not support peephole yet."); + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable peepholes connection", + frame_size); + + framework::DDim out_dims({x_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]}); + ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->ShareLoD("X", "Hidden"); + ctx->ShareLoD("X", "Cell"); + + int xx_width = x_dims[1] > wx_dims[1] ? 
wx_dims[1] : x_dims[1]; + ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->ShareLoD("X", "XX"); +} + +framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); +} + +void FusionLSTMOpMaker::Make() { + AddInput("X", + "(LoDTensor) the input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + AddInput("WeightX", + "(Tensor) the learnable weights of X." + " - The shape is (M x 4D), where M is the dim size of x, D is the " + "hidden size. " + " - Weight = {W_cx, W_ix, W_fx, W_ox}"); + AddInput("WeightH", + "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput("Bias", + "(Tensor) the learnable weights. Almost same as LSTMOp" + "Note: we should add the fc bias into this (1x4D) in bias." + "input-hidden bias weight and peephole connections weight if " + "setting `use_peepholes` True. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddInput("H0", + "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `H0` and `C0` can be NULL but only at the same time.") + .AsDispensable(); + AddOutput("Hidden", + "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. 
" + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("XX", + "(LoDTensor) the result after X * WeightX (size is T x 4D)" + " or batched_X (size is T x M), this will be automatically chosen," + " where T is the total time steps in this mini-batch," + " D is the hidden size, M is the dim size of x input.") + .AsIntermediate(); + AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate(); + AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).") + .AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTM.") + .SetDefault(false); + AddAttr("gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Fusion Long-Short Term Memory (LSTM) Operator. +This operator fuse the X into LSTM, more details can refer to LSTM op. +)DOC"); +} + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index_lod, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + // TODO(TJ): check mem copy perf + row_shuffle(ctx, src, index_lod, dst, indexed_src); +} + +template +class FuisonLSTMKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* wx = ctx.Input("WeightX"); + auto* wh = ctx.Input("WeightH"); + auto* bias = ctx.Input("Bias"); + auto* hidden_t0 = ctx.Input("H0"); + auto* cell_t0 = ctx.Input("C0"); + + auto* xx = ctx.Output("XX"); + auto* batched_gate = ctx.Output("BatchedGate"); + auto* hidden_out = ctx.Output("Hidden"); + auto* cell_out = ctx.Output("Cell"); + bool is_reverse = ctx.Attr("is_reverse"); + + T* xx_data = xx->mutable_data(ctx.GetPlace()); + T* batched_gate_data = batched_gate->mutable_data(ctx.GetPlace()); + hidden_out->mutable_data(ctx.GetPlace()); + cell_out->mutable_data(ctx.GetPlace()); + + const T* x_data = x->data(); + const T* wx_data = wx->data(); + auto x_dims = x->dims(); + auto wx_dims = wx->dims(); + + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + if (x_dims[1] > wx_dims[1]) { + math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], + x_data, wx_data, xx_data, + bias->data()); + to_batch(dev_ctx, *xx, batched_gate, true, is_reverse); + } else { + to_batch(dev_ctx, *x, xx, true, is_reverse); + batched_gate->set_lod(xx->lod()); + math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], + xx_data, wx_data, batched_gate_data, + bias->data()); + } + + int frame_size = static_cast(wx_dims[1] / 4); + framework::DDim out_dims({x_dims[0], frame_size}); + math::LstmMetaValue lstm_value; + // no peephole + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + 
lstm_value.check_og = nullptr; + lstm_value.prev_state_value = nullptr; + Tensor ordered_c0; + + framework::Vector order(batched_gate->lod()[2]); + + if (cell_t0) { + // Since the batch computing for LSTM reorders the input sequence + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState(dev_ctx, *cell_t0, order, &ordered_c0, + true); + lstm_value.prev_state_value = ordered_c0.data(); + } + + // Use the local variable as here. + LoDTensor batch_hidden, batch_cell; + auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); + batch_hidden.mutable_data(out_dims, ctx.GetPlace()); + batch_cell.mutable_data(out_dims, ctx.GetPlace()); + batch_cell_pre_act->mutable_data(out_dims, ctx.GetPlace()); + + auto batch_starts = batched_gate->lod()[0]; + size_t max_seq_len = batch_starts.size() - 1; + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + + for (size_t n = 0; n < max_seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate_t = batched_gate->Slice(bstart, bend); + Tensor out_t = batch_hidden.Slice(bstart, bend); + Tensor cell_t = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + + int cur_batch_size = bend - bstart; + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); + // TODO(TJ): use gemm directly + blas.MatMul(pre_hidden_t, false, *wh, false, static_cast(1.0), + &gate_t, static_cast(1.0)); + } else if (hidden_t0) { + // TODO(TJ): move h0 outside for + // If n == 0 and there is no initialized hidden state, that is to say + // the H0 is zeros, the calculation W_h * H0 will be skiped. + // If n == 0 and there is initialized hidden state, calculate W_h * H0. + + // Since the batch computing for LSTM reorders the input sequence + // according to their length. The initialized hidden state also needs + // to reorder. 
+ Tensor ordered_h0; + ReorderInitState(dev_ctx, *hidden_t0, order, + &ordered_h0, true); + // TODO(TJ): use gemm directly + blas.MatMul(ordered_h0, false, *wh, false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } + + lstm_value.gate_value = gate_t.data(); + lstm_value.output_value = out_t.data(); + lstm_value.state_value = cell_t.data(); + lstm_value.state_active_value = cell_pre_act_t.data(); + math::LstmUnitFunctor::compute( + dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act, + cand_act); + lstm_value.prev_state_value = lstm_value.state_value; + } + + math::Batch2LoDTensorFunctor to_seq; + batch_hidden.set_lod(batched_gate->lod()); + // restore the output hidden in LoDTensor from the batch hidden + to_seq(dev_ctx, batch_hidden, hidden_out); + + batch_cell.set_lod(batched_gate->lod()); + // restore the output cell state in LoDTensor from the batch cell + to_seq(dev_ctx, batch_cell, cell_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL( + fusion_lstm, + ops::FuisonLSTMKernel, + ops::FuisonLSTMKernel); diff --git a/paddle/fluid/operators/attention_lstm_op.h b/paddle/fluid/operators/attention_lstm_op.h new file mode 100644 index 0000000000..39dc09b4d1 --- /dev/null +++ b/paddle/fluid/operators/attention_lstm_op.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +// #include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusionLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusionLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle From 508548f897028bb93847f33705a30c4765fe0181 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 00:17:23 +0800 Subject: [PATCH 07/21] implement attention lstm cpu forward --- paddle/fluid/operators/attention_lstm_op.cc | 466 ++++++++++++-------- paddle/fluid/operators/attention_lstm_op.h | 5 +- paddle/fluid/operators/fusion_lstm_op.h | 1 - 3 files changed, 278 insertions(+), 194 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 087df06ad5..178a1c19a9 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -20,10 +20,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/operators/math/cpu_vec.h" + namespace paddle { namespace operators { -void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { +void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); PADDLE_ENFORCE(ctx->HasInput("WeightX"), "Input(WeightX) of LSTM should not be null."); @@ -57,6 +59,9 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "should be the same."); } + // fc_out , shape (maxseqlen,1) + int max_seq_len = 0; + auto wx_dims = ctx->GetInputDim("WeightX"); PADDLE_ENFORCE_EQ(wx_dims.size(), 2, "The rank of Input(WeightX) should be 2."); @@ -103,241 +108,321 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ctx->ShareLoD("X", "XX"); } -framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( +framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.device_context()); } -void FusionLSTMOpMaker::Make() { +void AttentionLSTMOpMaker::Make() { AddInput("X", "(LoDTensor) the input is a LodTensor, which support " "variable-time length input sequence. The underlying tensor in " "this LoDTensor is a matrix with shape (T X M), where T is the " "total time steps in this mini-batch, M is the dim size of x."); - AddInput("WeightX", - "(Tensor) the learnable weights of X." - " - The shape is (M x 4D), where M is the dim size of x, D is the " - "hidden size. " - " - Weight = {W_cx, W_ix, W_fx, W_ox}"); - AddInput("WeightH", - "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." - " - The shape is (D x 4D), where D is the hidden size. " - " - Weight = {W_ch, W_ih, W_fh, W_oh}"); - AddInput("Bias", - "(Tensor) the learnable weights. Almost same as LSTMOp" - "Note: we should add the fc bias into this (1x4D) in bias." - "input-hidden bias weight and peephole connections weight if " - "setting `use_peepholes` True. " - "1. `use_peepholes = False` " - " - The shape is (1 x 4D). " - " - Bias = {b_c, b_i, b_f, b_o}." - "2. `use_peepholes = True` " - " - The shape is (1 x 7D). " - " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddInput("C0", + "(Tensor) LSTM C0" + "This is a tensor with shape (N x D), where N is the batch size, D " + "is the gate size." + "C0 is necessary because of attention."); AddInput("H0", - "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " - "optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size and D is the hidden size.") + "(Tensor, optional) LSTM H0" + "This is a tensor with shape (N x D), where N is the " + "batch size and D is the gate size.") .AsDispensable(); - AddInput("C0", - "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " - "optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size. `H0` and `C0` can be NULL but only at the same time.") + AddInput("AttentionWeight", + "(Tensor) the weights of attention fc. Always relu the fc result." + "The shape is ((M+D) x 1), where M is the dim size of x, D is the " + "gate size of LSTM."); + AddInput("AttentionBias, optional", + "(Tensor) the bias of attention fc." 
+ "The shape is (1 x 1)") + .AsDispensable(); + AddInput("AttentionScalar", + "(Tensor, optional) the scalar on the result of attentioned fc. " + "Always relu the Scalar." + "The shape is (1 x 1)") + .AsDispensable(); + AddInput("AttentionScalarBias", + "(Tensor, optional) the scalar bias of attention fc." + "The shape is (1 x 1)") .AsDispensable(); + AddInput("LSTMWeight", + "(Tensor) the combined weight of LSTM" + " - The shape is ((D+M) x 4D), where D is the hidden gate size, M " + "is the dim size of x" + " - Weight = {W_forget, W_input, W_output, W_cell}"); + AddInput("LSTMBias", + "(Tensor) the combined bias of LSTM, shape (1x4D)." + "Note: we should add the bias of hidden and context accorindg to " + "the same gate: " + "{B_forget, B_input, B_output, B_cell}"); AddOutput("Hidden", "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " "The shape is (T x D), and lod is the same with the `Input`."); AddOutput("Cell", "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. " "The shape is (T x D), and lod is the same with the `Input`."); - AddOutput("XX", - "(LoDTensor) the result after X * WeightX (size is T x 4D)" - " or batched_X (size is T x M), this will be automatically chosen," - " where T is the total time steps in this mini-batch," - " D is the hidden size, M is the dim size of x input.") + AddOutput( + "AttentionedX", + "(LodTensor) shape is (T x 1), the result after X * AttentionWeight," + " where T is the total time steps in this mini-batch," + " D is the hidden size.") .AsIntermediate(); - AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate(); - AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).") + AddOutput("AttentionFCOut", + "(Tensor) (max_seq_len, 1), compute at each step.") .AsIntermediate(); - AddAttr("use_peepholes", - "(bool, defalut: True) " - "whether to enable diagonal/peephole connections.") - .SetDefault(true); - AddAttr("is_reverse", - "(bool, defalut: False) " - "whether to compute reversed LSTM.") - .SetDefault(false); + AddOutput("LSTMX", + "(Tensor) the input X of LSTM for each step." + "Shape is (1 x M), where M is the x frame size") + .AsIntermediate(); + AddOutput( + "LSTMOUT", + "(Tensor) the output of LSTM X(1*(D+M))* weight((D+M)*4D) for each step." + "Shape is (1 x 4D), where M is the x frame size") + .AsIntermediate(); + // TODO(TJ): InEnum({"sigmoid", "tanh", "relu", "identity"}); AddAttr("gate_activation", "(string, default: sigmoid)" "The activation for input gate, forget gate and output " "gate, `sigmoid` by default.") .SetDefault("sigmoid") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); + .InEnum({"sigmoid"}); AddAttr("cell_activation", "(string, default: tanh)" "The activation for cell output, `tanh` by defalut.") .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); + .InEnum({"tanh"}); AddAttr("candidate_activation", "(string, default: tanh)" "The activation for candidate hidden state, " "`tanh` by default.") .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); + .InEnum({"tanh"}); AddComment(R"DOC( -Fusion Long-Short Term Memory (LSTM) Operator. -This operator fuse the X into LSTM, more details can refer to LSTM op. +Attention Long-Short Term Memory (LSTM) Operator. 
+ +Attention part: +concat( x(seqlen * M), expand( cell_t-1(1,D) ) ) => tmp(seqlen*(M+D)) + +tmp(seqlen*(M+D)) * fc((M+D)*1) => fcout(seqlen*1) with bias, relu + +fcout(seqlen*1) * scalar => fcout(seqlen*1) with bias, relu + +dotmul and sum pool ( fcout(seqlen*1), x(seqlen * M) ) => lstm_x_t(1, M) + +LSTM part: +use lstm_x_t as input and compute as standard LSTM. + )DOC"); } +// y[i] = (x[i] + bias[0]) > 0 ? (x[i] + bias[0]) : 0; +template +inline void bias_relu(const int n, const T* x, const T* bias, T* y) { + if (bias) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] + bias[0]; + } + vec_relu(n, y, y); + } else { + vec_relu(n, x, y); + } +} + template -inline void ReorderInitState(const DeviceContext& ctx, - const framework::Tensor& src, - framework::Vector index_lod, - framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; - dst->mutable_data(src.dims(), ctx.GetPlace()); - // TODO(TJ): check mem copy perf - row_shuffle(ctx, src, index_lod, dst, indexed_src); +inline void vec_softmax(const BlasT& blas, const int n, + const T* x, T* y) { + T scalar = x[0]; + // max + for (int i = 1; i < n; ++i) { + scalar = scalar < x[i] ? x[i] : scalar; + } + + // sub + for (int i = 0; i < n; ++i) { + y[c] = x[c] - alpha; + } + + // exp + blas.VEXP(n, y, y); + + // sum + scalar = T(0); + for (int i = 0; i < n; ++i) { + scalar += y[i]; + } + + // scale + blas.VSCAL(n, static_cast(1) / scalar, y); +} + +__m256 exp(__m256 a) { return exp256_ps(a); } + +__m256 log(__m256 a) { return log256_ps(a); } + +__m256 sin(__m256 a) { return sin256_ps(a); } + +__m256 cos(__m256 a) { return cos256_ps(a); } + +__m256 relu(const __m256 a) { + __m256 tmp = _mm256_set1_ps(0.0f); + return _mm256_max_ps(a, tmp); +} + +__m256 sigmoid(const __m256 a) { + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); + __m256 tmp = _mm256_max_ps(a, min); + tmp = _mm256_min_ps(tmp, max); + tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); + tmp = exp(tmp); + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); + tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); + return tmp; +} + +__m256 tanh(const __m256 a) { + __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); + __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); + tmp = _mm256_min_ps(tmp, max); + tmp = exp(tmp); + return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), + _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), + _mm256_set1_ps(1.0f)); +} + +__m256 linear(const __m256 a) { return a; } + +inline void vec_sigmoid(const T* x, T* y) { + const real min = SIGMOID_THRESHOLD_MIN; + const real max = SIGMOID_THRESHOLD_MAX; + real tmp = (a < min) ? min : ((a > max) ? 
max : a); + return 1.0 / (1.0 + exp(-tmp)); } template -class FuisonLSTMKernel : public framework::OpKernel { +class AttentionLSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* wx = ctx.Input("WeightX"); - auto* wh = ctx.Input("WeightH"); - auto* bias = ctx.Input("Bias"); - auto* hidden_t0 = ctx.Input("H0"); - auto* cell_t0 = ctx.Input("C0"); - - auto* xx = ctx.Output("XX"); - auto* batched_gate = ctx.Output("BatchedGate"); - auto* hidden_out = ctx.Output("Hidden"); - auto* cell_out = ctx.Output("Cell"); - bool is_reverse = ctx.Attr("is_reverse"); - - T* xx_data = xx->mutable_data(ctx.GetPlace()); - T* batched_gate_data = batched_gate->mutable_data(ctx.GetPlace()); - hidden_out->mutable_data(ctx.GetPlace()); - cell_out->mutable_data(ctx.GetPlace()); + auto* x = ctx.Input("X"); // T x M + auto* h0 = ctx.Input("H0"); // N x D + auto* c0 = ctx.Input("C0"); // N x D + auto* atten_w = ctx.Input("AttentionWeight"); // (M+D) x 1 + auto* atten_b = ctx.Input("AttentionBias"); // 1x1 + auto* atten_scalar = ctx.Input("AttentionScalar"); // 1x1 + auto* atten_scalar_bias = ctx.Input("AttentionScalar"); // 1x1 + auto* lstm_w = ctx.Input("LSTMWeight"); // (D+M) x D*4 + auto* lstm_b = ctx.Input("LSTMBias"); // 1 x D*4 + + auto* hidden_out = ctx.Output("Hidden"); // TxD + auto* cell_out = ctx.Output("Cell"); // TxD + auto* atted_x = ctx.Output("AttentionedX"); // T x 1 + auto* fc_out = ctx.Output('AttentionFCOut'); // max_seq_len x 1 + auto* lstm_x = ctx.Output("LSTMX"); // 1 x M + auto* lstm_out = ctx.Output("LSTMOUT"); // 1 x 4D const T* x_data = x->data(); - const T* wx_data = wx->data(); - auto x_dims = x->dims(); - auto wx_dims = wx->dims(); - - math::LoDTensor2BatchFunctor to_batch; - auto& dev_ctx = ctx.template device_context(); - auto blas = math::GetBlas(dev_ctx); - if (x_dims[1] > wx_dims[1]) { - math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], - x_data, wx_data, xx_data, - bias->data()); - to_batch(dev_ctx, *xx, batched_gate, true, is_reverse); - } else { - to_batch(dev_ctx, *x, xx, true, is_reverse); - batched_gate->set_lod(xx->lod()); - math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], - xx_data, wx_data, batched_gate_data, - bias->data()); - } - - int frame_size = static_cast(wx_dims[1] / 4); - framework::DDim out_dims({x_dims[0], frame_size}); - math::LstmMetaValue lstm_value; - // no peephole - lstm_value.check_ig = nullptr; - lstm_value.check_fg = nullptr; - lstm_value.check_og = nullptr; - lstm_value.prev_state_value = nullptr; - Tensor ordered_c0; - - framework::Vector order(batched_gate->lod()[2]); - - if (cell_t0) { - // Since the batch computing for LSTM reorders the input sequence - // according to their length. The initialized cell state also needs - // to reorder. - ReorderInitState(dev_ctx, *cell_t0, order, &ordered_c0, - true); - lstm_value.prev_state_value = ordered_c0.data(); - } - - // Use the local variable as here. 
- LoDTensor batch_hidden, batch_cell; - auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); - batch_hidden.mutable_data(out_dims, ctx.GetPlace()); - batch_cell.mutable_data(out_dims, ctx.GetPlace()); - batch_cell_pre_act->mutable_data(out_dims, ctx.GetPlace()); - - auto batch_starts = batched_gate->lod()[0]; - size_t max_seq_len = batch_starts.size() - 1; - auto gate_act = math::detail::GetActivationType( - ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( - ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( - ctx.Attr("candidate_activation")); - - for (size_t n = 0; n < max_seq_len; n++) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - - Tensor gate_t = batched_gate->Slice(bstart, bend); - Tensor out_t = batch_hidden.Slice(bstart, bend); - Tensor cell_t = batch_cell.Slice(bstart, bend); - Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); - - int cur_batch_size = bend - bstart; - - if (n > 0) { - int pre_h_start = static_cast(batch_starts[n - 1]); - int pre_h_end = pre_h_start + cur_batch_size; - auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); - // TODO(TJ): use gemm directly - blas.MatMul(pre_hidden_t, false, *wh, false, static_cast(1.0), - &gate_t, static_cast(1.0)); - } else if (hidden_t0) { - // TODO(TJ): move h0 outside for - // If n == 0 and there is no initialized hidden state, that is to say - // the H0 is zeros, the calculation W_h * H0 will be skiped. - // If n == 0 and there is initialized hidden state, calculate W_h * H0. - - // Since the batch computing for LSTM reorders the input sequence - // according to their length. The initialized hidden state also needs - // to reorder. - Tensor ordered_h0; - ReorderInitState(dev_ctx, *hidden_t0, order, - &ordered_h0, true); - // TODO(TJ): use gemm directly - blas.MatMul(ordered_h0, false, *wh, false, static_cast(1.0), &gate_t, - static_cast(1.0)); + const T* h0_data = h0->data(); + const T* c0_data = c0->data(); + const T* lstm_w_data = lstm_w->data(); + const T* lstm_b_data = lstm_b->data(); + const T* atten_w_data = atten_w->data(); + const T* atten_b_data = atten_b ? atten_b->data() : NULL; + const T* atten_scalar_data = atten_scalar ? atten_scalar->data() : NULL; + const T* atten_scalar_bias_data = + atten_scalar_bias ? atten_scalar_bias->data() : NULL; + + T* hidden_out_data = hidden_out->mutable_data(); + T* cell_out_data = cell_out->mutable_data(); + T* atted_x_data = atted_x->mutable_data(); + T* fc_out_data = fc_out->mutable_data(); + T* lstm_x_data = lstm_x->mutable_data(); + T* lstm_out_data = lstm_out->mutable_data(); + + auto x_lod = x->lod(); + auto x_dims = x->dims(); // T x M + auto w_dims = w->dims(); // (D+M) x 4D + const int M = x_dims[1]; // x frame size + const int D = w_dims[1] / 4; // gate frame size + const int D2 = D * 2; + const int D3 = D * 3; + const int D4 = w_dims[1]; + const int batch_size = x_lod[0].size() - 1; // assert lod.size() == 1 + + // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1 + auto blas = math::GetBlas(ctx); + math::FCCompute(blas, T, 1, M, x_data, atten_w_data, + atted_x_data, atten_b_data); + + const T* cur_x_data = x_data; + const T* prev_cell_data = NULL; + const T* prev_hidden_data = NULL; + T* cur_cell_out_data = cell_out_data; + T* cur_hidden_out_data = hidden_out_data; + for (int i = 0; i < batch_size; ++i) { + int seq_len = x_lod[0][i + 1]; + prev_cell_data = c0_data + i * D; + prev_hidden_data = h0 ? 
h0_data + i * D : NULL; + + for (int step = 0; step < seq_len; ++step) { + /// compute attention vector + // prev_cell(1xD) * fc(D) rest part of atten_wgt + // T = cblas_dot(); + T prev_cell_bias = blas.VDOT(D, prev_cell_data, atten_w_data + M); + // add cell bias and relu + bias_relu(seq_len, atted_x_data, &prev_cell_bias, fc_out_data); + // fc2: scalar + if (atten_scalar_data) { + // x = a*x + blas.VSCAL(seq_len, atten_scalar_data, fc_out_data); + bias_relu(seq_len, fc_out_data, atten_scalar_bias_data, + fc_out_data); + } + vec_softmax(blas, seq_len, fc_out_data, fc_out_data); + // mul x(seq_len*M) and sum pool + math::FCCompute(blas, 1, M, seq_len, fc_out_data, + cur_x_data, lstm_x_data); + + /// compute LSTM step + // lstm weight : concat[forget , input , output , tilde] + // shape : (D + M) x (4 * D) + // fc inputX(1xM) * weightX(M*(4D)) => 1 x 4D + blas.MatMul(1, D4, M, lstm_x_data, lstm_w_data + D * D4, lstm_out_data); + if (prev_hidden_data) { + blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast(1), + prev_hidden_data, D, lstm_w_data, D4, static_cast(1), + lstm_out_data, D4); + } + // since input is 1xM, so can use add bias + blas.VADD(D4, lstm_b_data, lstm_out_data, lstm_out_data); + + // gate act: sigmoid + vec_sigmoid(D3, lstm_out_data, lstm_out_data); + // candicate act: tanh + vec_tanh(D, lstm_out_data + D3, lstm_out_data + D3); + + // a = forget * prev_cell + blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data); + + // b = input * tilde + blas.VMUL(D, lstm_out_data + D, lstm_out + D3, lstm_out_data + D); + + // cell_out = a + b + blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data); + + // state act tanh(cell_out) * output_gate + vec_tanh(D, cur_cell_out_data, lstm_out_data); + blas.VMUL(D, lstm_out_data, lstm_out + D2, cur_hidden_out_data); + + prev_hidden_data = hidden_out + i * gate_size; + prev_cell_data = cur_cell_out_data; + cur_cell_out_data = cur_cell_out_data + D; + cur_hidden_out_data = cur_hidden_out_data + D; } - - lstm_value.gate_value = gate_t.data(); - lstm_value.output_value = out_t.data(); - lstm_value.state_value = cell_t.data(); - lstm_value.state_active_value = cell_pre_act_t.data(); - math::LstmUnitFunctor::compute( - dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act, - cand_act); - lstm_value.prev_state_value = lstm_value.state_value; + cur_x_data = cur_x_data + seq_len * M; } - - math::Batch2LoDTensorFunctor to_seq; - batch_hidden.set_lod(batched_gate->lod()); - // restore the output hidden in LoDTensor from the batch hidden - to_seq(dev_ctx, batch_hidden, hidden_out); - - batch_cell.set_lod(batched_gate->lod()); - // restore the output cell state in LoDTensor from the batch cell - to_seq(dev_ctx, batch_cell, cell_out); } }; @@ -345,10 +430,11 @@ class FuisonLSTMKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker, +REGISTER_OPERATOR(attention_lstm, ops::AttentionLSTMOp, + ops::AttentionLSTMOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OP_CPU_KERNEL( - fusion_lstm, - ops::FuisonLSTMKernel, - ops::FuisonLSTMKernel); + attention_lstm, + ops::AttentionLSTMKernel, + ops::AttentionLSTMKernel); diff --git a/paddle/fluid/operators/attention_lstm_op.h b/paddle/fluid/operators/attention_lstm_op.h index 39dc09b4d1..6ede3a7f3c 100644 --- a/paddle/fluid/operators/attention_lstm_op.h +++ b/paddle/fluid/operators/attention_lstm_op.h @@ -13,7 +13,6 @@ See the License for the specific language 
governing permissions and limitations under the License. */ #pragma once -// #include #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -22,7 +21,7 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -class FusionLSTMOp : public framework::OperatorWithKernel { +class AttentionLSTMOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -33,7 +32,7 @@ class FusionLSTMOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override; }; -class FusionLSTMOpMaker : public framework::OpProtoAndCheckerMaker { +class AttentionLSTMOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override; }; diff --git a/paddle/fluid/operators/fusion_lstm_op.h b/paddle/fluid/operators/fusion_lstm_op.h index 39dc09b4d1..7f79601602 100644 --- a/paddle/fluid/operators/fusion_lstm_op.h +++ b/paddle/fluid/operators/fusion_lstm_op.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -// #include #include "paddle/fluid/framework/op_registry.h" namespace paddle { From 6ed20474d47a2577159a3799549c457e9f38f420 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 10:17:47 +0800 Subject: [PATCH 08/21] refine attention lstm infershape --- paddle/fluid/operators/attention_lstm_op.cc | 198 +++++++++++--------- 1 file changed, 111 insertions(+), 87 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 178a1c19a9..636deb04a1 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -26,86 +26,102 @@ namespace paddle { namespace operators { void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("WeightX"), - "Input(WeightX) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("WeightH"), - "Input(WeightH) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Bias"), - "Input(Bias) of LSTM should not be null."); - - PADDLE_ENFORCE(ctx->HasOutput("XX"), - "Output(XX) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(C0) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"), + "Input(LSTMWeight) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LSTMBias"), + "Input(LSTMBias) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"), + "Input(AttentionWeight) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), - "Output(Hidden) of LSTM should not be null."); + "Output(Hidden) of AttentionLSTM should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Cell"), - "Output(Cell) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"), - "Output(BatchedGate) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), - "Output(BatchedGate) of LSTM should not be null."); + "Output(Cell) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"), + "Output(AttentionedX) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"), + "Output(AttentionFCOut) of AttentionLSTM should not be 
null."); + PADDLE_ENFORCE(ctx->HasOutput("LSTMX"), + "Output(LSTMX) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"), + "Output(LSTMOUT) of AttentionLSTM should not be null."); auto x_dims = ctx->GetInputDim("X"); + const int M = x_dims[1]; PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + auto w_dims = ctx->GetInputDim("LSTMWeight"); + const int D = w_dims[1] / 4; + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(LSTMWeight)'s rank must be 2."); + PADDLE_ENFORCE_EQ(w_dims[0], D + M, + "LSTMWeight dims should be (%d + %d) * %d.", D + M, 4 * D); + + auto b_dims = ctx->GetInputDim("LSTMBias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(LSTMBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, "LSTMBias dims should be 1 x (%d + %d).", M, + D); + PADDLE_ENFORCE_EQ(b_dims[1], M + D, "LSTMBias dims should be 1 x (%d + %d).", + M, D); + + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2."); + PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D); if (ctx->HasInput("H0")) { - PADDLE_ENFORCE(ctx->HasInput("C0"), - "Input(Cell) and Input(Hidden) of LSTM should not " - "be null at the same time."); auto h_dims = ctx->GetInputDim("H0"); - auto c_dims = ctx->GetInputDim("C0"); PADDLE_ENFORCE(h_dims == c_dims, "The dimension of Input(H0) and Input(C0) " "should be the same."); } - // fc_out , shape (maxseqlen,1) - int max_seq_len = 0; - - auto wx_dims = ctx->GetInputDim("WeightX"); - PADDLE_ENFORCE_EQ(wx_dims.size(), 2, - "The rank of Input(WeightX) should be 2."); - PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], - "The first dimension of Input(WeightX) " - "should be %d.", - x_dims[1]); - - int frame_size = wx_dims[1] / 4; - auto wh_dims = ctx->GetInputDim("WeightH"); - PADDLE_ENFORCE_EQ(wh_dims.size(), 2, - "The rank of Input(WeightH) should be 2."); - PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, - "The first dimension of Input(WeightH) " - "should be %d.", - frame_size); - PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size, - "The second dimension of Input(WeightH) " - "should be 4 * %d.", - frame_size); - - auto b_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); - PADDLE_ENFORCE_EQ(b_dims[0], 1, - "The first dimension of Input(Bias) should be 1."); - - PADDLE_ENFORCE(!ctx->Attrs().Get("use_peepholes"), - "Do not support peephole yet."); - PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, - "The second dimension of Input(Bias) should be " - "4 * %d if disable peepholes connection", - frame_size); - - framework::DDim out_dims({x_dims[0], frame_size}); + auto atten_w_dims = ctx->GetInputDim("AttentionWeight"); + PADDLE_ENFORCE_EQ(atten_w_dims.size(), 2, + "Input(AttentionWeight)'s rank must be 2."); + PADDLE_ENFORCE_EQ(atten_w_dims[0], M + D, + "AttentionWeight shapes must be (%d + %d) * 1.", M, D); + PADDLE_ENFORCE_EQ(atten_w_dims[1], 1, + "AttentionWeight shapes must be (%d + %d) * 1.", M, D); + if (ctx->HasInput("AttentionBias")) { + auto atten_b_dims = ctx->GetInputDim("AttentionBias"); + PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2, + "Input(AttentionBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(atten_b_dims[0], 1, + "AttentionBias shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(atten_b_dims[1], 1, + "AttentionBias shapes must be 1 * 1."); + } + + if (ctx->HasInput("AttentionScalar")) { + auto dims = ctx->GetInputDim("AttentionScalar"); + PADDLE_ENFORCE_EQ(dims.size(), 2, + "Input(AttentionScalar)'s rank must be 2."); + 
PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalar shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalar shapes must be 1 * 1."); + } + + if (ctx->HasInput("AttentionScalarBias")) { + auto dims = ctx->GetInputDim("AttentionScalarBias"); + PADDLE_ENFORCE( + ctx->HasInput("AttentionScalar"), + "AttentionScalar should not be null when have AttentionScalarBias."); + PADDLE_ENFORCE_EQ(dims.size(), 2, + "Input(AttentionScalarBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalarBias shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalarBias shapes must be 1 * 1."); + } + + framework::DDim out_dims({x_dims[0], D}); ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Cell", out_dims); - ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]}); - ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->SetOutputDim("AttentionedX", {x_dims[0], 1}); + ctx->SetOutputDim("LSTMX", {1, M}); + ctx->SetOutputDim("LSTMOUT", {1, 4 * D}); + // AttentionFCOut should be reshape as (maxseqlen,1) in runtime ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Cell"); - - int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; - ctx->SetOutputDim("XX", {x_dims[0], xx_width}); - ctx->ShareLoD("X", "XX"); } framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType( @@ -164,11 +180,10 @@ void AttentionLSTMOpMaker::Make() { AddOutput("Cell", "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. " "The shape is (T x D), and lod is the same with the `Input`."); - AddOutput( - "AttentionedX", - "(LodTensor) shape is (T x 1), the result after X * AttentionWeight," - " where T is the total time steps in this mini-batch," - " D is the hidden size.") + AddOutput("AttentionedX", + "(Tensor) shape is (T x 1), the result after X * AttentionWeight," + " where T is the total time steps in this mini-batch," + " D is the hidden size.") .AsIntermediate(); AddOutput("AttentionFCOut", "(Tensor) (max_seq_len, 1), compute at each step.") @@ -316,12 +331,31 @@ class AttentionLSTMKernel : public framework::OpKernel { auto* lstm_w = ctx.Input("LSTMWeight"); // (D+M) x D*4 auto* lstm_b = ctx.Input("LSTMBias"); // 1 x D*4 - auto* hidden_out = ctx.Output("Hidden"); // TxD - auto* cell_out = ctx.Output("Cell"); // TxD - auto* atted_x = ctx.Output("AttentionedX"); // T x 1 - auto* fc_out = ctx.Output('AttentionFCOut'); // max_seq_len x 1 - auto* lstm_x = ctx.Output("LSTMX"); // 1 x M - auto* lstm_out = ctx.Output("LSTMOUT"); // 1 x 4D + auto* hidden_out = ctx.Output("Hidden"); // TxD + auto* cell_out = ctx.Output("Cell"); // TxD + auto* atted_x = ctx.Output("AttentionedX"); // T x 1 + auto* fc_out = ctx.Output('AttentionFCOut'); // max_seq_len x 1 + auto* lstm_x = ctx.Output("LSTMX"); // 1 x M + auto* lstm_out = ctx.Output("LSTMOUT"); // 1 x 4D + + // some shape should be reshape here since infershape can not get lod info + auto x_lod = x->lod(); + const int N = x_lod[0].size() - 1; // batch size + auto x_dims = x->dims(); // T x M + auto w_dims = w->dims(); // (D+M) x 4D + const int M = x_dims[1]; // x frame size + const int D = w_dims[1] / 4; // gate frame size + const int D2 = D * 2; + const int D3 = D * 3; + const int D4 = w_dims[1]; + int max_seq_len = x_lod[0][1]; + for (int i = 1; i < N; ++i) { + int len = x_lod[0][i + 1] - x_lod[0][i]; + max_seq_len = max_seq_len < len ? 
len : max_seq_len; + } + PADDLE_ENFORCE_EQ(x_lod.size(), 1, "Input(X)'s lod size must be 1."); + PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D); + fc_out->Resize({max_seq_len, 1}); const T* x_data = x->data(); const T* h0_data = h0->data(); @@ -341,16 +375,6 @@ class AttentionLSTMKernel : public framework::OpKernel { T* lstm_x_data = lstm_x->mutable_data(); T* lstm_out_data = lstm_out->mutable_data(); - auto x_lod = x->lod(); - auto x_dims = x->dims(); // T x M - auto w_dims = w->dims(); // (D+M) x 4D - const int M = x_dims[1]; // x frame size - const int D = w_dims[1] / 4; // gate frame size - const int D2 = D * 2; - const int D3 = D * 3; - const int D4 = w_dims[1]; - const int batch_size = x_lod[0].size() - 1; // assert lod.size() == 1 - // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1 auto blas = math::GetBlas(ctx); math::FCCompute(blas, T, 1, M, x_data, atten_w_data, @@ -361,7 +385,7 @@ class AttentionLSTMKernel : public framework::OpKernel { const T* prev_hidden_data = NULL; T* cur_cell_out_data = cell_out_data; T* cur_hidden_out_data = hidden_out_data; - for (int i = 0; i < batch_size; ++i) { + for (int i = 0; i < N; ++i) { int seq_len = x_lod[0][i + 1]; prev_cell_data = c0_data + i * D; prev_hidden_data = h0 ? h0_data + i * D : NULL; @@ -370,13 +394,13 @@ class AttentionLSTMKernel : public framework::OpKernel { /// compute attention vector // prev_cell(1xD) * fc(D) rest part of atten_wgt // T = cblas_dot(); - T prev_cell_bias = blas.VDOT(D, prev_cell_data, atten_w_data + M); + T prev_cell_bias = blas.DOT(D, prev_cell_data, atten_w_data + M); // add cell bias and relu bias_relu(seq_len, atted_x_data, &prev_cell_bias, fc_out_data); // fc2: scalar if (atten_scalar_data) { // x = a*x - blas.VSCAL(seq_len, atten_scalar_data, fc_out_data); + blas.SCAL(seq_len, atten_scalar_data, fc_out_data); bias_relu(seq_len, fc_out_data, atten_scalar_bias_data, fc_out_data); } From cf5ea925c3eea2f63b099513b85eaf5032db38fa Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 16:10:55 +0800 Subject: [PATCH 09/21] fix bugs --- paddle/fluid/operators/attention_lstm_op.cc | 123 +++++++++----------- paddle/fluid/operators/math/blas.h | 17 ++- paddle/fluid/operators/math/blas_impl.h | 3 +- 3 files changed, 75 insertions(+), 68 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 636deb04a1..87fda12ea6 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -15,12 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/fc_compute.h" -#include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" - -#include "paddle/fluid/operators/math/cpu_vec.h" +// #include "paddle/fluid/operators/math/detail/activation_functions.h" +// #include "paddle/fluid/operators/math/cpu_vec.h" namespace paddle { namespace operators { @@ -233,6 +230,13 @@ use lstm_x_t as input and compute as standard LSTM. )DOC"); } +template +inline void vec_relu(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + // y[i] = (x[i] + bias[0]) > 0 ? 
(x[i] + bias[0]) : 0; template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { @@ -240,14 +244,14 @@ inline void bias_relu(const int n, const T* x, const T* bias, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] + bias[0]; } - vec_relu(n, y, y); + vec_relu(n, y, y); } else { - vec_relu(n, x, y); + vec_relu(n, x, y); } } template -inline void vec_softmax(const BlasT& blas, const int n, +inline void vec_softmax(const math::BlasT& blas, const int n, const T* x, T* y) { T scalar = x[0]; // max @@ -257,7 +261,7 @@ inline void vec_softmax(const BlasT& blas, const int n, // sub for (int i = 0; i < n; ++i) { - y[c] = x[c] - alpha; + y[i] = x[i] - scalar; } // exp @@ -270,57 +274,45 @@ inline void vec_softmax(const BlasT& blas, const int n, } // scale - blas.VSCAL(n, static_cast(1) / scalar, y); + blas.SCAL(n, static_cast(1) / scalar, y); } -__m256 exp(__m256 a) { return exp256_ps(a); } +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 -__m256 log(__m256 a) { return log256_ps(a); } - -__m256 sin(__m256 a) { return sin256_ps(a); } - -__m256 cos(__m256 a) { return cos256_ps(a); } - -__m256 relu(const __m256 a) { - __m256 tmp = _mm256_set1_ps(0.0f); - return _mm256_max_ps(a, tmp); +template +inline T sigmoid(T x) { + return 1. / (1. + exp(-x)); } -__m256 sigmoid(const __m256 a) { - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); - __m256 tmp = _mm256_max_ps(a, min); - tmp = _mm256_min_ps(tmp, max); - tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); - tmp = exp(tmp); - tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); - tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); - return tmp; +template +inline T tanh(T x) { + return 2. * sigmoid(2. * x) - 1.; } -__m256 tanh(const __m256 a) { - __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); - __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); - tmp = _mm256_min_ps(tmp, max); - tmp = exp(tmp); - return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), - _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), - _mm256_set1_ps(1.0f)); +template +inline void vec_sigmoid(const int n, const T* x, T* y) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = 1.0 / (1.0 + std::exp(-tmp)); + } } -__m256 linear(const __m256 a) { return a; } - -inline void vec_sigmoid(const T* x, T* y) { - const real min = SIGMOID_THRESHOLD_MIN; - const real max = SIGMOID_THRESHOLD_MAX; - real tmp = (a < min) ? min : ((a > max) ? 
max : a); - return 1.0 / (1.0 + exp(-tmp)); +template +inline void vec_tanh(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = tanh(x[i]); + } } -template +template class AttentionLSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using DeviceContext = paddle::platform::CPUDeviceContext; auto* x = ctx.Input("X"); // T x M auto* h0 = ctx.Input("H0"); // N x D auto* c0 = ctx.Input("C0"); // N x D @@ -334,7 +326,7 @@ class AttentionLSTMKernel : public framework::OpKernel { auto* hidden_out = ctx.Output("Hidden"); // TxD auto* cell_out = ctx.Output("Cell"); // TxD auto* atted_x = ctx.Output("AttentionedX"); // T x 1 - auto* fc_out = ctx.Output('AttentionFCOut'); // max_seq_len x 1 + auto* fc_out = ctx.Output("AttentionFCOut"); // max_seq_len x 1 auto* lstm_x = ctx.Output("LSTMX"); // 1 x M auto* lstm_out = ctx.Output("LSTMOUT"); // 1 x 4D @@ -342,9 +334,10 @@ class AttentionLSTMKernel : public framework::OpKernel { auto x_lod = x->lod(); const int N = x_lod[0].size() - 1; // batch size auto x_dims = x->dims(); // T x M - auto w_dims = w->dims(); // (D+M) x 4D - const int M = x_dims[1]; // x frame size - const int D = w_dims[1] / 4; // gate frame size + auto w_dims = lstm_w->dims(); // (D+M) x 4D + const int total_T = x_dims[0]; + const int M = x_dims[1]; // x frame size + const int D = w_dims[1] / 4; // gate frame size const int D2 = D * 2; const int D3 = D * 3; const int D4 = w_dims[1]; @@ -357,6 +350,8 @@ class AttentionLSTMKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D); fc_out->Resize({max_seq_len, 1}); + // TODO(TJ): act functor init here + const T* x_data = x->data(); const T* h0_data = h0->data(); const T* c0_data = c0->data(); @@ -368,16 +363,16 @@ class AttentionLSTMKernel : public framework::OpKernel { const T* atten_scalar_bias_data = atten_scalar_bias ? 
atten_scalar_bias->data() : NULL; - T* hidden_out_data = hidden_out->mutable_data(); - T* cell_out_data = cell_out->mutable_data(); - T* atted_x_data = atted_x->mutable_data(); - T* fc_out_data = fc_out->mutable_data(); - T* lstm_x_data = lstm_x->mutable_data(); - T* lstm_out_data = lstm_out->mutable_data(); + T* hidden_out_data = hidden_out->mutable_data(ctx.GetPlace()); + T* cell_out_data = cell_out->mutable_data(ctx.GetPlace()); + T* atted_x_data = atted_x->mutable_data(ctx.GetPlace()); + T* fc_out_data = fc_out->mutable_data(ctx.GetPlace()); + T* lstm_x_data = lstm_x->mutable_data(ctx.GetPlace()); + T* lstm_out_data = lstm_out->mutable_data(ctx.GetPlace()); // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1 auto blas = math::GetBlas(ctx); - math::FCCompute(blas, T, 1, M, x_data, atten_w_data, + math::FCCompute(blas, total_T, 1, M, x_data, atten_w_data, atted_x_data, atten_b_data); const T* cur_x_data = x_data; @@ -400,7 +395,7 @@ class AttentionLSTMKernel : public framework::OpKernel { // fc2: scalar if (atten_scalar_data) { // x = a*x - blas.SCAL(seq_len, atten_scalar_data, fc_out_data); + blas.SCAL(seq_len, *atten_scalar_data, fc_out_data); bias_relu(seq_len, fc_out_data, atten_scalar_bias_data, fc_out_data); } @@ -431,16 +426,16 @@ class AttentionLSTMKernel : public framework::OpKernel { blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data); // b = input * tilde - blas.VMUL(D, lstm_out_data + D, lstm_out + D3, lstm_out_data + D); + blas.VMUL(D, lstm_out_data + D, lstm_out_data + D3, lstm_out_data + D); // cell_out = a + b blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data); // state act tanh(cell_out) * output_gate vec_tanh(D, cur_cell_out_data, lstm_out_data); - blas.VMUL(D, lstm_out_data, lstm_out + D2, cur_hidden_out_data); + blas.VMUL(D, lstm_out_data, lstm_out_data + D2, cur_hidden_out_data); - prev_hidden_data = hidden_out + i * gate_size; + prev_hidden_data = cur_hidden_out_data; prev_cell_data = cur_cell_out_data; cur_cell_out_data = cur_cell_out_data + D; cur_hidden_out_data = cur_hidden_out_data + D; @@ -458,7 +453,5 @@ REGISTER_OPERATOR(attention_lstm, ops::AttentionLSTMOp, ops::AttentionLSTMOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OP_CPU_KERNEL( - attention_lstm, - ops::AttentionLSTMKernel, - ops::AttentionLSTMKernel); +REGISTER_OP_CPU_KERNEL(attention_lstm, ops::AttentionLSTMKernel, + ops::AttentionLSTMKernel); diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 5aba170221..da185d93c0 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -160,7 +160,7 @@ class Blas { T DOT(int n, const T* x, const T* y) const; template - void SCAL(int n, const T a, const T* x) const; + void SCAL(int n, const T a, T* x) const; template void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, @@ -233,11 +233,26 @@ class BlasT : private Blas { Base()->template VCOPY(args...); } + template + void VEXP(ARGS... args) const { + Base()->template VEXP(args...); + } + template void GEMV(ARGS... args) const { Base()->template GEMV(args...); } + template + T DOT(ARGS... args) const { + return Base()->template DOT(args...); + } + + template + void SCAL(ARGS... args) const { + Base()->template SCAL(args...); + } + template void BatchedGEMM(ARGS... 
args) const { Base()->template BatchedGEMM(args...); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index eaad83ba18..e1df78d11e 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -415,8 +415,7 @@ T Blas::DOT(int n, const T *x, const T *y) const { template <> template -void Blas::SCAL(int n, const T a, - const T *x) const { +void Blas::SCAL(int n, const T a, T *x) const { #ifdef PADDLE_WITH_MKLML CBlas::SCAL(n, a, x, 1); #else From ec59f0d454569ef536c9ac0f7224bc7062b110ce Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 16:40:37 +0800 Subject: [PATCH 10/21] add cpu vec --- paddle/fluid/operators/attention_lstm_op.cc | 56 +++----------- paddle/fluid/operators/math/cpu_vec.h | 81 +++++++++++++++++++++ paddle/fluid/platform/cpu_info.cc | 2 + paddle/fluid/platform/cpu_info.h | 1 + 4 files changed, 95 insertions(+), 45 deletions(-) create mode 100644 paddle/fluid/operators/math/cpu_vec.h diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 87fda12ea6..14985a3f74 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" #include #include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" -// #include "paddle/fluid/operators/math/detail/activation_functions.h" -// #include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -230,13 +230,6 @@ use lstm_x_t as input and compute as standard LSTM. )DOC"); } -template -inline void vec_relu(const int n, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? x[i] : 0; - } -} - // y[i] = (x[i] + bias[0]) > 0 ? (x[i] + bias[0]) : 0; template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { @@ -244,9 +237,9 @@ inline void bias_relu(const int n, const T* x, const T* bias, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] + bias[0]; } - vec_relu(n, y, y); + math::vec_relu(n, y, y); } else { - vec_relu(n, x, y); + math::vec_relu(n, x, y); } } @@ -277,37 +270,6 @@ inline void vec_softmax(const math::BlasT& blas, const int n, blas.SCAL(n, static_cast(1) / scalar, y); } -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 - -template -inline T sigmoid(T x) { - return 1. / (1. + exp(-x)); -} - -template -inline T tanh(T x) { - return 2. * sigmoid(2. * x) - 1.; -} - -template -inline void vec_sigmoid(const int n, const T* x, T* y) { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - T tmp = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); - y[i] = 1.0 / (1.0 + std::exp(-tmp)); - } -} - -template -inline void vec_tanh(const int n, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = tanh(x[i]); - } -} - template class AttentionLSTMKernel : public framework::OpKernel { public: @@ -351,6 +313,10 @@ class AttentionLSTMKernel : public framework::OpKernel { fc_out->Resize({max_seq_len, 1}); // TODO(TJ): act functor init here + // if (platform::jit::MayIUse(platform::jit::avx2)) { + // } else if (platform::jit::MayIUse(platform::jit::avx)) { + // } else { + // } const T* x_data = x->data(); const T* h0_data = h0->data(); @@ -418,9 +384,9 @@ class AttentionLSTMKernel : public framework::OpKernel { blas.VADD(D4, lstm_b_data, lstm_out_data, lstm_out_data); // gate act: sigmoid - vec_sigmoid(D3, lstm_out_data, lstm_out_data); + math::vec_sigmoid(D3, lstm_out_data, lstm_out_data); // candicate act: tanh - vec_tanh(D, lstm_out_data + D3, lstm_out_data + D3); + math::vec_tanh(D, lstm_out_data + D3, lstm_out_data + D3); // a = forget * prev_cell blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data); @@ -432,7 +398,7 @@ class AttentionLSTMKernel : public framework::OpKernel { blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data); // state act tanh(cell_out) * output_gate - vec_tanh(D, cur_cell_out_data, lstm_out_data); + math::vec_tanh(D, cur_cell_out_data, lstm_out_data); blas.VMUL(D, lstm_out_data, lstm_out_data + D2, cur_hidden_out_data); prev_hidden_data = cur_hidden_out_data; diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h new file mode 100644 index 0000000000..29476fce70 --- /dev/null +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace math { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +template +inline T sigmoid(T x) { + return 1. / (1. + exp(-x)); +} + +template +inline T tanh(T x) { + return 2. * sigmoid(2. * x) - 1.; +} + +template +inline void vec_sigmoid(const int n, const T* x, T* y) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = 1.0 / (1.0 + std::exp(-tmp)); + } +} + +template +inline void vec_tanh(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = tanh(x[i]); + } +} + +template +inline void vec_relu(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +template <> +inline void vec_relu(const int n, const float* x, + float* y) { + // TODO(TJ): complete me + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? 
x[i] : 0; + } +} + +template <> +inline void vec_relu(const int n, const float* x, + float* y) { + // TODO(TJ): complete me + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 7d53a684d6..79a924434b 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -112,6 +112,8 @@ bool MayIUse(const cpu_isa_t cpu_isa) { switch (cpu_isa) { case sse42: return cpu.has(Cpu::tSSE42); + case avx: + return cpu.has(Cpu::tAVX); case avx2: return cpu.has(Cpu::tAVX2); case avx512_common: diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index f5f6766759..2baa21c1bd 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -43,6 +43,7 @@ namespace jit { typedef enum { isa_any, sse42, + avx, avx2, avx512_common, avx512_core, From 93cc29abc09ac3d9cc85d4490f878da46431cdda Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 16:54:57 +0800 Subject: [PATCH 11/21] init attention lstm op test --- .../tests/unittests/test_attention_lstm_op.py | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_attention_lstm_op.py diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py new file mode 100644 index 0000000000..cd555a022b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py @@ -0,0 +1,149 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +from test_fusion_lstm_op import fc, ACTIVATION + + +def attention_lstm( + x, # T x M + lod, # 1 x N + h0, # N x D + c0, # N x D + fcws, # (M+D) x 1, 1x1 + fcbs, # 1 x 1, 1x1 + w, # (M+D) x 4D + b, # 1 x 4D + act_gate, + act_cell, + act_cand): + hidden + cell + return hidden, cell + + +class TestAttentionLSTMOp(OpTest): + def set_conf(self): + self.lod = [[3]] + + def setUp(self): + self.op_type = 'attention_lstm' + self.lod = [[3]] + self.M = 30 + self.D = 15 + self.has_initial_hidden = True + self.act_gate = 'sigmoid' + self.act_cell = 'tanh' + self.act_cand = 'tanh' + self.set_conf() + + T = sum(self.lod[0]) + bs = len(self.lod[0]) + + x = np.random.normal(size=(T, self.M)).astype('float32') + c0 = np.random.normal(size=(bs, self.D)).astype('float32') + if self.has_initial_hidden: + h0 = np.random.normal(size=(bs, self.D)).astype('float32') + else: + h0 = np.zeros((bs, self.D)).astype('float32') + + fcw1 = np.random.normal(size=(self.M + self.D, 1)).astype('float32') + fcb1 = np.random.normal(size=(1, 1)).astype('float32') + fcw2 = np.random.normal(size=(1, 1)).astype('float32') + fcb2 = np.random.normal(size=(1, 1)).astype('float32') + + # lstm weight and bias + w = np.random.normal(size=(self.M + self.D, + self.D * 4)).astype('float32') + b = np.random.normal(size=(1, self.D * 4)).astype('float32') + + h, c = attention_lstm(x, self.lod, h0, c0, [fcw1, fcw2], [fcb1, fcb2], + ACTIVATION[self.act_gate], + ACTIVATION[self.act_cell], + ACTIVATION[self.act_cand]) + + self.inputs = { + 'X': (x, self.lod), + 'C0': c0, + 'AttentionWeight': fcw1, + 'AttentionBias': fcb1, + 'AttentionScalar': fcw2, + 'AttentionScalarBias': fcb2, + 'LSTMWeight': w, + 'LSTMBias': b + } + + if self.has_initial_hidden: + self.inputs['H0'] = h0 + + self.outputs = { + 'Hidden': (h, self.lod), + 'Cell': (c, self.lod), + } + self.attrs = { + 'gate_activation': self.act_gate, + 'cell_activation': self.act_cell, + 'candidate_activation': self.act_cand + } + + def test_check_output(self): + self.check_output() + + +class TestAttentionOpNonInit(TestAttentionLSTMOp): + def set_conf(self): + self.has_initial_hidden = False + + +class TestAttentionOpMD1(TestAttentionLSTMOp): + def set_conf(self): + self.M = 36 + self.D = 8 + + +class TestAttentionOpMD2(TestAttentionLSTMOp): + def set_conf(self): + self.M = 8 + self.D = 8 + + +class TestAttentionOpMD3(TestAttentionLSTMOp): + def set_conf(self): + self.M = 15 + self.D = 30 + + +class TestAttentionOpBS1(TestAttentionLSTMOp): + def set_conf(self): + self.lod = [[5]] + self.M = 16 + self.D = 32 + + +class TestAttentionOpBS2(TestAttentionLSTMOp): + def set_conf(self): + self.lod = [[3, 6]] + + +class TestAttentionOpBS5(TestAttentionLSTMOp): + def set_conf(self): + self.lod = [[3, 2, 4, 7, 5]] + + +if __name__ == '__main__': + unittest.main() From 522b3e411f33400ae2735e81c4bc65ca26438445 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 19:40:59 +0800 Subject: [PATCH 12/21] complete attention lstm op test --- .../tests/unittests/test_attention_lstm_op.py | 55 ++++++++++++++++++- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py index cd555a022b..dea6ec7668 100644 --- a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py @@ -18,6 +18,7 @@ import 
unittest import numpy as np from op_test import OpTest from test_fusion_lstm_op import fc, ACTIVATION +from test_softmax_op import stable_softmax def attention_lstm( @@ -32,8 +33,56 @@ def attention_lstm( act_gate, act_cell, act_cand): - hidden - cell + + T = sum(lod[0]) + N = len(lod[0]) + M = x.shape[1] + D = b.shape[1] / 4 + assert T == x.shape[0] + assert len(fcws) == len(fcbs) + + hidden = [] + cell = [] + + start_offset = 0 + for bid in range(N): + seq_len = lod[0][bid] + xi = np.copy(x[start_offset:seq_len, :]).reshape(seq_len, M) + prev_cell = np.copy(c0[bid]).reshape([1, D]) + prev_hidden = np.copy(h0[bid]).reshape([1, D]) + for step in range(seq_len): + expanded_cell = np.repeat(prev_cell, seq_len, axis=0) + tmp = np.concatenate((xi, expanded_cell), axis=1) + assert tmp.shape[1] == M + D + for fcid in range(len(fcbs)): + tmp = fc(tmp, fcws[fcid], fcbs[fcid]) + tmp = ACTIVATION['relu'](tmp) + tmp = np.reshape(tmp, (1, seq_len)) + tmp = stable_softmax(tmp).reshape(seq_len, 1) + lstmx = xi * tmp # seq * M + lstmx = np.sum(lstmx.reshape(seq_len, M), axis=0).reshape([1, M]) + lstmin = np.concatenate((prev_hidden, lstmx), axis=1) + lstmout = np.dot(lstmin, w).reshape([1, 4 * D]) + + g_f, g_i, g_o, cand = np.split(lstmout, 4, axis=1) + g_f = act_gate(g_f).reshape([1, D]) + g_i = act_gate(g_i).reshape([1, D]) + g_o = act_gate(g_o).reshape([1, D]) + cand = act_cand(cand).reshape([1, D]) + + cell_t = (prev_cell * g_f) + (g_i * cand) + hidden_t = g_o * act_cell(cell_t) + + hidden.append(hidden_t.flatten()) + cell.append(cell_t.flatten()) + + prev_cell = cell_t.reshape([1, D]) + prev_hidden = hidden_t.reshape([1, D]) + + start_offset += seq_len + + hidden = np.array(hidden).astype('float32').reshape([T, D]) + cell = np.array(cell).astype('float32').reshape([T, D]) return hidden, cell @@ -73,7 +122,7 @@ class TestAttentionLSTMOp(OpTest): b = np.random.normal(size=(1, self.D * 4)).astype('float32') h, c = attention_lstm(x, self.lod, h0, c0, [fcw1, fcw2], [fcb1, fcb2], - ACTIVATION[self.act_gate], + w, b, ACTIVATION[self.act_gate], ACTIVATION[self.act_cell], ACTIVATION[self.act_cand]) From dd938d0b948cca5f968411704a023efc8b2971f4 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 22:00:45 +0800 Subject: [PATCH 13/21] fix bugs and pass op test --- paddle/fluid/operators/attention_lstm_op.cc | 36 +++++++++---------- .../tests/unittests/test_attention_lstm_op.py | 9 ++--- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 14985a3f74..5d57703c0b 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -59,10 +59,8 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { auto b_dims = ctx->GetInputDim("LSTMBias"); PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(LSTMBias)'s rank must be 2."); - PADDLE_ENFORCE_EQ(b_dims[0], 1, "LSTMBias dims should be 1 x (%d + %d).", M, - D); - PADDLE_ENFORCE_EQ(b_dims[1], M + D, "LSTMBias dims should be 1 x (%d + %d).", - M, D); + PADDLE_ENFORCE_EQ(b_dims[0], 1, "LSTMBias dims should be 1 x %d.", 4 * D); + PADDLE_ENFORCE_EQ(b_dims[1], 4 * D, "LSTMBias dims should be 1 x %d.", 4 * D); auto c_dims = ctx->GetInputDim("C0"); PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2."); @@ -148,8 +146,8 @@ void AttentionLSTMOpMaker::Make() { "(Tensor) the weights of attention fc. Always relu the fc result." 
"The shape is ((M+D) x 1), where M is the dim size of x, D is the " "gate size of LSTM."); - AddInput("AttentionBias, optional", - "(Tensor) the bias of attention fc." + AddInput("AttentionBias", + "(Tensor, optional) the bias of attention fc." "The shape is (1 x 1)") .AsDispensable(); AddInput("AttentionScalar", @@ -281,7 +279,7 @@ class AttentionLSTMKernel : public framework::OpKernel { auto* atten_w = ctx.Input("AttentionWeight"); // (M+D) x 1 auto* atten_b = ctx.Input("AttentionBias"); // 1x1 auto* atten_scalar = ctx.Input("AttentionScalar"); // 1x1 - auto* atten_scalar_bias = ctx.Input("AttentionScalar"); // 1x1 + auto* atten_scalar_bias = ctx.Input("AttentionScalarBias"); // 1x1 auto* lstm_w = ctx.Input("LSTMWeight"); // (D+M) x D*4 auto* lstm_b = ctx.Input("LSTMBias"); // 1 x D*4 @@ -319,7 +317,7 @@ class AttentionLSTMKernel : public framework::OpKernel { // } const T* x_data = x->data(); - const T* h0_data = h0->data(); + const T* h0_data = h0 ? h0->data() : NULL; const T* c0_data = c0->data(); const T* lstm_w_data = lstm_w->data(); const T* lstm_b_data = lstm_b->data(); @@ -341,36 +339,35 @@ class AttentionLSTMKernel : public framework::OpKernel { math::FCCompute(blas, total_T, 1, M, x_data, atten_w_data, atted_x_data, atten_b_data); + const T* cur_atten_x_data = atted_x_data; const T* cur_x_data = x_data; const T* prev_cell_data = NULL; const T* prev_hidden_data = NULL; T* cur_cell_out_data = cell_out_data; T* cur_hidden_out_data = hidden_out_data; for (int i = 0; i < N; ++i) { - int seq_len = x_lod[0][i + 1]; + int seq_len = x_lod[0][i + 1] - x_lod[0][i]; prev_cell_data = c0_data + i * D; - prev_hidden_data = h0 ? h0_data + i * D : NULL; - + prev_hidden_data = h0_data ? h0_data + i * D : NULL; for (int step = 0; step < seq_len; ++step) { - /// compute attention vector - // prev_cell(1xD) * fc(D) rest part of atten_wgt - // T = cblas_dot(); + /// 1. compute attention vector + // 1a. prev_cell(1xD) * fc(D) rest part of atten_wgt T prev_cell_bias = blas.DOT(D, prev_cell_data, atten_w_data + M); - // add cell bias and relu - bias_relu(seq_len, atted_x_data, &prev_cell_bias, fc_out_data); - // fc2: scalar + // 1b. add cell bias and relu + bias_relu(seq_len, cur_atten_x_data, &prev_cell_bias, fc_out_data); + // 1c. fc scalar if (atten_scalar_data) { - // x = a*x blas.SCAL(seq_len, *atten_scalar_data, fc_out_data); bias_relu(seq_len, fc_out_data, atten_scalar_bias_data, fc_out_data); } + // 1d. softmax vec_softmax(blas, seq_len, fc_out_data, fc_out_data); // mul x(seq_len*M) and sum pool math::FCCompute(blas, 1, M, seq_len, fc_out_data, cur_x_data, lstm_x_data); - /// compute LSTM step + /// 2. 
compute LSTM step // lstm weight : concat[forget , input , output , tilde] // shape : (D + M) x (4 * D) // fc inputX(1xM) * weightX(M*(4D)) => 1 x 4D @@ -407,6 +404,7 @@ class AttentionLSTMKernel : public framework::OpKernel { cur_hidden_out_data = cur_hidden_out_data + D; } cur_x_data = cur_x_data + seq_len * M; + cur_atten_x_data = cur_atten_x_data + seq_len; } } }; diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py index dea6ec7668..cb02c7e586 100644 --- a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py @@ -40,19 +40,20 @@ def attention_lstm( D = b.shape[1] / 4 assert T == x.shape[0] assert len(fcws) == len(fcbs) - hidden = [] cell = [] start_offset = 0 for bid in range(N): seq_len = lod[0][bid] - xi = np.copy(x[start_offset:seq_len, :]).reshape(seq_len, M) + xi = np.copy(x[start_offset:start_offset + seq_len, :]).reshape(seq_len, + M) prev_cell = np.copy(c0[bid]).reshape([1, D]) prev_hidden = np.copy(h0[bid]).reshape([1, D]) for step in range(seq_len): expanded_cell = np.repeat(prev_cell, seq_len, axis=0) tmp = np.concatenate((xi, expanded_cell), axis=1) + assert tmp.shape[0] == seq_len assert tmp.shape[1] == M + D for fcid in range(len(fcbs)): tmp = fc(tmp, fcws[fcid], fcbs[fcid]) @@ -62,7 +63,7 @@ def attention_lstm( lstmx = xi * tmp # seq * M lstmx = np.sum(lstmx.reshape(seq_len, M), axis=0).reshape([1, M]) lstmin = np.concatenate((prev_hidden, lstmx), axis=1) - lstmout = np.dot(lstmin, w).reshape([1, 4 * D]) + lstmout = fc(lstmin, w, b).reshape([1, 4 * D]) g_f, g_i, g_o, cand = np.split(lstmout, 4, axis=1) g_f = act_gate(g_f).reshape([1, D]) @@ -88,7 +89,7 @@ def attention_lstm( class TestAttentionLSTMOp(OpTest): def set_conf(self): - self.lod = [[3]] + pass def setUp(self): self.op_type = 'attention_lstm' From ba168bd2d23f763f1b4c6357943da01890fc6421 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 22 Aug 2018 12:14:26 +0000 Subject: [PATCH 14/21] modify API.spec --- paddle/fluid/API.spec | 1 + paddle/fluid/operators/stack_op.h | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 9250cde1b2..c03df86e0f 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -162,6 +162,7 @@ paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)) +paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) diff --git 
a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index b139f48d87..c777d5feae 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -154,17 +154,22 @@ class StackKernel : public framework::OpKernel { if (std::is_same::value || n > kMaxThreshold) { #ifdef __NVCC__ + VLOG(10) << "Stack more than " << kMaxThreshold + << " tensors on GPU may be slow."; thrust::device_vector device_x_vec(x_datas); auto x_data_arr = device_x_vec.data().get(); #else auto x_data_arr = x_datas.data(); #endif StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); +#ifdef __NVCC__ + // Wait() must be called because device_x_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#endif } #ifdef __NVCC__ else { // NOLINT - VLOG(10) << "Stack more than " << kMaxThreshold - << " tensors on GPU may be slow."; framework::Array x_data_arr; for (int i = 0; i < n; ++i) x_data_arr[i] = x_datas[i]; StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); @@ -243,6 +248,8 @@ class StackGradKernel : public framework::OpKernel { if (std::is_same::value || n > kMaxThreshold) { #ifdef __NVCC__ + VLOG(10) << "Stack more than " << kMaxThreshold + << " tensors on GPU may be slow."; thrust::device_vector device_dx_vec(dx_datas); auto dx_data_arr = device_dx_vec.data().get(); #else @@ -250,11 +257,14 @@ class StackGradKernel : public framework::OpKernel { #endif StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post); +#ifdef __NVCC__ + // Wait() must be called because device_dx_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#endif } #ifdef __NVCC__ else { // NOLINT - VLOG(10) << "Stack more than " << kMaxThreshold - << " tensors on GPU may be slow."; framework::Array dx_data_arr; for (int i = 0; i < n; ++i) dx_data_arr[i] = dx_datas[i]; StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, From 5ca0bb9aadd50b10dc0e20bbc528604b8937e2c1 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 23 Aug 2018 00:01:45 +0800 Subject: [PATCH 15/21] support more activation type and remove some comments --- paddle/fluid/operators/attention_lstm_op.cc | 57 ++++++++++--------- paddle/fluid/operators/math/cpu_vec.h | 26 ++++++++- .../tests/unittests/test_attention_lstm_op.py | 9 +++ 3 files changed, 63 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 5d57703c0b..1cb65346ee 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" +#include #include #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" @@ -192,24 +193,23 @@ void AttentionLSTMOpMaker::Make() { "(Tensor) the output of LSTM X(1*(D+M))* weight((D+M)*4D) for each step." 
"Shape is (1 x 4D), where M is the x frame size") .AsIntermediate(); - // TODO(TJ): InEnum({"sigmoid", "tanh", "relu", "identity"}); AddAttr("gate_activation", "(string, default: sigmoid)" "The activation for input gate, forget gate and output " "gate, `sigmoid` by default.") .SetDefault("sigmoid") - .InEnum({"sigmoid"}); + .InEnum({"sigmoid", "tanh", "relu", "identity"}); AddAttr("cell_activation", "(string, default: tanh)" "The activation for cell output, `tanh` by defalut.") .SetDefault("tanh") - .InEnum({"tanh"}); + .InEnum({"sigmoid", "tanh", "relu", "identity"}); AddAttr("candidate_activation", "(string, default: tanh)" "The activation for candidate hidden state, " "`tanh` by default.") .SetDefault("tanh") - .InEnum({"tanh"}); + .InEnum({"sigmoid", "tanh", "relu", "identity"}); AddComment(R"DOC( Attention Long-Short Term Memory (LSTM) Operator. @@ -273,22 +273,23 @@ class AttentionLSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { using DeviceContext = paddle::platform::CPUDeviceContext; - auto* x = ctx.Input("X"); // T x M - auto* h0 = ctx.Input("H0"); // N x D - auto* c0 = ctx.Input("C0"); // N x D - auto* atten_w = ctx.Input("AttentionWeight"); // (M+D) x 1 - auto* atten_b = ctx.Input("AttentionBias"); // 1x1 - auto* atten_scalar = ctx.Input("AttentionScalar"); // 1x1 - auto* atten_scalar_bias = ctx.Input("AttentionScalarBias"); // 1x1 - auto* lstm_w = ctx.Input("LSTMWeight"); // (D+M) x D*4 - auto* lstm_b = ctx.Input("LSTMBias"); // 1 x D*4 - - auto* hidden_out = ctx.Output("Hidden"); // TxD - auto* cell_out = ctx.Output("Cell"); // TxD - auto* atted_x = ctx.Output("AttentionedX"); // T x 1 - auto* fc_out = ctx.Output("AttentionFCOut"); // max_seq_len x 1 - auto* lstm_x = ctx.Output("LSTMX"); // 1 x M - auto* lstm_out = ctx.Output("LSTMOUT"); // 1 x 4D + + auto* x = ctx.Input("X"); + auto* h0 = ctx.Input("H0"); + auto* c0 = ctx.Input("C0"); + auto* atten_w = ctx.Input("AttentionWeight"); + auto* atten_b = ctx.Input("AttentionBias"); + auto* atten_scalar = ctx.Input("AttentionScalar"); + auto* atten_scalar_bias = ctx.Input("AttentionScalarBias"); + auto* lstm_w = ctx.Input("LSTMWeight"); + auto* lstm_b = ctx.Input("LSTMBias"); + + auto* hidden_out = ctx.Output("Hidden"); + auto* cell_out = ctx.Output("Cell"); + auto* atted_x = ctx.Output("AttentionedX"); + auto* fc_out = ctx.Output("AttentionFCOut"); + auto* lstm_x = ctx.Output("LSTMX"); + auto* lstm_out = ctx.Output("LSTMOUT"); // some shape should be reshape here since infershape can not get lod info auto x_lod = x->lod(); @@ -310,11 +311,11 @@ class AttentionLSTMKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D); fc_out->Resize({max_seq_len, 1}); - // TODO(TJ): act functor init here - // if (platform::jit::MayIUse(platform::jit::avx2)) { - // } else if (platform::jit::MayIUse(platform::jit::avx)) { - // } else { - // } + math::VecActivations act_functor; + std::function act_gate, act_cell, act_cand; + act_gate = act_functor(ctx.Attr("gate_activation")); + act_cell = act_functor(ctx.Attr("cell_activation")); + act_cand = act_functor(ctx.Attr("candidate_activation")); const T* x_data = x->data(); const T* h0_data = h0 ? 
h0->data() : NULL; @@ -381,9 +382,9 @@ class AttentionLSTMKernel : public framework::OpKernel { blas.VADD(D4, lstm_b_data, lstm_out_data, lstm_out_data); // gate act: sigmoid - math::vec_sigmoid(D3, lstm_out_data, lstm_out_data); + act_gate(D3, lstm_out_data, lstm_out_data); // candicate act: tanh - math::vec_tanh(D, lstm_out_data + D3, lstm_out_data + D3); + act_cand(D, lstm_out_data + D3, lstm_out_data + D3); // a = forget * prev_cell blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data); @@ -395,7 +396,7 @@ class AttentionLSTMKernel : public framework::OpKernel { blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data); // state act tanh(cell_out) * output_gate - math::vec_tanh(D, cur_cell_out_data, lstm_out_data); + act_cell(D, cur_cell_out_data, lstm_out_data); blas.VMUL(D, lstm_out_data, lstm_out_data + D2, cur_hidden_out_data); prev_hidden_data = cur_hidden_out_data; diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 29476fce70..48c0da0e36 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - +#include #include "paddle/fluid/platform/cpu_info.h" namespace paddle { @@ -34,6 +34,12 @@ inline T tanh(T x) { return 2. * sigmoid(2. * x) - 1.; } +template +inline void vec_identity(const int n, const T* x, T* y) { + // do nothing + return; +} + template inline void vec_sigmoid(const int n, const T* x, T* y) { const T min = SIGMOID_THRESHOLD_MIN; @@ -76,6 +82,24 @@ inline void vec_relu(const int n, const float* x, } } +template +class VecActivations { + public: + std::function operator()( + const std::string& type) { + if (type == "sigmoid") { + return vec_sigmoid; + } else if (type == "relu") { + return vec_relu; + } else if (type == "tanh") { + return vec_tanh; + } else if (type == "identity" || type == "") { + return vec_identity; + } + PADDLE_THROW("Not support type %s.", type); + } +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py index cb02c7e586..a7382c2244 100644 --- a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py @@ -160,6 +160,15 @@ class TestAttentionOpNonInit(TestAttentionLSTMOp): self.has_initial_hidden = False +class TestAttentionOpAct(TestAttentionLSTMOp): + def set_conf(self): + self.M = 3 + self.D = 2 + self.act_gate = 'relu' + self.act_cell = 'tanh' + self.act_cand = 'sigmoid' + + class TestAttentionOpMD1(TestAttentionLSTMOp): def set_conf(self): self.M = 36 From 4e538db14d56af761d8adb8936a7f4f7435b7187 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 23 Aug 2018 00:04:03 +0800 Subject: [PATCH 16/21] refine jit space --- paddle/fluid/platform/cpu_info.cc | 13 ++++++++++--- paddle/fluid/platform/cpu_info.h | 3 --- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 79a924434b..fcd658d67c 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -103,9 +103,8 @@ size_t CUDAPinnedMaxChunkSize() { return CUDAPinnedMaxAllocSize() / 256; } -#ifdef PADDLE_WITH_XBYAK namespace jit { - +#ifdef PADDLE_WITH_XBYAK static Xbyak::util::Cpu cpu; bool MayIUse(const cpu_isa_t cpu_isa) { using 
namespace Xbyak::util; // NOLINT @@ -136,8 +135,16 @@ bool MayIUse(const cpu_isa_t cpu_isa) { } return false; } +#else +bool MayIUse(const cpu_isa_t cpu_isa) { + if (cpu_isa == isa_any) { + return true; + } else { + return false; + } +} +#endif } // namespace jit -#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 2baa21c1bd..5d17978dd7 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -37,9 +37,7 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); -#ifdef PADDLE_WITH_XBYAK namespace jit { - typedef enum { isa_any, sse42, @@ -56,7 +54,6 @@ typedef enum { inline bool MayIUse(const cpu_isa_t cpu_isa); } // namespace jit -#endif } // namespace platform } // namespace paddle From b8da70c37098beff9b5ccf3b13ac4eb6091e0f3f Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Thu, 23 Aug 2018 13:47:16 +0800 Subject: [PATCH 17/21] Resovle multi gpu async deps (#12828) * dist transpiler add control dependency var between send and recv * fix async deps * follow comments and refine * fix deps connect for rpc ops --- .../details/multi_devices_graph_pass.cc | 26 ++++++++++++++++--- paddle/fluid/framework/ir/node.cc | 2 +- paddle/fluid/framework/ir/node.h | 2 +- paddle/fluid/pybind/const_value.cc | 5 +++- python/paddle/fluid/framework.py | 6 +++++ .../fluid/transpiler/distribute_transpiler.py | 18 +++++++++++-- 6 files changed, 50 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index c5a13e7e1f..bc61b0eacb 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -763,6 +763,8 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, // Create RPC related op handles that connects its in ops and out ops. void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { + // FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode + // put them into transpiler. int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. @@ -771,26 +773,42 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by // split_byref op - // so that we can balance the variable blocks to all the pserver - // instances. 
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce && node->inputs[0]->Name().find(".block") == std::string::npos) { std::vector input_var_names; for (ir::Node *n : node->inputs) { input_var_names.push_back(n->Name()); } - op_dev_id = GetAppropriateDeviceID(input_var_names); + auto send_param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U); + op_dev_id = GetAppropriateDeviceID({send_param_grad[1]}); + VLOG(10) << "send grad " << input_var_names[0] << " origin " + << send_param_grad[1] << " place: " << op_dev_id; for (auto &varname : input_var_names) { result->Get(kShardedVarDevice) .emplace(varname, op_dev_id); } + result->Get(kShardedVarDevice) + .emplace(send_param_grad[1], op_dev_id); } } else if (node->Op()->Type() == "recv") { std::vector output_var_names; for (ir::Node *n : node->outputs) { output_var_names.push_back(n->Name()); } - op_dev_id = GetAppropriateDeviceID(output_var_names); + auto recv_param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + // FIXME(typhoonzero): assume each recv op output one param + // Use the same place as send. + if (recv_param_grad.size() == 2U) { + op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]); + VLOG(10) << "recv param " << recv_param_grad[0] + << " get grad place: " << recv_param_grad[1] + << " place: " << op_dev_id; + } else { + op_dev_id = GetAppropriateDeviceID(output_var_names); + } for (auto &varname : output_var_names) { result->Get(kShardedVarDevice) .emplace(varname, op_dev_id); diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index aca77da8d6..65c45c7d20 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { -const char Node::kControlDepVarName[] = "__control_var"; +constexpr char Node::kControlDepVarName[]; } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 63277d2d01..aab3180e7e 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -27,7 +27,7 @@ namespace ir { class Node { public: enum class Type { kOperation, kVariable }; - static const char kControlDepVarName[]; + static constexpr char kControlDepVarName[] = "__control_var"; explicit Node(const std::string& name, Type type) : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 9094f6051c..a81715c3b3 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/pybind/const_value.h" -#include +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" namespace paddle { @@ -24,6 +25,8 @@ void BindConstValue(pybind11::module* m) { m->def("kTempVarName", [] { return framework::kTempVarName; }); m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); + m->def("kControlDepVarName", + [] { return framework::ir::Node::kControlDepVarName; }); auto op_proto_and_checker_maker = m->def_submodule("op_proto_and_checker_maker"); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 389fce1874..e0ddd3b5ff 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -50,6 +50,12 @@ EMPTY_VAR_NAME = core.kEmptyVarName() TEMP_VAR_NAME = core.kTempVarName() GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() +CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() + + +def generate_control_dev_var_name(): + import random + return CONTROL_DEP_VAR_PREFIX + "@" + str(random.random()) def grad_var_name(var_name): diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 540eb8c833..80d9758b3d 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -212,8 +212,10 @@ class DistributeTranspiler(object): ps_dispatcher = self.config.split_method(self.pserver_endpoints) self.has_distributed_lookup_table = self._has_distributed_lookup_table() self.param_name_to_grad_name = dict() + self.grad_name_to_param_name = dict() for param_var, grad_var in self.params_grads: self.param_name_to_grad_name[param_var.name] = grad_var.name + self.grad_name_to_param_name[grad_var.name] = param_var.name # add distributed attrs to program self.origin_program._is_distributed = True @@ -262,8 +264,10 @@ class DistributeTranspiler(object): AssertionError("Can not insert the send op by original " "variable name :", splited_grad_varname) - dummy_output = program.global_block().create_var() + dummy_output = program.global_block().create_var( + name=framework.generate_control_dev_var_name()) grad_name_to_send_dummy_out[grad_varname] = dummy_output + program.global_block()._insert_op( index=index + 1, type="send", @@ -272,6 +276,8 @@ class DistributeTranspiler(object): attrs={ "epmap": eplist, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + OP_ROLE_VAR_ATTR_NAME: + [self.grad_name_to_param_name[grad_varname], grad_varname], "sync_mode": not self.sync_mode, }) for _, var in enumerate(splited_vars): @@ -313,6 +319,10 @@ class DistributeTranspiler(object): attrs={ "epmap": eps, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + OP_ROLE_VAR_ATTR_NAME: [ + param_varname, + self.param_name_to_grad_name[param_varname] + ], "sync_mode": not self.sync_mode }) @@ -971,7 +981,11 @@ class DistributeTranspiler(object): attrs={ "sync_mode": True, "epmap": pserver_endpoints, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + OP_ROLE_VAR_ATTR_NAME: [ + self.grad_name_to_param_name[table_grad_name], + table_grad_name + ] }) break From 8ad90558047fcd844db270a5744c78bf772242fb Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 23 Aug 2018 14:00:58 +0800 Subject: [PATCH 18/21] Add is_test for while_op (#12874) * add is_test for while_op * Change API --- 
paddle/fluid/API.spec | 2 +- paddle/fluid/operators/while_op.cc | 7 +++++++ python/paddle/fluid/layers/control_flow.py | 7 +++++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 9250cde1b2..bbf1623c39 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -191,7 +191,7 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 48e37796e1..65a3bc928e 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -58,11 +58,15 @@ class WhileOp : public framework::OperatorBase { PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), "Condition of while op must in CPU memory."); + bool is_test = Attr("is_test"); auto ctx = executor.Prepare(*program, block->ID()); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); executor.RunPreparedContext(ctx.get(), ¤t_scope, false); + if (is_test) { + scope.DeleteScope(¤t_scope); + } } } }; @@ -88,6 +92,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "variables generated in the i'th step."); AddAttr(kStepBlock, "The step block inside WhileOp"); + AddAttr("is_test", "True if in test phase.").SetDefault(false); AddComment(R"DOC( )DOC"); } @@ -103,6 +108,8 @@ class WhileGradOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { + PADDLE_ENFORCE(!Attr("is_test"), + "GradOp is only callable when is_test is false"); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 8bfe11916b..d2954c4c22 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -661,6 +661,7 @@ class While(object): Args: cond (Variable): condition used to compare. + is_test(bool): A flag indicating whether execution is in test phase. name (str): The name of this layer. 
Examples: @@ -683,7 +684,7 @@ class While(object): IN_WHILE_BLOCK = 1 AFTER_WHILE_BLOCK = 2 - def __init__(self, cond, name=None): + def __init__(self, cond, is_test=False, name=None): self.helper = LayerHelper("while", name=name) self.status = While.BEFORE_WHILE_BLOCK if not isinstance(cond, Variable): @@ -694,6 +695,7 @@ class While(object): if reduce(lambda a, b: a * b, cond.shape, 1) != 1: raise TypeError("condition should be a bool scalar") self.cond_var = cond + self.is_test = is_test def block(self): return WhileGuard(self) @@ -735,7 +737,8 @@ class While(object): }, outputs={'Out': out_vars, 'StepScopes': [step_scope]}, - attrs={'sub_block': while_block}) + attrs={'sub_block': while_block, + "is_test": self.is_test}) def lod_rank_table(x, level=0): From 9c7fde45a7fec127e3f7dc7e1c161ec647e5683b Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 23 Aug 2018 13:32:02 +0800 Subject: [PATCH 19/21] enhance test_analyzer to profile ditu inference demo --- .../ir/graph_pattern_detecter_tester.cc | 4 +- paddle/fluid/framework/selected_rows.cc | 4 +- .../inference/analysis/analyzer_tester.cc | 48 +++++++++++-------- paddle/fluid/operators/sampling_id_op.h | 2 +- paddle/scripts/paddle_build.sh | 2 - 5 files changed, 32 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc index 993c885a81..06f9df5546 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc @@ -163,8 +163,8 @@ TEST(GraphPatternDetecter, MultiSubgraph) { // 3. Detect op2 -> var2 -> op4 // 4. Detect op2 -> var3 -> op5 // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2 - ASSERT_GE(count, 1UL); - ASSERT_LE(count, 2UL); + ASSERT_GE(count, 1); + ASSERT_LE(count, 2); } } // namespace ir diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index c202b0a5be..a4319ffabb 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -139,7 +139,7 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) { } auto write_iter = id_to_index_.find(key); if (write_iter == id_to_index_.end()) { - size_t row_num = rows_.size(); + int row_num = rows_.size(); if (row_num == value_->dims()[0]) { rwlock_->UNLock(); PADDLE_THROW("selected rows is full, then length exceed %d", row_num); @@ -182,7 +182,7 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0], "output tensor should have the same shape with table " "except the dims[0]."); - for (size_t i = 0; i < ids.numel(); ++i) { + for (int i = 0; i < ids.numel(); ++i) { int64_t index = AutoGrownIndex(ids.data()[i], auto_grown); framework::VisitDataType( framework::ToDataType(value_->type()), diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 52f5c4f5ae..baa7600283 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -23,6 +23,8 @@ DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN"); DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN"); +DEFINE_int32(batch_size, 10, "batch size."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); namespace paddle { namespace inference { @@ -92,7 +94,7 @@ struct DataRecord { size_t 
batch_iter{0}; size_t batch_size{1}; DataRecord() = default; - DataRecord(const std::string &path, int batch_size = 1) + explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { Load(path); } @@ -165,7 +167,6 @@ struct DataRecord { }; void PrepareInputs(std::vector *input_slots, DataRecord *data, int batch_size) { - // DataRecord data(FLAGS_datapath, batch_size); PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, week_tensor, minute_tensor; lod_attention_tensor.name = "data_lod_attention"; @@ -174,28 +175,33 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, week_tensor.name = "week"; minute_tensor.name = "minute"; auto one_batch = data->NextBatch(); - // clang-format off - std::vector rnn_link_data_shape - ({static_cast(one_batch.rnn_link_data.size()), static_cast(one_batch.rnn_link_data.front().size())}); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); lod_attention_tensor.shape.assign({1, 2}); lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2}); init_zero_tensor.shape.assign({batch_size, 15}); init_zero_tensor.lod.assign({one_batch.lod3}); lod_tensor_tensor.shape = rnn_link_data_shape; lod_tensor_tensor.lod.assign({one_batch.lod1}); - week_tensor.shape.assign({(int) one_batch.rnn_week_datas.size(), (int) one_batch.rnn_week_datas.front().size()}); + // clang-format off + week_tensor.shape.assign( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); week_tensor.lod.assign({one_batch.lod3}); - minute_tensor.shape.assign({(int) one_batch.rnn_minute_datas.size(), - (int) one_batch.rnn_minute_datas.front().size()}); + minute_tensor.shape.assign( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); minute_tensor.lod.assign({one_batch.lod3}); + // clang-format on // assign data - TensorAssignData(&lod_attention_tensor, std::vector>({{0, 0}})); + TensorAssignData(&lod_attention_tensor, + std::vector>({{0, 0}})); std::vector tmp_zeros(batch_size * 15, 0.); TensorAssignData(&init_zero_tensor, {tmp_zeros}); TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data); TensorAssignData(&week_tensor, one_batch.rnn_week_datas); TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas); - // clang-format on // Set inputs. 
auto init_zero_tensor1 = init_zero_tensor; init_zero_tensor1.name = "hidden_init"; @@ -231,12 +237,9 @@ std::string DescribeTensor(const PaddleTensor &tensor) { os << "\n"; os << " - data: "; - // clang-format off - int dim = std::accumulate(tensor.shape.begin(), - tensor.shape.end(), - 1, - [](int a, int b) { return a * b; }); // clang-format on - for (size_t i = 0; i < dim; i++) { + int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1, + [](int a, int b) { return a * b; }); + for (int i = 0; i < dim; i++) { os << static_cast(tensor.data.data())[i] << " "; } os << '\n'; @@ -300,13 +303,16 @@ void TestDituRNNPrediction(const std::string &model_path, for (int i = 0; i < num_times; i++) { predictor->Run(input_slots, &outputs); } - LOG(INFO) << "time/batch: " << timer.toc() / num_times; + LOG(INFO) << "===========profile result==========="; + LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times + << ", latency: " << timer.toc() / num_times << "ms"; + LOG(INFO) << "====================================="; for (auto &out : outputs) { size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, [](int a, int b) { return a * b; }); float *data = static_cast(out.data.data()); - for (int i = 0; + for (size_t i = 0; i < std::min(sizeof(ditu_rnn_target_data) / sizeof(float), size); i++) { EXPECT_NEAR(data[i], ditu_rnn_target_data[i], 1e-3); @@ -336,7 +342,7 @@ TEST(Analyzer, SupportIRPass) { // Directly infer with the original model. TEST(Analyzer, DituRNN_without_analysis) { TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, - 10, false, false); + FLAGS_batch_size, false, false, FLAGS_repeat); } // Inference with the original model with the analysis turned on, the analysis @@ -344,14 +350,14 @@ TEST(Analyzer, DituRNN_without_analysis) { TEST(Analyzer, DituRNN_with_analysis) { LOG(INFO) << "ditu rnn with analysis"; TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, - 10, true, false, 1); + FLAGS_batch_size, true, false, FLAGS_repeat); } // Inference with analysis and IR. The IR module will fuse some large kernels. 
TEST(Analyzer, DituRNN_with_analysis_with_IR) { LOG(INFO) << "ditu rnn with analysis and IR fuse"; TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, - 10, true, true, 1); + FLAGS_batch_size, true, true, FLAGS_repeat); } } // namespace analysis diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h index f730a9746d..e1dd4539b3 100644 --- a/paddle/fluid/operators/sampling_id_op.h +++ b/paddle/fluid/operators/sampling_id_op.h @@ -54,7 +54,7 @@ class SamplingIdKernel : public framework::OpKernel { static_cast(context.Attr("max"))); std::vector ids(batch_size); - for (size_t i = 0; i < batch_size; ++i) { + for (int i = 0; i < batch_size; ++i) { T r = dist(engine); int idx = width - 1; for (int j = 0; j < width; ++j) { diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8460f93b84..f2a9a6b3b9 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -116,7 +116,6 @@ function cmake_gen() { -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} - -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} -DPY_VERSION=${PY_VERSION:-2.7} ======================================== EOF @@ -146,7 +145,6 @@ EOF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ - -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} \ -DPY_VERSION=${PY_VERSION:-2.7} } From acdd95d5caf92f38a995bc6d2edf20a56520d799 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 23 Aug 2018 16:47:12 +0800 Subject: [PATCH 20/21] bug fix --- paddle/fluid/operators/sampling_id_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h index f730a9746d..3f7860e1fa 100644 --- a/paddle/fluid/operators/sampling_id_op.h +++ b/paddle/fluid/operators/sampling_id_op.h @@ -63,7 +63,7 @@ class SamplingIdKernel : public framework::OpKernel { break; } } - ids[i] = ins_vector[i * width + idx]; + ids[i] = ins_vector[idx]; } std::vector out_dim; From 405d6d09e1b2199711818540cfcfb87494999852 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 23 Aug 2018 19:26:28 +0800 Subject: [PATCH 21/21] Fix doc typo. (#12863) --- doc/fluid/dev/new_op_cn.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md index c00f73be95..ff7408111f 100644 --- a/doc/fluid/dev/new_op_cn.md +++ b/doc/fluid/dev/new_op_cn.md @@ -36,19 +36,19 @@ OpProtoMake定义 -`.cc`文件,Backward Op不需要定义OpProtoMake +.cc 文件,Backward Op不需要定义OpProtoMake Op定义 - `.cc`文件 + .cc 文件 Kernel实现 - CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 + CPU、CUDA共享Kernel实现在.h 文件中,否则,CPU 实现在.cc 文件中,CUDA 实现在.cu 文件中。 注册Op - Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 + Op注册实现在.cc 文件;Kernel注册CPU实现在.cc 文件中,CUDA实现在.cu 文件中 @@ -391,7 +391,7 @@ PADDLE_ENFORCE(ctx->HasInput("X"), ""); ``` 问题示例2 :提示信息过于简单 ``` -PADDLE_ENFORCE(i != nullptr, "I must be set"); // I是什么? +PADDLE_ENFORCE(i != nullptr, "i must be set"); // i是什么? ``` 2. 在报错信息中使用开发人员定义的变量缩写,不易理解!
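[Reviewer note on PATCH 18/21, the while_op `is_test` flag] At the Python layer the new flag is just an extra constructor argument on `fluid.layers.While`; when it is set, WhileOp deletes each step scope as soon as the step finishes instead of keeping all step scopes for the backward pass, and WhileGradOp refuses to run. Below is a minimal usage sketch assembled from the standard counter-loop pattern; treat it as an illustration of the new argument rather than a snippet taken from the patch.

```python
import paddle.fluid as fluid

i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
limit = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)

cond = fluid.layers.less_than(x=i, y=limit)
# is_test=True makes WhileOp drop every step scope right after the step runs;
# use it only for inference, since the gradient op is disabled when it is set.
while_op = fluid.layers.While(cond=cond, is_test=True)
with while_op.block():
    i = fluid.layers.increment(x=i, in_place=True)
    fluid.layers.less_than(x=i, y=limit, cond=cond)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
exe.run(fluid.default_main_program())
```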