From 53b0e427092219b402f0ed6fab4235c3b70fdc7c Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 17 Aug 2017 16:19:59 +0800
Subject: [PATCH 1/5] Add EigenGemm.

---
 paddle/function/EigenGemm.cpp   | 92 ++++++++++++++++++++++++++++++
 paddle/function/GemmFunctor.cpp | 85 ++++++++++++++++++++++++++++
 paddle/function/GemmFunctor.h   | 99 +++++++++++----------------------
 3 files changed, 211 insertions(+), 65 deletions(-)
 create mode 100644 paddle/function/EigenGemm.cpp
 create mode 100644 paddle/function/GemmFunctor.cpp
diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp
new file mode 100644
index 0000000000..0b4220fcbe
--- /dev/null
+++ b/paddle/function/EigenGemm.cpp
@@ -0,0 +1,92 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+
+template <class T>
+struct EigenBlasGemm {
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
+                           Eigen::Aligned>
+      Matrix;
+
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    Eigen::array<int, 2> sizeA;
+    if (transA) {
+      sizeA[0] = K;
+      sizeA[1] = M;
+      CHECK_EQ(M, lda);
+    } else {
+      sizeA[0] = M;
+      sizeA[1] = K;
+      CHECK_EQ(K, lda);
+    }
+    Eigen::array<int, 2> sizeB;
+    if (transB) {
+      sizeB[0] = N;
+      sizeB[1] = K;
+      CHECK_EQ(K, ldb);
+    } else {
+      sizeB[0] = K;
+      sizeB[1] = N;
+      CHECK_EQ(N, ldb);
+    }
+    Eigen::array<int, 2> sizeC;
+    sizeC[0] = M;
+    sizeC[1] = N;
+    CHECK_EQ(N, ldc);
+
+    const Matrix a(const_cast<T*>(A), sizeA);
+    const Matrix b(const_cast<T*>(B), sizeB);
+    Matrix c(C, sizeC);
+
+    typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
+    Eigen::array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    dims[0].first = transA ? 0 : 1;
+    dims[0].second = transB ? 1 : 0;
+
+    Eigen::DefaultDevice device;
+    if (alpha == T(1) && beta == T(0)) {
+      c.device(device) = a.contract(b, dims);
+    } else if (alpha == T(1) && beta == T(1)) {
+      c.device(device) += a.contract(b, dims);
+    } else {
+      c.device(device) =
+          c.constant(alpha) * a.contract(b, dims) + c.constant(beta) * c;
+    }
+  }
+};
+
+#ifdef PADDLE_TYPE_DOUBLE
+template class EigenBlasGemm<double>;
+#else
+template class EigenBlasGemm<float>;
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/GemmFunctor.cpp b/paddle/function/GemmFunctor.cpp
new file mode 100644
index 0000000000..8df9b884fe
--- /dev/null
+++ b/paddle/function/GemmFunctor.cpp
@@ -0,0 +1,85 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "GemmFunctor.h"
+#include "paddle/math/MathFunctions.h"
+
+namespace paddle {
+
+template <class T>
+struct BlasGemm<DEVICE_TYPE_CPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    gemm<T>(transA == false ? CblasNoTrans : CblasTrans,
+            transB == false ? CblasNoTrans : CblasTrans,
+            M,
+            N,
+            K,
+            alpha,
+            A,
+            lda,
+            B,
+            ldb,
+            beta,
+            C,
+            ldc);
+  }
+};
+
+template <class T>
+struct BlasGemm<DEVICE_TYPE_GPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    hl_matrix_mul((T*)A,
+                  transA == false ? HPPL_OP_N : HPPL_OP_T,
+                  (T*)B,
+                  transB == false ? HPPL_OP_N : HPPL_OP_T,
+                  C,
+                  M,
+                  N,
+                  K,
+                  alpha,
+                  beta,
+                  lda,
+                  ldb,
+                  ldc);
+  }
+};
+
+template class BlasGemm<DEVICE_TYPE_CPU, real>;
+template class BlasGemm<DEVICE_TYPE_GPU, real>;
+
+}  // namespace paddle
diff --git a/paddle/function/GemmFunctor.h b/paddle/function/GemmFunctor.h
index d5db5cf5e7..0809953b4e 100644
--- a/paddle/function/GemmFunctor.h
+++ b/paddle/function/GemmFunctor.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/math/MathFunctions.h"
+#include "TensorType.h"
 
 namespace paddle {
 
@@ -24,73 +24,42 @@ namespace paddle {
 // of MatMulFunction, we need to consider the reconstruction of hl_matrix_mul
 // interface.
 template <DeviceType Device, class T>
-class GemmFunctor {
-public:
-  void operator()(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE TransB,
-                  const int M,
-                  const int N,
-                  const int K,
-                  const T alpha,
-                  const T* A,
-                  const int lda,
-                  const T* B,
-                  const int ldb,
-                  const T beta,
-                  T* C,
-                  const int ldc);
+struct BlasGemm {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc);
 };
 
+// TODO(hedaoyuan): Since the definition of the real type in the Paddle
+// conflicts with the Eigen library, so compile the Eigen code can not
+// include the Paddle header file. And need an EigenBlasGemm template class
+// that does not contain the DeviceType parameter.
+// I will fix this problem and merge BlasGemm and EigenBlasGemm into one.
 template <class T>
-class GemmFunctor<DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE TransB,
-                  const int M,
-                  const int N,
-                  const int K,
-                  const T alpha,
-                  const T* A,
-                  const int lda,
-                  const T* B,
-                  const int ldb,
-                  const T beta,
-                  T* C,
-                  const int ldc) {
-    gemm<T>(transA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
-  }
-};
-
-template <class T>
-class GemmFunctor<DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE TransB,
-                  const int M,
-                  const int N,
-                  const int K,
-                  const T alpha,
-                  const T* A,
-                  const int lda,
-                  const T* B,
-                  const int ldb,
-                  const T beta,
-                  T* C,
-                  const int ldc) {
-    hl_matrix_mul((T*)A,
-                  transA == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
-                  (T*)B,
-                  TransB == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
-                  C,
-                  M,
-                  N,
-                  K,
-                  alpha,
-                  beta,
-                  lda,
-                  ldb,
-                  ldc);
-  }
+struct EigenBlasGemm {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc);
 };
 
 }  // namespace paddle

From ec2ba242060fc10b2045533fdcb410cfbd473cec Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 17 Aug 2017 16:22:30 +0800
Subject: [PATCH 2/5] Fix GemmConvFunction.

---
 paddle/function/CMakeLists.txt |  2 +
 paddle/function/GemmConvOp.cpp | 82 ++++++++++++++++------------------
 2 files changed, 41 insertions(+), 43 deletions(-)

diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 7dfb6f61c5..9187294a49 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -4,6 +4,8 @@ file(GLOB cpp_files . *Op.cpp)
 list(APPEND h_files Function.h)
 list(APPEND cpp_files Function.cpp)
 list(APPEND cpp_files BufferArg.cpp)
+list(APPEND cpp_files GemmFunctor.cpp)
+list(APPEND cpp_files EigenGemm.cpp)
 
 if(WITH_GPU)
     file(GLOB cu_files . *OpGpu.cu)
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index 0ada4d70a0..f8cf4ebea8 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -85,7 +85,6 @@ public:
     }
 
     Im2ColFunctor<kCFO, Device, real> im2col;
-    GemmFunctor<Device, real> gemm;
     size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
@@ -108,19 +107,19 @@ public:
         int M = outputChannels / groups_;
         int N = outputHeight * outputWidth;
         int K = inputChannels / groups_ * filterHeight * filterWidth;
-        gemm(CblasNoTrans,
-             CblasNoTrans,
-             M,
-             N,
-             K,
-             1.0f,
-             filterData + g * filterOffset,
-             K,
-             colData,
-             N,
-             beta,
-             outputData + g * outputOffset,
-             N);
+        BlasGemm<Device, real>::compute(false,
+                                        false,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        filterData + g * filterOffset,
+                                        K,
+                                        colData,
+                                        N,
+                                        beta,
+                                        outputData + g * outputOffset,
+                                        N);
       }
       inputData += inputChannels * inputHeight * inputWidth;
       outputData += outputChannels * outputHeight * outputWidth;
@@ -188,8 +187,6 @@ public:
     }
 
     Col2ImFunctor<kCFO, Device, real> col2im;
-    GemmFunctor<Device, real> gemm;
-
     size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
@@ -205,19 +202,19 @@ public:
           colData = inputGrad + g * inputOffset;
           scale = 1.0f;
         }
-        gemm(CblasTrans,
-             CblasNoTrans,
-             M,
-             N,
-             K,
-             1.0f,
-             filterData + g * filterOffset,
-             M,
-             outputGrad + g * outputOffset,
-             N,
-             scale,
-             colData,
-             N);
+        BlasGemm<Device, real>::compute(true,
+                                        false,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        filterData + g * filterOffset,
+                                        M,
+                                        outputGrad + g * outputOffset,
+                                        N,
+                                        scale,
+                                        colData,
+                                        N);
         if (needIm2col) {
           col2im(inputGrad + g * inputOffset,
                  imShape,
@@ -299,7 +296,6 @@ public:
     }
 
     Im2ColFunctor<kCFO, Device, real> im2col;
-    GemmFunctor<Device, real> gemm;
     size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
@@ -321,19 +317,19 @@ public:
         int M = outputChannels / groups_;
         int K = outputHeight * outputWidth;
         int N = inputChannels / groups_ * filterHeight * filterWidth;
-        gemm(CblasNoTrans,
-             CblasTrans,
-             M,
-             N,
-             K,
-             1.0f,
-             outputGrad + g * outputOffset,
-             K,
-             colData,
-             K,
-             i == 0 ? beta : 1.0f,
-             filterGrad + g * filterOffset,
-             N);
+        BlasGemm<Device, real>::compute(false,
+                                        true,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        outputGrad + g * outputOffset,
+                                        K,
+                                        colData,
+                                        K,
+                                        i == 0 ? beta : 1.0f,
+                                        filterGrad + g * filterOffset,
+                                        N);
       }
       inputData += inputChannels * inputHeight * inputWidth;
       outputGrad += outputChannels * outputHeight * outputWidth;

From adcca2cc064182cd75809dd1e3d8c64329a0b0de Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 17 Aug 2017 16:40:38 +0800
Subject: [PATCH 3/5] Add PADDLE_USE_EIGEN_FOR_BLAS macro.

---
 CMakeLists.txt                  | 1 +
 cmake/configure.cmake           | 4 ++++
 paddle/function/GemmFunctor.cpp | 5 +++++
 3 files changed, 10 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dcd1218a5b..28bbfd7916 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,7 @@ option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
+option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 209f9078a6..51c3b918cc 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -28,6 +28,10 @@ if(NOT WITH_TIMER)
     add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
 
+if(USE_EIGEN_FOR_BLAS)
+    add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
+endif(USE_EIGEN_FOR_BLAS)
+
 if(NOT WITH_PROFILER)
     add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
diff --git a/paddle/function/GemmFunctor.cpp b/paddle/function/GemmFunctor.cpp
index 8df9b884fe..dc83278d8e 100644
--- a/paddle/function/GemmFunctor.cpp
+++ b/paddle/function/GemmFunctor.cpp
@@ -32,6 +32,10 @@ struct BlasGemm<DEVICE_TYPE_CPU, T> {
                       const T beta,
                       T* C,
                       const int ldc) {
+#ifdef PADDLE_USE_EIGEN_FOR_BLAS
+    EigenBlasGemm<T>::compute(
+        transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+#else
     gemm<T>(transA == false ? CblasNoTrans : CblasTrans,
             transB == false ? CblasNoTrans : CblasTrans,
             M,
@@ -45,6 +49,7 @@ struct BlasGemm<DEVICE_TYPE_CPU, T> {
             beta,
             C,
             ldc);
+#endif
   }
 };
 

From 6ba04dcd112e0caac46a7a829182ce00f301752f Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 17 Aug 2017 16:56:46 +0800
Subject: [PATCH 4/5] Remove the header files that do not need to be included.

---
 paddle/function/DepthwiseConvOp.cpp   | 1 -
 paddle/function/DepthwiseConvOpGpu.cu | 1 -
 2 files changed, 2 deletions(-)

diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp
index 490e8d546c..2f3112fe65 100644
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "DepthwiseConvOp.h"
 #include "ConvOp.h"
-#include "GemmFunctor.h"
 
 namespace paddle {
 
diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu
index 33463805cb..2d722dfcfc 100644
--- a/paddle/function/DepthwiseConvOpGpu.cu
+++ b/paddle/function/DepthwiseConvOpGpu.cu
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "DepthwiseConvOp.h"
-#include "GemmFunctor.h"
 #include "paddle/math/BaseMatrix.h"
 
 namespace paddle {

From 430e0e418bb34d6a14662a29a3e6d5fb906c9610 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Mon, 21 Aug 2017 10:12:25 +0800
Subject: [PATCH 5/5] Follow comments.

---
 paddle/function/CMakeLists.txt | 4 +++-
 paddle/function/EigenGemm.cpp  | 3 +--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 9187294a49..c572a9d433 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -5,7 +5,9 @@ list(APPEND h_files Function.h)
 list(APPEND cpp_files Function.cpp)
 list(APPEND cpp_files BufferArg.cpp)
 list(APPEND cpp_files GemmFunctor.cpp)
-list(APPEND cpp_files EigenGemm.cpp)
+if(USE_EIGEN_FOR_BLAS)
+  list(APPEND cpp_files EigenGemm.cpp)
+endif(USE_EIGEN_FOR_BLAS)
 
 if(WITH_GPU)
     file(GLOB cu_files . *OpGpu.cu)
diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp
index 0b4220fcbe..674141ed39 100644
--- a/paddle/function/EigenGemm.cpp
+++ b/paddle/function/EigenGemm.cpp
@@ -77,8 +77,7 @@ struct EigenBlasGemm {
     } else if (alpha == T(1) && beta == T(1)) {
       c.device(device) += a.contract(b, dims);
     } else {
-      c.device(device) =
-          c.constant(alpha) * a.contract(b, dims) + c.constant(beta) * c;
+      c.device(device) = alpha * a.contract(b, dims) + beta * c;
     }
   }
 };