From dbf1d75f57c465696c82c618d593c4470e6d44ea Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 26 Dec 2017 15:27:42 +0800
Subject: [PATCH 1/6] Add a GemmConvMobileFunction.

---
 paddle/function/GemmConvOp.cpp | 152 +++++++++++++++++++++++++++++++++
 1 file changed, 152 insertions(+)
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index de7b70e271..08eb6a5490 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -134,6 +134,154 @@ public:
   }
 };
 
+/*
+ * \brief Forward calculation of convolution, optimized for mobile.
+ */
+template <DeviceType Device>
+class GemmConvMobileFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // TODO(hedaoyuan): Need to define some index macros,
+    // to avoid using 0 and 1.
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
+    size_t colWidth = outputHeight * outputWidth;
+    // Max col matrix height 256, Max col matrix width 1024
+    size_t stepColHeight = std::min(colHeight, (size_t)256);
+    size_t stepColWidth = std::min(colWidth, (size_t)2048);
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+
+      resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
+
+    Im2ColFunctor<kCFO, Device, real> im2col;
+    GemmFunctor<Device, real> gemm;
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    int nStride = colWidth;
+    int kStride = colHeight;
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
+          real beta_ = beta;
+          for (size_t colHeightStart = 0; colHeightStart < colHeight;
+               colHeightStart += stepColHeight) {
+            for (size_t colWidthStart = 0; colWidthStart < colWidth;
+                 colWidthStart += stepColWidth) {
+              int N = std::min(colWidth - colWidthStart, stepColWidth);
+              int K = std::min(colHeight - colHeightStart, stepColHeight);
+              // im2col: fill colData with the current K x N block
+              im2col(inputData + g * inputOffset,
+                     imShape,
+                     colData,
+                     colShape,
+                     strideH(),
+                     strideW(),
+                     paddingH(),
+                     paddingW(),
+                     colHeightStart,
+                     K,
+                     colWidthStart,
+                     N);
+
+              // gemm
+              int M = outputChannels / groups_;
+              gemm(CblasNoTrans,
+                   CblasNoTrans,
+                   M,
+                   N,
+                   K,
+                   1.0f,
+                   filterData + g * filterOffset + colHeightStart,
+                   kStride,
+                   colData,
+                   N,
+                   beta_,
+                   outputData + g * outputOffset + colWidthStart,
+                   nStride);
+            }
+            beta_ = 1.0;
+          }
+        } else {
+          int M = outputChannels / groups_;
+          int N = outputHeight * outputWidth;
+          int K = inputChannels / groups_ * filterHeight * filterWidth;
+          gemm(CblasNoTrans,
+               CblasNoTrans,
+               M,
+               N,
+               K,
+               1.0f,
+               filterData + g * filterOffset,
+               K,
+               inputData + g * inputOffset,
+               N,
+               beta,
+               outputData + g * outputOffset,
+               N);
+        }
+      }
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
 /*
  * \brief Backward input calculation of convolution.
  */
@@ -348,7 +496,11 @@ public:
   }
 };
 
+#ifdef PADDLE_MOBILE_INFERENCE
+REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction);
+#else
 REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
+#endif
 REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
 REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
 #ifdef PADDLE_WITH_CUDA

From d775895e939eb9e4ce4378e349a76d56bd4af72d Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 26 Dec 2017 15:43:30 +0800
Subject: [PATCH 2/6] Add Im2ColMobileFunctor.

---
 paddle/function/GemmConvOp.cpp | 56 +++++++++++++++++-----------------
 paddle/function/Im2Col.h       | 48 +++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 28 deletions(-)

diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index 08eb6a5490..75a5b4fe84 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -206,8 +206,7 @@ public:
       colData = reinterpret_cast<real*>(memory_->getBuf());
     }
 
-    Im2ColFunctor<kCFO, Device, real> im2col;
-    GemmFunctor<Device, real> gemm;
+    Im2ColMobileFunctor<real> im2col;
     size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
@@ -241,19 +240,20 @@ public:
 
               // gemm
               int M = outputChannels / groups_;
-              gemm(CblasNoTrans,
-                   CblasNoTrans,
-                   M,
-                   N,
-                   K,
-                   1.0f,
-                   filterData + g * filterOffset + colHeightStart,
-                   kStride,
-                   colData,
-                   N,
-                   beta_,
-                   outputData + g * outputOffset + colWidthStart,
-                   nStride);
+              BlasGemm<Device, real>::compute(
+                  false,
+                  false,
+                  M,
+                  N,
+                  K,
+                  1.0f,
+                  filterData + g * filterOffset + colHeightStart,
+                  kStride,
+                  colData,
+                  N,
+                  beta_,
+                  outputData + g * outputOffset + colWidthStart,
+                  nStride);
             }
             beta_ = 1.0;
           }
@@ -261,19 +261,19 @@ public:
           int M = outputChannels / groups_;
           int N = outputHeight * outputWidth;
           int K = inputChannels / groups_ * filterHeight * filterWidth;
-          gemm(CblasNoTrans,
-               CblasNoTrans,
-               M,
-               N,
-               K,
-               1.0f,
-               filterData + g * filterOffset,
-               K,
-               inputData + g * inputOffset,
-               N,
-               beta,
-               outputData + g * outputOffset,
-               N);
+          BlasGemm<Device, real>::compute(false,
+                                          false,
+                                          M,
+                                          N,
+                                          K,
+                                          1.0f,
+                                          filterData + g * filterOffset,
+                                          K,
+                                          inputData + g * inputOffset,
+                                          N,
+                                          beta,
+                                          outputData + g * outputOffset,
+                                          N);
         }
       }
       inputData += inputChannels * inputHeight * inputWidth;
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 0c37fc9724..f43ca465a2 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -98,4 +98,52 @@ public:
                   int dilationWidth = 1);
 };
 
+template <class T>
+class Im2ColMobileFunctor {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int colHeightStart,
+                  int colHeightSize,
+                  int colWidthStart,
+                  int colWidthSize) {
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputWidth = colShape[4];
+
+    for (int colh = 0; colh < colHeightSize; colh++) {
+      int wOffset = (colHeightStart + colh) % filterWidth;
+      int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
+      int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
+
+      for (int colw = 0; colw < colWidthSize; colw++) {
+        int h = (colWidthStart + colw) / outputWidth;
+        int w = (colWidthStart + colw) % outputWidth;
+
+        int imRowIdx = h * strideHeight + hOffset;
+        int imColIdx = w * strideWidth + wOffset;
+        if ((imRowIdx - paddingHeight) < 0 ||
+            (imRowIdx - paddingHeight) >= inputHeight ||
+            (imColIdx - paddingWidth) < 0 ||
+            (imColIdx - paddingWidth) >= inputWidth) {
+          colData[colh * colWidthSize + colw] = T(0);
+        } else {
+          imRowIdx += c_im * inputHeight - paddingHeight;
+          imColIdx -= paddingWidth;
+          colData[colh * colWidthSize + colw] =
+              imData[imRowIdx * inputWidth + imColIdx];
+        }
+      }
+    }
+  }
+};
+
 }  // namespace paddle

From 19547943bac716d73354fcdb33c6d909b65308b3 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 26 Dec 2017 15:59:11 +0800
Subject: [PATCH 3/6] Add test for Im2ColMobileFunctor.

---
 paddle/function/Im2ColTest.cpp | 80 ++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index 1f085538d8..0dc58696f7 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -138,4 +138,84 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
 
 #endif
 
+template <class T>
+void TestIm2ColMobileFunctor() {
+  for (size_t channels : {1, 5, 32}) {
+    for (size_t inputHeight : {5, 33, 100}) {
+      for (size_t inputWidth : {5, 32, 96}) {
+        for (size_t filterHeight : {1, 5}) {
+          for (size_t filterWidth : {3, 7}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                for (size_t dilation : {1 /*, 3*/}) {
+                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
+                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
+                  if (inputHeight + 2 * padding < filterSizeH ||
+                      inputWidth + 2 * padding < filterSizeW)
+                    break;
+                  if (padding >= filterSizeH || padding >= filterSizeW) break;
+                  size_t outputHeight =
+                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
+                  size_t outputWidth =
+                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
+
+                  TensorShape imShape =
+                      TensorShape({channels, inputHeight, inputWidth});
+                  TensorShape colShape1 = TensorShape({channels,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       outputHeight,
+                                                       outputWidth});
+
+                  size_t height = channels * filterHeight * filterWidth;
+                  size_t width = outputHeight * outputWidth;
+                  VectorPtr input1 =
+                      Vector::create(imShape.getElements(), false);
+                  VectorPtr input2 =
+                      Vector::create(imShape.getElements(), false);
+                  MatrixPtr output1 =
+                      Matrix::create(height, width, false, false);
+                  MatrixPtr output2 =
+                      Matrix::create(height, width, false, false);
+                  input1->uniform(0.001, 1);
+                  input2->copyFrom(*input1);
+
+                  Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> im2Col1;
+                  Im2ColMobileFunctor<T> im2Col2;
+                  im2Col1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  im2Col2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          0,
+                          height,
+                          0,
+                          width);
+
+                  autotest::TensorCheckEqual(*output1, *output2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor<float>(); }
+
 }  // namespace paddle

From a850dec991d7d6d28f2669a959b3198a7a796ce9 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 26 Dec 2017 16:07:09 +0800
Subject: [PATCH 4/6] Add dilation.

---
 paddle/function/GemmConvOp.cpp | 2 ++
 paddle/function/Im2Col.h       | 6 ++++--
 paddle/function/Im2ColTest.cpp | 4 +++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index 75a5b4fe84..acf1415ebf 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -233,6 +233,8 @@ public:
                      strideW(),
                      paddingH(),
                      paddingW(),
+                     dilationH(),
+                     dilationW(),
                      colHeightStart,
                      K,
                      colWidthStart,
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index f43ca465a2..1053e4fd23 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -109,6 +109,8 @@ public:
                   int strideWidth,
                   int paddingHeight,
                   int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth,
                   int colHeightStart,
                   int colHeightSize,
                   int colWidthStart,
@@ -128,8 +130,8 @@ public:
         int h = (colWidthStart + colw) / outputWidth;
         int w = (colWidthStart + colw) % outputWidth;
 
-        int imRowIdx = h * strideHeight + hOffset;
-        int imColIdx = w * strideWidth + wOffset;
+        int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+        int imColIdx = w * strideWidth + wOffset * dilationWidth;
         if ((imRowIdx - paddingHeight) < 0 ||
             (imRowIdx - paddingHeight) >= inputHeight ||
             (imColIdx - paddingWidth) < 0 ||
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index 0dc58696f7..c573469168 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -147,7 +147,7 @@ void TestIm2ColMobileFunctor() {
           for (size_t filterWidth : {3, 7}) {
             for (size_t stride : {1, 2}) {
               for (size_t padding : {0, 1}) {
-                for (size_t dilation : {1 /*, 3*/}) {
+                for (size_t dilation : {1, 3}) {
                   size_t filterSizeH = (filterHeight - 1) * dilation + 1;
                   size_t filterSizeW = (filterWidth - 1) * dilation + 1;
                   if (inputHeight + 2 * padding < filterSizeH ||
@@ -200,6 +200,8 @@ void TestIm2ColMobileFunctor() {
                           stride,
                           padding,
                           padding,
+                          dilation,
+                          dilation,
                           0,
                           height,
                           0,

From f453b7137f8ed5a10ff47901401a796338d6e504 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 26 Dec 2017 16:10:15 +0800
Subject: [PATCH 5/6] Refine code.

---
 paddle/function/GemmConvOp.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index acf1415ebf..25cc3df667 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -126,14 +126,11 @@ public:
       inputData += inputChannels * inputHeight * inputWidth;
       outputData += outputChannels * outputHeight * outputWidth;
     }
-#ifdef PADDLE_MOBILE_INFERENCE
-    if (Device == DEVICE_TYPE_CPU) {
-      memory_.reset();
-    }
-#endif
   }
 };
 
+#ifdef PADDLE_MOBILE_INFERENCE
+
 /*
  * \brief Forward calculation of convolution, optimized for mobile.
  */
@@ -284,6 +281,8 @@ public:
   }
 };
 
+#endif
+
 /*
  * \brief Backward input calculation of convolution.
  */

From b7c4b58d3d041d4afe4da3d7f8b7d7366e8dce8d Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 28 Dec 2017 14:51:32 +0800
Subject: [PATCH 6/6] Follow comments.

---
 paddle/function/GemmConvOp.cpp |  6 ++++--
 paddle/function/Im2Col.h       |  2 +-
 paddle/function/Im2ColTest.cpp | 14 +++++++-------
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index 25cc3df667..cbdbf5335d 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -189,8 +189,8 @@ public:
     size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
     size_t colWidth = outputHeight * outputWidth;
     // Max col matrix height 256, Max col matrix width 1024
-    size_t stepColHeight = std::min(colHeight, (size_t)256);
-    size_t stepColWidth = std::min(colWidth, (size_t)2048);
+    size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
+    size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
 
     if (needIm2col) {
       colShape = TensorShape({inputChannels / groups_,
@@ -278,6 +278,8 @@ public:
       inputData += inputChannels * inputHeight * inputWidth;
       outputData += outputChannels * outputHeight * outputWidth;
     }
+
+    memory_.reset();
   }
 };
 
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 1053e4fd23..36a9bcf84e 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -136,7 +136,7 @@ public:
             (imRowIdx - paddingHeight) >= inputHeight ||
             (imColIdx - paddingWidth) < 0 ||
             (imColIdx - paddingWidth) >= inputWidth) {
-          colData[colh * colWidthSize + colw] = T(0);
+          colData[colh * colWidthSize + colw] = static_cast<T>(0);
         } else {
           imRowIdx += c_im * inputHeight - paddingHeight;
           imColIdx -= paddingWidth;
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index c573469168..3ba866dcdd 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -140,13 +140,13 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
 
 template <class T>
 void TestIm2ColMobileFunctor() {
-  for (size_t channels : {1, 5, 32}) {
-    for (size_t inputHeight : {5, 33, 100}) {
-      for (size_t inputWidth : {5, 32, 96}) {
-        for (size_t filterHeight : {1, 5}) {
-          for (size_t filterWidth : {3, 7}) {
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
+  for (size_t channels : {32}) {
+    for (size_t inputHeight : {33, 100}) {
+      for (size_t inputWidth : {32, 96}) {
+        for (size_t filterHeight : {5}) {
+          for (size_t filterWidth : {7}) {
+            for (size_t stride : {2}) {
+              for (size_t padding : {1}) {
                 for (size_t dilation : {1, 3}) {
                   size_t filterSizeH = (filterHeight - 1) * dilation + 1;
                   size_t filterSizeW = (filterWidth - 1) * dilation + 1;