Merge pull request #3718 from hedaoyuan/convolution

Depthwise Convolution Optimization
8 years ago · b45d020faf
parent 47eb869197 168707cadd
commit b45d020faf
6 changed files with 750 additions and 6 deletions
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@ -21,6 +21,8 @@ if(USE_NNPACK)
  endif()
 endif()
 list(APPEND cpp_files neon/NeonDepthwiseConv.cpp)
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
 add_dependencies(paddle_function paddle_proto)
@ -42,11 +44,11 @@ if(WITH_GPU)
    add_simple_unittest(RowConvOpTest)
    add_simple_unittest(BlockExpandOpTest)
    add_simple_unittest(CropOpTest)
    add_simple_unittest(DepthwiseConvOpTest)
 endif()
 add_simple_unittest(Im2ColTest)
 add_simple_unittest(GemmConvOpTest)
 add_simple_unittest(DepthwiseConvOpTest)
 endif()
 add_style_check_target(paddle_function ${h_files})
--- a/paddle/function/DepthwiseConvOpTest.cpp
+++ b/paddle/function/DepthwiseConvOpTest.cpp
@ -34,4 +34,13 @@ TEST(DepthwiseConv, BackwardFilter) {
 }
 #endif
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 // Forward-pass test for the NEON depthwise convolution. Both template
 // arguments are DEVICE_TYPE_CPU, and the function names handed to the
 // DepthwiseConvolution harness are "GemmConv-CPU" and
 // "NeonDepthwiseConv-CPU".
 // NOTE(review): presumably the harness runs both functions on the same
 // inputs and compares their outputs - confirm against the
 // DepthwiseConvolution helper, which is defined outside this hunk.
 TEST(DepthwiseConv, Forward) {
  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
      "GemmConv-CPU", "NeonDepthwiseConv-CPU", forward);
 }
 #endif
 }  // namespace paddle
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@ -16,6 +16,7 @@ limitations under the License. */
 #include "TensorShape.h"
 #include "TensorType.h"
 #include "neon/neon_util.h"
 namespace paddle {
@ -93,4 +94,95 @@ public:
                  int paddingWidth);
 };
 /**
  * Writes a zero-bordered copy of a dense multi-channel image.
  *
  * src is read as `channels` consecutive planes of inputHeight rows with
  * inputWidth elements each. For every plane, dest receives paddingHeight
  * zero rows, then each source row wrapped in paddingWidth zeros on both
  * sides, then paddingHeight zero rows again. Each output row therefore has
  * inputWidth + 2 * paddingWidth elements.
  *
  * The top/bottom borders are cleared with memset and the row payload is
  * moved with memcpy, so T has to be a type for which a byte-wise copy is
  * valid and whose T(0) is represented by all-zero bytes.
  */
 template <class T>
 struct Padding {
  static void run(const T* src,
                  T* dest,
                  int channels,
                  int inputHeight,
                  int inputWidth,
                  int paddingHeight,
                  int paddingWidth) {
    const int paddedWidth = inputWidth + 2 * paddingWidth;
    const T* in = src;
    T* out = dest;
    for (int plane = 0; plane < channels; ++plane) {
      // Zero rows above the image.
      if (paddingHeight > 0) {
        memset(out, 0, paddedWidth * paddingHeight * sizeof(T));
        out += paddedWidth * paddingHeight;
      }
      for (int row = 0; row < inputHeight; ++row) {
        // Zeros to the left of the row.
        for (int left = paddingWidth; left > 0; --left) {
          *out++ = T(0);
        }
        // The row itself.
        memcpy(out, in, inputWidth * sizeof(T));
        in += inputWidth;
        out += inputWidth;
        // Zeros to the right of the row.
        for (int right = paddingWidth; right > 0; --right) {
          *out++ = T(0);
        }
      }
      // Zero rows below the image.
      if (paddingHeight > 0) {
        memset(out, 0, paddedWidth * paddingHeight * sizeof(T));
        out += paddedWidth * paddingHeight;
      }
    }
  }
 };
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 template <>
 struct Padding<float> {
  static void run(const float* src,
                  float* dest,
                  int channels,
                  int inputHeight,
                  int inputWidth,
                  int paddingHeight,
                  int paddingWidth) {
    const int destWidth = inputWidth + 2 * paddingWidth;
    for (int c = 0; c < channels; c++) {
      if (paddingHeight > 0) {
        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
        dest += destWidth * paddingHeight;
      }
      for (int i = 0; i < inputHeight; i++) {
        // padding head
        for (int j = 0; j < paddingWidth; j++) {
          *dest++ = float(0);
        }
        int step = inputWidth >> 2;
        int remain = inputWidth & 3;
        for (int s = 0; s < step; s++) {
          float32x4_t s0 = vld1q_f32(src);
          vst1q_f32(dest, s0);
          src += 4;
          dest += 4;
        }
        for (int r = 0; r < remain; r++) {
          *dest++ = *src++;
        }
        // padding tail
        for (int j = 0; j < paddingWidth; j++) {
          *dest++ = float(0);
        }
      }
      if (paddingHeight > 0) {
        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
        dest += destWidth * paddingHeight;
      }
    }
  }
 };
 #endif
 }  // namespace paddle
--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
--- a/paddle/function/neon/neon_util.h
+++ b/paddle/function/neon/neon_util.h
@ -0,0 +1,47 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #include <arm_neon.h>
 namespace paddle {
 namespace neon {
 /**
  * Loads four consecutive floats from p into a NEON register, telling the
  * compiler (via __builtin_assume_aligned) that p is aligned to
  * sizeof(float32x4_t) bytes. The caller is responsible for that alignment
  * actually holding.
  */
 inline float32x4_t vld1q_f32_aligned(const float* p) {
  const void* alignedAddr = __builtin_assume_aligned(p, sizeof(float32x4_t));
  return vld1q_f32(static_cast<const float*>(alignedAddr));
 }
 #ifndef __aarch64__
 /**
  * Horizontal sum of the four lanes of a. Compiled only when __aarch64__ is
  * not defined, where it stands in for the vaddvq_f32 intrinsic.
  *
  * The upper and lower halves are added lane-wise first, then the two
  * partial sums are folded with a pairwise add.
  */
 inline float32_t vaddvq_f32(float32x4_t a) {
  const float32x2_t upperHalf = vget_high_f32(a);
  const float32x2_t lowerHalf = vget_low_f32(a);
  const float32x2_t partialSums = vadd_f32(upperHalf, lowerHalf);
  const float32x2_t folded = vpadd_f32(partialSums, partialSums);
  return vget_lane_f32(folded, 0);
 }
 /**
  * Multiply-accumulate by one lane: returns a + b * v[lane] for every
  * element of a and b. Compiled only when __aarch64__ is not defined, where
  * it stands in for the vmlaq_laneq_f32 intrinsic.
  *
  * @param a    accumulator
  * @param b    multiplicand vector
  * @param v    vector supplying the scalar multiplier
  * @param lane index of the element of v to multiply by, expected in [0, 3]
  *
  * vgetq_lane_f32 requires its lane index to be a compile-time constant, and
  * a function parameter is not one (clang rejects it outright). The switch
  * hands the intrinsic a literal index in every branch; when the call is
  * inlined with a constant lane the compiler folds it back to a single
  * branch. Lanes outside [0, 3] are a caller error and are treated as lane 3
  * so the function always returns a value.
  */
 inline float32x4_t vmlaq_laneq_f32(float32x4_t a,
                                   float32x4_t b,
                                   float32x4_t v,
                                   const int lane) {
  switch (lane) {
    case 0:
      return vmlaq_n_f32(a, b, vgetq_lane_f32(v, 0));
    case 1:
      return vmlaq_n_f32(a, b, vgetq_lane_f32(v, 1));
    case 2:
      return vmlaq_n_f32(a, b, vgetq_lane_f32(v, 2));
    default:
      return vmlaq_n_f32(a, b, vgetq_lane_f32(v, 3));
  }
 }
 #endif
 }  // namespace neon
 }  // namespace paddle
 #endif
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@ -29,6 +29,10 @@ namespace paddle {
 REGISTER_LAYER(exconv, ExpandConvLayer);
 REGISTER_LAYER(exconvt, ExpandConvLayer);
 /**
  * Returns true when the number of groups equals the number of channels,
  * which is the condition this file uses to select the depthwise
  * convolution functions.
  */
 inline bool isDepthwiseConv(int channels, int groups) {
  const bool groupPerChannel = (groups == channels);
  return groupPerChannel;
 }
 bool ExpandConvLayer::init(const LayerMap &layerMap,
                           const ParameterMap &parameterMap) {
  /* Initialize the basic convolutional parent class */
@ -47,14 +51,27 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
    std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
    std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
-    if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) {
+    // Convolution Layer uses the GemmConv function by default.
    convType = "GemmConv";
    convGradInputType = "GemmConvGradInput";
    convGradFilterType = "GemmConvGradFilter";
    // If depth wise convolution and useGpu == true
    if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
      convType = "DepthwiseConv";
      convGradInputType = "DepthwiseConvGradInput";
      convGradFilterType = "DepthwiseConvGradFilter";
-    } else {
+    }
-      convType = "GemmConv";
+
-      convGradInputType = "GemmConvGradInput";
+    // If depth wise convolution and useGpu == false and ARM-NEON
-      convGradFilterType = "GemmConvGradFilter";
+    if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
      if ((filterSize_[i] == filterSizeY_[i]) &&
          (filterSize_[i] == 3 || filterSize_[i] == 4) &&
          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) {
        convType = "NeonDepthwiseConv";
      }
 #endif
    }
    if (FLAGS_use_nnpack && !isDeconv_) {