Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_add_axis
commit f6e72c93c7

@ -0,0 +1,124 @@
## Background

PaddlePaddle divides the description of a neural network computation graph into two stages: compile time and runtime.

PaddlePaddle uses proto messages to describe the compile-time graph because:

1. The computation graph should be able to be saved to a file.
1. In distributed training, the graph will be serialized and sent to multiple workers.

The computation graph is constructed from Data Nodes and Operation Nodes. The concepts that represent them at the two stages are listed in the table below.

| |compile time|runtime|
|---|---|---|
|Data|VarDesc(proto)|Variable(cpp)|
|Operation|OpDesc(proto)|Operator(cpp)|

## Definition of VarDesc

A VarDesc should have a name and a value; in PaddlePaddle, the value is always a tensor. Since we use LoDTensor most of the time, we add a LoDTensorDesc to describe it.

```proto
message VarDesc {
    required string name = 1;
    optional LoDTensorDesc lod_tensor = 2;
}
```

## Definition of LoDTensorDesc

```proto
enum DataType {
    BOOL = 0;
    INT16 = 1;
    INT32 = 2;
    INT64 = 3;
    FP16 = 4;
    FP32 = 5;
    FP64 = 6;
}

message LoDTensorDesc {
    required DataType data_type = 1;
    repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
    optional int32 lod_level = 3 [default=0];
}
```
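
For illustration, here is a minimal sketch of how these messages could be built from Python. It assumes the .proto file above is compiled by protoc into a module named `framework_pb2`; the module name is an assumption, not part of this design.

```python
# A sketch, assuming protoc generated `framework_pb2` from the messages above.
import framework_pb2

tensor = framework_pb2.LoDTensorDesc()
tensor.data_type = framework_pb2.FP32
tensor.dims.extend([-1, 640, 480])  # -1 stands for the UNK (batch) dimension
tensor.lod_level = 0                # a plain tensor carries no LoD information

var = framework_pb2.VarDesc(name="image", lod_tensor=tensor)

# The serialized bytes can be saved to a file or sent to remote workers.
serialized = var.SerializeToString()
```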

## Definition of Variable in Python

In the Python API, a layer takes Variables as input and returns Variables as output. There should be a class `Variable` in Python to help create and manage Variables.

```python
image = Variable(dims=[-1, 640, 480])
# fc1 and fc2 are both Variable
fc1 = layer.fc(input=image, output_size=10)
fc2 = layer.fc(input=fc1, output_size=20)
```

### What should class `Variable` have

1. `name`. A name of string type is used to identify the Variable.
1. `initializer`. Since our Tensor does not hold a value until runtime, we always use some Operator to fill it when the program runs. So we should have an initialize method that helps add the init operator.
1. `operator`. A Variable should record which operator produces it. The reason is:
    - We use `pd.eval(targets=[var1, var2])` to run the related ops to get the values of `var1` and `var2`; `var.op` is used to trace the dependencies of the current variable (a sketch of this tracing appears at the end of this section).

In PaddlePaddle, we use a Block to describe the computation graph, so in the code we use Block rather than Graph.
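
The `Variable` sketch below calls `get_default_block()` and `self._block.add_var(...)`. A minimal, hypothetical sketch of that Block bookkeeping (only for illustration, not part of this design) might look like:

```python
# Hypothetical minimal Block: it only records the vars and ops added to it.
class Block(object):
    def __init__(self):
        self.vars = {}  # variable name -> Variable
        self.ops = []   # operators in insertion order

    def add_var(self, var):
        self.vars[var._name] = var

    def add_op(self, op):
        self.ops.append(op)

_default_block = Block()

def get_default_block():
    return _default_block
```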

```python
import VarDesc
import LoDTensorDesc
import framework

def AddInitialOperator(variable, initializer):
    # add an initialize Operator to the block to init this Variable
    pass

class Variable(object):
    def __init__(self, name=None, dims=None, type=None, initializer=None):
        self._block = get_default_block()
        self._name = name
        self.op = None

        tensor_desc = LoDTensorDesc(data_type=type, dims=dims)
        _var_desc = VarDesc(name=name, lod_tensor=tensor_desc)
        self._var = framework.CreateVar(_var_desc)
        self._block.add_var(self)

        # add the initial op according to the initializer
        if initializer is not None:
            AddInitialOperator(self, initializer)

    def dims(self):
        return self._var.dims()

    def data_type(self):
        return self._var.data_type()

    def to_proto(self):
        pass
```

Then we can use this Variable to create an fc layer in Python.

```python
import paddle as pd

def flatten_size(X, num_flatten_dims):
    prod = 1  # product of the last num_flatten_dims dimensions
    for i in xrange(num_flatten_dims):
        prod = prod * X.dims[-i - 1]
    return prod

# exposed as layer.fc
def fc(X, output_size, num_flatten_dims=1):
    W = Variable(initializer=pd.random_uniform(), type=FP32,
                 dims=[flatten_size(X, num_flatten_dims), output_size])
    b = Variable(initializer=pd.random_uniform(), type=FP32, dims=[output_size])
    out = Variable(type=FP32)
    y = operator.fc(X, W, b, output=out)  # the fc op writes its result into out
    pd.InferShape(y)
    return out

x = Variable(dims=[-1, 640, 480])
y = layer.fc(x, output_size=100)
z = layer.fc(y, output_size=200)

pd.eval(targets=[z], ...)
print(z)
```
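
As noted above, `var.op` lets `pd.eval` trace which operators must run. A minimal sketch of that tracing follows; the attribute `op.inputs` is a hypothetical name assumed for illustration, it is not defined in this document.

```python
def trace_dependencies(targets):
    """Collect, in topological order, every op needed to compute `targets`."""
    ordered_ops = []
    visited = set()

    def visit(var):
        op = var.op
        if op is None or id(op) in visited:
            return
        visited.add(id(op))
        for inp in op.inputs:  # assumed: each op records its input Variables
            visit(inp)
        ordered_ops.append(op)  # post-order append yields topological order

    for var in targets:
        visit(var)
    return ordered_ops
```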

@ -0,0 +1,159 @@

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "GemmFunctor.h"
#include "hl_cpu_gru.cuh"

namespace paddle {

template <DeviceType Device, class T>
struct GruFunctor {
  template <class OpResetOutput, class OpFinalOutput>
  static void compute(OpResetOutput opResetOutput,
                      OpFinalOutput opFinalOutput,
                      hl_gru_value value,
                      int frameSize,
                      int batchSize,
                      hl_activation_mode_t active_node,
                      hl_activation_mode_t active_gate) {
#ifndef __NVCC__
    // Project the previous output onto the update/reset gates:
    // gateValue[:, 0:2*frameSize] += prevOutValue * gateWeight
    if (value.prevOutValue) {
      BlasGemm<Device, T>::compute(false,
                                   false,
                                   batchSize,
                                   2 * frameSize,
                                   frameSize,
                                   1,
                                   value.prevOutValue,
                                   frameSize,
                                   value.gateWeight,
                                   frameSize * 2,
                                   1,
                                   value.gateValue,
                                   frameSize * 3);
    }

    forward_reset_output(
        opResetOutput, value, frameSize, batchSize, active_gate);

    // Project the reset output onto the candidate state:
    // gateValue[:, 2*frameSize:3*frameSize] += resetOutputValue * stateWeight
    if (value.prevOutValue) {
      BlasGemm<Device, T>::compute(false,
                                   false,
                                   batchSize,
                                   frameSize,
                                   frameSize,
                                   1,
                                   value.resetOutputValue,
                                   frameSize,
                                   value.stateWeight,
                                   frameSize,
                                   1,
                                   value.gateValue + frameSize * 2,
                                   frameSize * 3);
    }

    forward_final_output(
        opFinalOutput, value, frameSize, batchSize, active_node);
#endif
  }
};

template <DeviceType Device, class T>
struct GruGradFunctor {
  template <class OpStateGrad, class OpResetGrad>
  static void compute(OpStateGrad opStateGrad,
                      OpResetGrad opResetGrad,
                      hl_gru_value value,
                      hl_gru_grad grad,
                      int frameSize,
                      int batchSize,
                      hl_activation_mode_t active_node,
                      hl_activation_mode_t active_gate) {
#ifndef __NVCC__
    backward_state_grad(
        opStateGrad, value, grad, frameSize, batchSize, active_node);

    if (value.prevOutValue && grad.prevOutGrad) {
      // resetOutputGrad = candidateGrad * stateWeight^T
      BlasGemm<Device, T>::compute(false,
                                   true,
                                   batchSize,
                                   frameSize,
                                   frameSize,
                                   1,
                                   grad.gateGrad + frameSize * 2,
                                   frameSize * 3,
                                   value.stateWeight,
                                   frameSize,
                                   0,
                                   grad.resetOutputGrad,
                                   frameSize);

      if (grad.stateWeightGrad) {
        // stateWeightGrad += resetOutputValue^T * candidateGrad
        BlasGemm<Device, T>::compute(true,
                                     false,
                                     frameSize,
                                     frameSize,
                                     batchSize,
                                     1,
                                     value.resetOutputValue,
                                     frameSize,
                                     grad.gateGrad + frameSize * 2,
                                     frameSize * 3,
                                     1,
                                     grad.stateWeightGrad,
                                     frameSize);
      }
    }

    backward_reset_grad(
        opResetGrad, value, grad, frameSize, batchSize, active_gate);

    if (grad.prevOutGrad && value.prevOutValue) {
      // prevOutGrad += gateGrad[:, 0:2*frameSize] * gateWeight^T
      BlasGemm<Device, T>::compute(false,
                                   true,
                                   batchSize,
                                   frameSize,
                                   frameSize * 2,
                                   1,
                                   grad.gateGrad,
                                   frameSize * 3,
                                   value.gateWeight,
                                   frameSize * 2,
                                   1,
                                   grad.prevOutGrad,
                                   frameSize);

      if (grad.gateWeightGrad) {
        // gateWeightGrad += prevOutValue^T * gateGrad[:, 0:2*frameSize]
        BlasGemm<Device, T>::compute(true,
                                     false,
                                     frameSize,
                                     frameSize * 2,
                                     batchSize,
                                     1,
                                     value.prevOutValue,
                                     frameSize,
                                     grad.gateGrad,
                                     frameSize * 3,
                                     1,
                                     grad.gateWeightGrad,
                                     frameSize * 2);
      }
    }
#endif
  }
};

} // namespace paddle

@ -0,0 +1,140 @@

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "SwitchOp.h"
#include "paddle/math/Vector.h"

namespace paddle {

template <>
void NCHW2NHWC<DEVICE_TYPE_CPU>(real* outputs,
                                const real* inputs,
                                const int num,
                                const int inC,
                                const int inH,
                                const int inW,
                                const int argType) {
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < inC; ++c) {
      for (int h = 0; h < inH; ++h) {
        for (int w = 0; w < inW; ++w) {
          if (argType == ADD_TO) {
            outputs[((n * inH + h) * inW + w) * inC + c] += *(inputs++);
          } else {
            outputs[((n * inH + h) * inW + w) * inC + c] = *(inputs++);
          }
        }
      }
    }
  }
}

template <>
void NHWC2NCHW<DEVICE_TYPE_CPU>(real* outputs,
                                const real* inputs,
                                const int num,
                                const int inH,
                                const int inW,
                                const int inC,
                                const int argType) {
  for (int n = 0; n < num; ++n) {
    for (int h = 0; h < inH; ++h) {
      for (int w = 0; w < inW; ++w) {
        for (int c = 0; c < inC; ++c) {
          if (argType == ADD_TO) {
            outputs[((n * inC + c) * inH + h) * inW + w] += *(inputs++);
          } else {
            outputs[((n * inC + c) * inH + h) * inW + w] = *(inputs++);
          }
        }
      }
    }
  }
}

/**
 * \brief Switch the dimension order of an image input.
 *        The input and output are 4D tensors. Switch the order
 *        'batch_size, channels, height, width' to
 *        'batch_size, height, width, channels'.
 *
 * Arguments in this Function:
 * \param inputs  input data with order 'batch_size, channels, height, width'.
 * \param outputs output data with order 'batch_size, height, width, channels'.
 */
template <DeviceType Device>
class NCHW2NHWCFunc : public FunctionBase {
public:
  void init(const FuncConfig& config) override {}

  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(1UL, inputs.size());
    CHECK_EQ(1UL, outputs.size());

    size_t num = inputs[0].shape()[0];
    size_t inC = inputs[0].shape()[1];
    size_t inH = inputs[0].shape()[2];
    size_t inW = inputs[0].shape()[3];
    NCHW2NHWC<Device>(outputs[0].data<real>(),
                      inputs[0].data<real>(),
                      num,
                      inC,
                      inH,
                      inW,
                      outputs[0].getArgType());
  }
};

/**
 * \brief Switch the dimension order of an image input.
 *        The input and output are 4D tensors. Switch the order
 *        'batch_size, height, width, channels' to
 *        'batch_size, channels, height, width'.
 *
 * Arguments in this Function:
 * \param inputs  input data with order 'batch_size, height, width, channels'.
 * \param outputs output data with order 'batch_size, channels, height, width'.
 */
template <DeviceType Device>
class NHWC2NCHWFunc : public FunctionBase {
public:
  void init(const FuncConfig& config) override {}

  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(1UL, inputs.size());
    CHECK_EQ(1UL, outputs.size());

    size_t num = inputs[0].shape()[0];
    size_t inH = inputs[0].shape()[1];
    size_t inW = inputs[0].shape()[2];
    size_t inC = inputs[0].shape()[3];

    NHWC2NCHW<Device>(outputs[0].data<real>(),
                      inputs[0].data<real>(),
                      num,
                      inH,
                      inW,
                      inC,
                      outputs[0].getArgType());
  }
};

REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
#endif

} // namespace paddle

@ -0,0 +1,66 @@

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "Function.h"

namespace paddle {

/**
 * \brief This function switches the dimension order of an image input.
 *        The input and output are 4D tensors. Switch the order
 *        'batch_size, channels, height, width' to
 *        'batch_size, height, width, channels'.
 *
 * \param[out] outputs save results.
 * \param[in]  inputs  input data.
 * \param[in]  num     batch size of input data.
 * \param[in]  inC     channel number of input data.
 * \param[in]  inH     height of input data.
 * \param[in]  inW     width of input data.
 * \param[in]  argType type of output argument.
 */
template <DeviceType Device>
void NCHW2NHWC(real* outputs,
               const real* inputs,
               const int num,
               const int inC,
               const int inH,
               const int inW,
               const int argType);

/**
 * \brief This function switches the dimension order of an image input.
 *        The input and output are 4D tensors. Switch the order
 *        'batch_size, height, width, channels' to
 *        'batch_size, channels, height, width'.
 *
 * \param[out] inGrad  gradients of the previous layer.
 * \param[in]  outGrad output gradients.
 * \param[in]  num     batch size of input data.
 * \param[in]  inH     height of input data.
 * \param[in]  inW     width of input data.
 * \param[in]  inC     channel number of input data.
 * \param[in]  argType type of output argument.
 */
template <DeviceType Device>
void NHWC2NCHW(real* inGrad,
               const real* outGrad,
               const int num,
               const int inH,
               const int inW,
               const int inC,
               const int argType);
} // namespace paddle
@ -0,0 +1,98 @@

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "SwitchOp.h"
#include "hl_base.h"

namespace paddle {

__global__ void KeNCHW2NHWC(real* outputs,
                            const real* inputs,
                            int inC,
                            int inH,
                            int inW,
                            int nthreads,
                            int argType) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    // decompose the NCHW linear index idx into (n, c, h, w)
    const int w = idx % inW;
    const int h = (idx / inW) % inH;
    const int c = (idx / inW / inH) % inC;
    const int n = idx / inW / inH / inC;

    // re-linearize as NHWC
    const int off = ((n * inH + h) * inW + w) * inC + c;
    if (argType == ADD_TO) {
      outputs[off] += inputs[idx];
    } else {
      outputs[off] = inputs[idx];
    }
  }
}

template <>
void NCHW2NHWC<DEVICE_TYPE_GPU>(real* outputs,
                                const real* inputs,
                                const int num,
                                const int inC,
                                const int inH,
                                const int inW,
                                const int argType) {
  size_t nth = num * inC * inH * inW;
  int blockSize = 1024;
  int gridSize = (nth + 1024 - 1) / 1024;
  KeNCHW2NHWC<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      outputs, inputs, inC, inH, inW, nth, argType);
  CHECK_SYNC("NCHW2NHWC");
}

__global__ void KeNHWC2NCHW(real* outputs,
                            const real* inputs,
                            int inH,
                            int inW,
                            int inC,
                            int nthreads,
                            int argType) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    // decompose the NHWC linear index idx into (n, h, w, c)
    const int c = idx % inC;
    const int w = (idx / inC) % inW;
    const int h = (idx / inC / inW) % inH;
    const int n = idx / inW / inH / inC;

    // re-linearize as NCHW
    const int off = ((n * inC + c) * inH + h) * inW + w;
    if (argType == ADD_TO) {
      outputs[off] += inputs[idx];
    } else {
      outputs[off] = inputs[idx];
    }
  }
}

template <>
void NHWC2NCHW<DEVICE_TYPE_GPU>(real* outputs,
                                const real* inputs,
                                const int num,
                                const int inH,
                                const int inW,
                                const int inC,
                                const int argType) {
  int nth = num * inC * inH * inW;
  int blockSize = 1024;
  int gridSize = (nth + 1024 - 1) / 1024;
  KeNHWC2NCHW<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      outputs, inputs, inH, inW, inC, nth, argType);
  CHECK_SYNC("NHWC2NCHW");
}

} // namespace paddle
@ -0,0 +1,44 @@

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include "FunctionTest.h"

namespace paddle {

TEST(SwitchOp, real) {
  for (size_t numSamples : {1, 4, 8, 16}) {
    for (size_t channels : {1, 4, 8, 16}) {
      for (size_t imgSizeH : {1, 4, 8, 16}) {
        for (size_t imgSizeW : {1, 4, 8, 16}) {
          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
          // test_grad == true exercises NHWC2NCHW (the backward direction),
          // test_grad == false exercises NCHW2NHWC (the forward direction).
          for (bool test_grad : {true, false}) {
            CpuGpuFuncCompare compare(test_grad ? "NHWC2NCHW" : "NCHW2NHWC",
                                      FuncConfig());
            TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
            TensorShape outDims{numSamples, imgSizeH, imgSizeW, channels};
            compare.addInputs(
                BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
            compare.addOutputs(BufferArg(
                VALUE_TYPE_FLOAT, test_grad ? inDims : outDims, ASSIGN_TO));
            compare.run();
          }
        }
      }
    }
  }
}

} // namespace paddle
@ -0,0 +1,136 @@

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "NeonDepthwiseConv.h"
#include "paddle/function/ConvOp.h"

namespace paddle {

#if defined(__ARM_NEON__) || defined(__ARM_NEON)

template <DeviceType Device>
class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase {
public:
  void init(const FuncConfig& config) override {
    ConvFunctionBase::init(config);
  }

  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
    const TensorShape& input = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& output = outputs[0].shape();
    checkShape(input, filter, output);
  }

  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(numInputs_, inputs.size());
    CHECK_EQ(numOutputs_, outputs.size());
    check(inputs, outputs);

    const TensorShape& input = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& output = outputs[0].shape();

    int batchSize = input[0];
    int inputChannels = input[1];
    int inputHeight = input[2];
    int inputWidth = input[3];
    int filterHeight = getFilterHeight(filter);
    int filterWidth = getFilterWidth(filter);
    int outputChannels = output[1];
    int outputHeight = output[2];
    int outputWidth = output[3];
    int filterMultiplier = outputChannels / groups_;
    CHECK_EQ(inputChannels, groups_);

    // only support strideH() == strideW() and filterHeight == filterWidth.
    CHECK_EQ(strideH(), strideW());
    CHECK_EQ(paddingH(), paddingW());
    CHECK_EQ(filterHeight, filterWidth);

    float* inputData = inputs[0].data<float>();
    float* filterData = inputs[1].data<float>();
    float* outputData = outputs[0].data<float>();

    // padding the input, input -> inputPadding
    float* inputPadding = inputData;
    int padInputHeight =
        (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH();
    int padInputWidth =
        (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW();

    if (padInputHeight > inputHeight || padInputWidth > inputWidth) {
      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
      resizeBuffer<Device>(newSize);
      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
      if (strideH() == 1) {
        neon::Padding<float>::run(inputData,
                                  inputPadding,
                                  batchSize * inputChannels,
                                  inputHeight,
                                  inputWidth,
                                  padInputHeight,
                                  padInputWidth);
      } else if (strideH() == 2) {
        neon::StridePadding::run(inputData,
                                 inputPadding,
                                 batchSize * inputChannels,
                                 inputHeight,
                                 inputWidth,
                                 padInputHeight,
                                 padInputWidth);
      } else {
        LOG(FATAL) << "Not supported";
      }
    }

    std::function<void(
        const float*, const float*, int, int, int, int, int, int, float*)>
        DepthWiseConv;

    if (filterWidth == 3) {
      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
    } else if (filterWidth == 4) {
      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
    } else {
      LOG(FATAL) << "Not supported";
    }

    for (int i = 0; i < batchSize; i++) {
      DepthWiseConv(inputPadding,
                    filterData,
                    padInputHeight,
                    padInputWidth,
                    outputChannels,
                    outputHeight,
                    outputWidth,
                    filterMultiplier,
                    outputData);
      inputPadding += inputChannels * padInputHeight * padInputWidth;
      outputData += outputChannels * outputHeight * outputWidth;
    }
  }
};

#ifndef PADDLE_TYPE_DOUBLE

REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose,
                    CPU,
                    NeonDepthwiseConvTransposeFunction);

#endif

#endif

} // namespace paddle
@ -0,0 +1,107 @@

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "SwitchOrderLayer.h"
#include "paddle/utils/Stat.h"

namespace paddle {

REGISTER_LAYER(switch_order, SwitchOrderLayer);

bool SwitchOrderLayer::init(const LayerMap& layerMap,
                            const ParameterMap& parameterMap) {
  /* Initialize the basic parent class */
  Layer::init(layerMap, parameterMap);
  auto& img_conf = config_.inputs(0).image_conf();
  size_t inH =
      img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size();
  size_t inW = img_conf.img_size();
  size_t inC = img_conf.channels();
  inDims_ = TensorShape({0, inC, inH, inW});
  outDims_ = TensorShape(4);

  auto& reshape_conf = config_.reshape_conf();
  for (int i = 0; i < reshape_conf.height_axis_size(); i++) {
    heightAxis_.push_back(reshape_conf.height_axis(i));
  }
  for (int i = 0; i < reshape_conf.width_axis_size(); i++) {
    widthAxis_.push_back(reshape_conf.width_axis(i));
  }
  createFunction(nchw2nhwc_, "NCHW2NHWC", FuncConfig());
  createFunction(nhwc2nchw_, "NHWC2NCHW", FuncConfig());
  return true;
}

void SwitchOrderLayer::setOutDims() {
  // output order is NHWC: (batch, height, width, channels)
  outDims_.setDim(0, inDims_[0]);
  outDims_.setDim(1, inDims_[2]);
  outDims_.setDim(2, inDims_[3]);
  outDims_.setDim(3, inDims_[1]);
  reshapeHeight_ = 1;
  for (size_t i = 0; i < heightAxis_.size(); i++) {
    reshapeHeight_ *= outDims_[heightAxis_[i]];
  }
  output_.setFrameHeight(reshapeHeight_);
  reshapeWidth_ = 1;
  for (size_t i = 0; i < widthAxis_.size(); i++) {
    reshapeWidth_ *= outDims_[widthAxis_[i]];
  }
  output_.setFrameWidth(reshapeWidth_);
}

void SwitchOrderLayer::setInDims() {
  MatrixPtr input = inputLayers_[0]->getOutputValue();
  size_t batchSize = input->getHeight();
  inDims_.setDim(0, batchSize);

  int h = inputLayers_[0]->getOutput().getFrameHeight();
  if (h != 0) inDims_.setDim(2, h);
  int w = inputLayers_[0]->getOutput().getFrameWidth();
  if (w != 0) inDims_.setDim(3, w);
  int totalCount = input->getElementCnt();
  int channels = totalCount / (inDims_[0] * inDims_[2] * inDims_[3]);
  if (channels != 0) inDims_.setDim(1, channels);
}

void SwitchOrderLayer::forward(PassType passType) {
  Layer::forward(passType);
  setInDims();
  setOutDims();
  resetOutput(outDims_[0], outDims_[1] * outDims_[2] * outDims_[3]);
  if (heightAxis_.size() > 0) {
    getOutputValue()->reshape(reshapeHeight_, reshapeWidth_);
    getOutputGrad()->reshape(reshapeHeight_, reshapeWidth_);
  }

  // switch NCHW to NHWC
  BufferArgs inputs;
  BufferArgs outputs;
  inputs.addArg(*getInputValue(0), inDims_);
  outputs.addArg(*getOutputValue(), outDims_);
  nchw2nhwc_[0]->calc(inputs, outputs);
  forwardActivation();
}

void SwitchOrderLayer::backward(const UpdateCallback& callback) {
  (void)callback;
  backwardActivation();

  // switch NHWC to NCHW
  BufferArgs inputs;
  BufferArgs outputs;
  inputs.addArg(*getOutputGrad(), outDims_);
  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
  nhwc2nchw_[0]->calc(inputs, outputs);
}
} // namespace paddle
@ -0,0 +1,47 @@

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "Layer.h"

namespace paddle {

/**
 * \brief This layer switches the dimension order of an image input from
 *        'batch_size, channels, height, width' to
 *        'batch_size, height, width, channels'.
 */
class SwitchOrderLayer : public Layer {
public:
  explicit SwitchOrderLayer(const LayerConfig& config) : Layer(config) {}

  ~SwitchOrderLayer() {}

  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;
  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback = nullptr) override;
  void setInDims();
  void setOutDims();

protected:
  std::vector<std::shared_ptr<FunctionBase>> nchw2nhwc_;
  std::vector<std::shared_ptr<FunctionBase>> nhwc2nchw_;
  TensorShape inDims_;
  TensorShape outDims_;
  std::vector<int> heightAxis_;
  std::vector<int> widthAxis_;
  size_t reshapeHeight_;
  size_t reshapeWidth_;
};
} // namespace paddle