From 5ca4118451a38a8fa1e876fd5416028010ec218b Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 22 Aug 2017 17:27:04 +0800 Subject: [PATCH 01/11] Update Dockerfile of android to support building for arm64-v8a and armeabi. --- Dockerfile.android | 18 ++++--- paddle/scripts/docker/build_android.sh | 65 +++++++++++++++++++------- 2 files changed, 61 insertions(+), 22 deletions(-) diff --git a/Dockerfile.android b/Dockerfile.android index c0fa58c384..aa95abb366 100644 --- a/Dockerfile.android +++ b/Dockerfile.android @@ -4,9 +4,15 @@ MAINTAINER PaddlePaddle Authors ARG UBUNTU_MIRROR RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' +# ENV variables +ARG ANDROID_ABI + +ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"} + ENV HOME=/root \ ANDROID_NDK_HOME=/opt/android-ndk-linux \ - ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc + ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain-gcc \ + ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain-gcc RUN apt-get update && \ apt-get install -y \ @@ -15,12 +21,11 @@ RUN apt-get update && \ apt-get clean -y # Install Go and glide -RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ - tar -C /usr/local -xzf go.tgz && \ +RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ mkdir /root/gopath && \ mkdir /root/gopath/bin && \ - mkdir /root/gopath/src && \ - rm go.tgz + mkdir /root/gopath/src ENV GOROOT=/usr/local/go GOPATH=/root/gopath # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin @@ -42,7 +47,8 @@ RUN mkdir /opt/android-ndk-tmp && \ wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ unzip -q android-ndk-r14b-linux-x86_64.zip && \ mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ - ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \ + ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \ + ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-21 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \ rm -rf /opt/android-ndk-tmp && \ rm -rf ${ANDROID_NDK_HOME} diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index 5584e29e2a..593ae28e49 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -2,22 +2,55 @@ set -xe -mkdir -p /paddle/build_android -cd /paddle/build_android +mkdir -p /paddle/build_android/$ANDROID_ABI +cd /paddle/build_android/$ANDROID_ABI rm -rf /paddle/install 2>/dev/null || true -cmake -DCMAKE_SYSTEM_NAME=Android \ - -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ - -DANDROID_ABI=armeabi-v7a \ - -DANDROID_ARM_NEON=ON \ - -DANDROID_ARM_MODE=ON \ - -DHOST_C_COMPILER=/usr/bin/gcc \ - -DHOST_CXX_COMPILER=/usr/bin/g++ \ - -DCMAKE_INSTALL_PREFIX=/paddle/install \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_C_FLAGS_RELWITHDEBINFO="-O3" \ - -DCMAKE_CXX_FLAGS_RELWITHDEBINFO="-O3" \ - -DWITH_C_API=ON \ - -DWITH_SWIG_PY=OFF \ - .. 
+ +THIRD_PARTY_PATH=/paddle/third_party_android/$ANDROID_ABI + +if [ $ANDROID_ABI == "armeabi-v7a" ]; then + cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \ + -DANDROID_ABI=$ANDROID_ABI \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DHOST_C_COMPILER=/usr/bin/gcc \ + -DHOST_CXX_COMPILER=/usr/bin/g++ \ + -DCMAKE_INSTALL_PREFIX=/paddle/install \ + -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + /paddle +elif [ $ANDROID_ABI == "arm64-v7a" ]; then + cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM64_STANDALONE_TOOLCHAIN \ + -DANDROID_ABI=$ANDROID_ABI \ + -DANDROID_ARM_MODE=ON \ + -DHOST_C_COMPILER=/usr/bin/gcc \ + -DHOST_CXX_COMPILER=/usr/bin/g++ \ + -DCMAKE_INSTALL_PREFIX=/paddle/install \ + -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + /paddle +elif [ $ANDROID_ABI == "armeabi" ]; then + cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \ + -DANDROID_ABI=$ANDROID_ABI \ + -DANDROID_ARM_MODE=ON \ + -DHOST_C_COMPILER=/usr/bin/gcc \ + -DHOST_CXX_COMPILER=/usr/bin/g++ \ + -DCMAKE_INSTALL_PREFIX=/paddle/install \ + -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + /paddle +else + echo "Invalid ANDROID_ABI: $ANDROID_ABI" +fi + make -j `nproc` make install -j `nproc` From 8a4fad4248e942061586538e8de14a7d08052330 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 23 Aug 2017 19:43:57 +0800 Subject: [PATCH 02/11] Support to use clang for Android cross-compiling. --- cmake/cblas.cmake | 4 + cmake/external/warpctc.cmake | 1 + paddle/cuda/include/hl_cpu_gru.cuh | 166 ++++++++++++------------- paddle/function/MulOp.cpp | 37 +++--- paddle/math/MathFunctions.cpp | 4 + paddle/math/MathFunctions.h | 23 +++- paddle/math/Matrix.cpp | 18 ++- paddle/scripts/docker/build_android.sh | 24 ++-- 8 files changed, 155 insertions(+), 122 deletions(-) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 854066fd1d..ab111eccc0 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -13,6 +13,10 @@ # system paths. # +if(USE_EIGEN_FOR_BLAS) + return() +endif(USE_EIGEN_FOR_BLAS) + set(CBLAS_FOUND OFF) ## Find MKLML First. diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 2d7daed9bc..3cc652bed5 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -41,6 +41,7 @@ IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "App ELSE() SET(USE_OMP ON) ENDIF() +SET(USE_OMP OFF FORCE) ExternalProject_Add( extern_warpctc diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/cuda/include/hl_cpu_gru.cuh index c0a37ced2a..732799a28b 100644 --- a/paddle/cuda/include/hl_cpu_gru.cuh +++ b/paddle/cuda/include/hl_cpu_gru.cuh @@ -20,11 +20,11 @@ limitations under the License. 
*/ #include "paddle/math/MathFunctions.h" -#ifndef PADDLE_TYPE_DOUBLE -#define CBLAS_GEMM paddle::gemm -#else -#define CBLAS_GEMM paddle::gemm -#endif +// #ifndef PADDLE_TYPE_DOUBLE +// #define CBLAS_GEMM paddle::gemm +// #else +// #define CBLAS_GEMM paddle::gemm +// #endif template void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput, @@ -219,37 +219,37 @@ void hl_cpu_gru_forward(OpResetOutput opResetOutput, hl_activation_mode_t active_node, hl_activation_mode_t active_gate) { if (value.prevOutValue) { - CBLAS_GEMM(CblasNoTrans, - CblasNoTrans, - batchSize, - 2 * frameSize, - frameSize, - 1, - value.prevOutValue, - frameSize, - value.gateWeight, - frameSize * 2, - 1, - value.gateValue, - frameSize * 3); +// CBLAS_GEMM(CblasNoTrans, +// CblasNoTrans, +// batchSize, +// 2 * frameSize, +// frameSize, +// 1, +// value.prevOutValue, +// frameSize, +// value.gateWeight, +// frameSize * 2, +// 1, +// value.gateValue, +// frameSize * 3); } forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate); if (value.prevOutValue) { - CBLAS_GEMM(CblasNoTrans, - CblasNoTrans, - batchSize, - frameSize, - frameSize, - 1, - value.resetOutputValue, - frameSize, - value.stateWeight, - frameSize, - 1, - value.gateValue + frameSize * 2, - frameSize * 3); +// CBLAS_GEMM(CblasNoTrans, +// CblasNoTrans, +// batchSize, +// frameSize, +// frameSize, +// 1, +// value.resetOutputValue, +// frameSize, +// value.stateWeight, +// frameSize, +// 1, +// value.gateValue + frameSize * 2, +// frameSize * 3); } forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node); @@ -538,34 +538,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad, frameSize, batchSize, active_node); if (value.prevOutValue && grad.prevOutGrad) { - CBLAS_GEMM(CblasNoTrans, - CblasTrans, - batchSize, - frameSize, - frameSize, - 1, - grad.gateGrad + frameSize * 2, - frameSize * 3, - value.stateWeight, - frameSize, - 0, - grad.resetOutputGrad, - frameSize); +// CBLAS_GEMM(CblasNoTrans, +// CblasTrans, +// batchSize, +// frameSize, +// frameSize, +// 1, +// grad.gateGrad + frameSize * 2, +// frameSize * 3, +// value.stateWeight, +// frameSize, +// 0, +// grad.resetOutputGrad, +// frameSize); if (grad.stateWeightGrad) { - CBLAS_GEMM(CblasTrans, - CblasNoTrans, - frameSize, - frameSize, - batchSize, - 1, - value.resetOutputValue, - frameSize, - grad.gateGrad + frameSize * 2, - frameSize * 3, - 1, - grad.stateWeightGrad, - frameSize); +// CBLAS_GEMM(CblasTrans, +// CblasNoTrans, +// frameSize, +// frameSize, +// batchSize, +// 1, +// value.resetOutputValue, +// frameSize, +// grad.gateGrad + frameSize * 2, +// frameSize * 3, +// 1, +// grad.stateWeightGrad, +// frameSize); } } @@ -573,34 +573,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad, frameSize, batchSize, active_gate); if (grad.prevOutGrad && value.prevOutValue) { - CBLAS_GEMM(CblasNoTrans, - CblasTrans, - batchSize, - frameSize, - frameSize * 2, - 1, - grad.gateGrad, - frameSize * 3, - value.gateWeight, - frameSize * 2, - 1, - grad.prevOutGrad, - frameSize); +// CBLAS_GEMM(CblasNoTrans, +// CblasTrans, +// batchSize, +// frameSize, +// frameSize * 2, +// 1, +// grad.gateGrad, +// frameSize * 3, +// value.gateWeight, +// frameSize * 2, +// 1, +// grad.prevOutGrad, +// frameSize); if (grad.gateWeightGrad) { - CBLAS_GEMM(CblasTrans, - CblasNoTrans, - frameSize, - frameSize * 2, - batchSize, - 1, - value.prevOutValue, - frameSize, - grad.gateGrad, - frameSize * 3, - 1, - grad.gateWeightGrad, - frameSize * 2); +// CBLAS_GEMM(CblasTrans, +// 
CblasNoTrans, +// frameSize, +// frameSize * 2, +// batchSize, +// 1, +// value.prevOutValue, +// frameSize, +// grad.gateGrad, +// frameSize * 3, +// 1, +// grad.gateWeightGrad, +// frameSize * 2); } } } diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp index 91b4b8ed91..25e41edad5 100644 --- a/paddle/function/MulOp.cpp +++ b/paddle/function/MulOp.cpp @@ -13,18 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "MulOp.h" -/// todo(tianbing), delete it -#include -#include "paddle/math/MathFunctions.h" +#include "GemmFunctor.h" #include "paddle/math/SIMDFunctions.h" #include "paddle/utils/ThreadLocal.h" -#ifndef PADDLE_TYPE_DOUBLE -#define GEMM paddle::gemm -#else -#define GEMM paddle::gemm -#endif - namespace { inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { for (unsigned int i = 0; i < len; ++i) { @@ -114,19 +106,20 @@ void MulOp(CpuMatrix& out, real scaleT, bool aTrans, bool bTrans) { - GEMM(aTrans ? CblasTrans : CblasNoTrans, - bTrans ? CblasTrans : CblasNoTrans, - out.getHeight(), - out.getWidth(), - !aTrans ? a.getWidth() : a.getHeight(), - scaleAB, - a.getData(), - a.getStride(), - b.getData(), - b.getStride(), - scaleT, - out.getData(), - out.getStride()); + BlasGemm::compute( + aTrans, + bTrans, + out.getHeight(), + out.getWidth(), + !aTrans ? a.getWidth() : a.getHeight(), + scaleAB, + a.getData(), + a.getStride(), + b.getData(), + b.getStride(), + scaleT, + out.getData(), + out.getStride()); } /// dense matrix (+)= sparse matrix * dense matrix diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index c8ba1074a1..c2f17beeb8 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -84,6 +84,7 @@ LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP) namespace paddle { +#ifndef PADDLE_USE_EIGEN_FOR_BLAS template <> void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, @@ -143,6 +144,7 @@ void gemm(const CBLAS_TRANSPOSE transA, C, ldc); } +#endif template <> int getrf(const CBLAS_ORDER order, @@ -182,6 +184,7 @@ int getri(const CBLAS_ORDER order, return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv); } +#ifndef PADDLE_USE_EIGEN_FOR_BLAS template <> void axpy(const int n, const float alpha, const float* x, float* y) { cblas_saxpy(n, alpha, x, 1, y, 1); @@ -201,6 +204,7 @@ template <> double dotProduct(const int n, const double* x, const double* y) { return cblas_ddot(n, x, 1, y, 1); } +#endif #if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML) diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index 637643838f..9297ae78c2 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -40,7 +40,14 @@ extern "C" { #ifndef LAPACK_FOUND extern "C" { +#ifndef PADDLE_USE_EIGEN_FOR_BLAS #include +#else +typedef enum CBLAS_ORDER { + CblasRowMajor = 101, + CblasColMajor = 102 +} CBLAS_ORDER; +#endif int LAPACKE_sgetrf( int matrix_layout, int m, int n, float* a, int lda, int* ipiv); int LAPACKE_dgetrf( @@ -56,6 +63,7 @@ int LAPACKE_dgetri( namespace paddle { +#ifndef PADDLE_USE_EIGEN_FOR_BLAS template void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, @@ -70,6 +78,7 @@ void gemm(const CBLAS_TRANSPOSE transA, const T beta, T* C, const int ldc); +#endif template int getrf(const CBLAS_ORDER Order, @@ -84,10 +93,20 @@ int getri( const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv); template -void axpy(const int n, const T alpha, const T* x, T* y); +void 
axpy(const int n, const T alpha, const T* x, T* y) { + /// y = y + alpha * x + for (int i = 0; i < n; i++) { + y[i] = y[i] + alpha * x[i]; + } +} template -T dotProduct(const int n, const T* x, const T* y); +T dotProduct(const int n, const T* x, const T* y) { + T result = static_cast(0); + for (int i = 0; i < n; i++) { + result += x[i] * y[i]; + } +} template void vExp(const int n, const T* a, T* r); diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 27f7d95b75..fbf3accc9a 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -28,6 +28,7 @@ limitations under the License. */ #include "hl_top_k.h" #include "paddle/utils/Logging.h" +#include "paddle/function/GemmFunctor.h" #include "paddle/utils/ThreadLocal.h" #include "SIMDFunctions.h" @@ -2222,24 +2223,29 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { CHECK(!isTransposed()) << "Not supported"; size_t a_col, b_col, a_row, b_row; - CBLAS_TRANSPOSE a_trans, b_trans; + // CBLAS_TRANSPOSE a_trans, b_trans; + bool a_trans, b_trans; if (!a->isTransposed()) { a_col = a->getWidth(); a_row = a->getHeight(); - a_trans = CblasNoTrans; + // a_trans = CblasNoTrans; + a_trans = false; } else { a_col = a->getHeight(); a_row = a->getWidth(); - a_trans = CblasTrans; + // a_trans = CblasTrans; + a_trans = true; } if (!b->isTransposed()) { b_col = b->getWidth(); b_row = b->getHeight(); - b_trans = CblasNoTrans; + // b_trans = CblasNoTrans; + b_trans = false; } else { b_col = b->getHeight(); b_row = b->getWidth(); - b_trans = CblasTrans; + // b_trans = CblasTrans; + b_trans = true; } CHECK_EQ(a_col, b_row); @@ -2256,7 +2262,7 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { int lda = a->getStride(); int ldb = b->getStride(); int ldc = getStride(); - gemm( + BlasGemm::compute( a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc); } diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index 593ae28e49..79f5ab12e9 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -2,9 +2,9 @@ set -xe -mkdir -p /paddle/build_android/$ANDROID_ABI -cd /paddle/build_android/$ANDROID_ABI -rm -rf /paddle/install 2>/dev/null || true +rm -rf /paddle/build_android 2>/dev/null || true +mkdir -p /paddle/build_android +cd /paddle/build_android THIRD_PARTY_PATH=/paddle/third_party_android/$ANDROID_ABI @@ -14,19 +14,25 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then -DANDROID_ABI=$ANDROID_ABI \ -DANDROID_ARM_NEON=ON \ -DANDROID_ARM_MODE=ON \ + -DCMAKE_C_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang \ + -DCMAKE_CXX_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang++ \ -DHOST_C_COMPILER=/usr/bin/gcc \ -DHOST_CXX_COMPILER=/usr/bin/g++ \ -DCMAKE_INSTALL_PREFIX=/paddle/install \ -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \ -DCMAKE_BUILD_TYPE=Release \ + -DUSE_EIGEN_FOR_BLAS=ON \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - /paddle -elif [ $ANDROID_ABI == "arm64-v7a" ]; then + -DWITH_STYLE_CHECK=OFF \ + .. 
+elif [ $ANDROID_ABI == "arm64-v8a" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM64_STANDALONE_TOOLCHAIN \ -DANDROID_ABI=$ANDROID_ABI \ -DANDROID_ARM_MODE=ON \ + -DCMAKE_C_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang \ + -DCMAKE_CXX_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang++ \ -DHOST_C_COMPILER=/usr/bin/gcc \ -DHOST_CXX_COMPILER=/usr/bin/g++ \ -DCMAKE_INSTALL_PREFIX=/paddle/install \ @@ -34,7 +40,7 @@ elif [ $ANDROID_ABI == "arm64-v7a" ]; then -DCMAKE_BUILD_TYPE=Release \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - /paddle + .. elif [ $ANDROID_ABI == "armeabi" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \ @@ -47,10 +53,10 @@ elif [ $ANDROID_ABI == "armeabi" ]; then -DCMAKE_BUILD_TYPE=Release \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - /paddle + .. else echo "Invalid ANDROID_ABI: $ANDROID_ABI" fi -make -j `nproc` -make install -j `nproc` +make VERBOSE=1 +make install From f241773c4f1803631bba968bca1d5621a0d3ced5 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 23 Aug 2017 19:43:57 +0800 Subject: [PATCH 03/11] Support to use clang for Android cross-compiling. --- Dockerfile.android | 4 +- cmake/cblas.cmake | 4 + cmake/external/warpctc.cmake | 1 + paddle/cuda/include/hl_cpu_gru.cuh | 166 ++++++++++++------------- paddle/function/MulOp.cpp | 37 +++--- paddle/math/MathFunctions.cpp | 4 + paddle/math/MathFunctions.h | 23 +++- paddle/math/Matrix.cpp | 18 ++- paddle/scripts/docker/build_android.sh | 51 ++++++-- 9 files changed, 181 insertions(+), 127 deletions(-) diff --git a/Dockerfile.android b/Dockerfile.android index aa95abb366..6013215d9d 100644 --- a/Dockerfile.android +++ b/Dockerfile.android @@ -47,8 +47,8 @@ RUN mkdir /opt/android-ndk-tmp && \ wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ unzip -q android-ndk-r14b-linux-x86_64.zip && \ mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ - ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \ - ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-21 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \ + ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-23 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \ + ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-23 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \ rm -rf /opt/android-ndk-tmp && \ rm -rf ${ANDROID_NDK_HOME} diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 854066fd1d..ab111eccc0 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -13,6 +13,10 @@ # system paths. # +if(USE_EIGEN_FOR_BLAS) + return() +endif(USE_EIGEN_FOR_BLAS) + set(CBLAS_FOUND OFF) ## Find MKLML First. 
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 2d7daed9bc..3cc652bed5 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -41,6 +41,7 @@ IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "App ELSE() SET(USE_OMP ON) ENDIF() +SET(USE_OMP OFF FORCE) ExternalProject_Add( extern_warpctc diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/cuda/include/hl_cpu_gru.cuh index c0a37ced2a..732799a28b 100644 --- a/paddle/cuda/include/hl_cpu_gru.cuh +++ b/paddle/cuda/include/hl_cpu_gru.cuh @@ -20,11 +20,11 @@ limitations under the License. */ #include "paddle/math/MathFunctions.h" -#ifndef PADDLE_TYPE_DOUBLE -#define CBLAS_GEMM paddle::gemm -#else -#define CBLAS_GEMM paddle::gemm -#endif +// #ifndef PADDLE_TYPE_DOUBLE +// #define CBLAS_GEMM paddle::gemm +// #else +// #define CBLAS_GEMM paddle::gemm +// #endif template void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput, @@ -219,37 +219,37 @@ void hl_cpu_gru_forward(OpResetOutput opResetOutput, hl_activation_mode_t active_node, hl_activation_mode_t active_gate) { if (value.prevOutValue) { - CBLAS_GEMM(CblasNoTrans, - CblasNoTrans, - batchSize, - 2 * frameSize, - frameSize, - 1, - value.prevOutValue, - frameSize, - value.gateWeight, - frameSize * 2, - 1, - value.gateValue, - frameSize * 3); +// CBLAS_GEMM(CblasNoTrans, +// CblasNoTrans, +// batchSize, +// 2 * frameSize, +// frameSize, +// 1, +// value.prevOutValue, +// frameSize, +// value.gateWeight, +// frameSize * 2, +// 1, +// value.gateValue, +// frameSize * 3); } forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate); if (value.prevOutValue) { - CBLAS_GEMM(CblasNoTrans, - CblasNoTrans, - batchSize, - frameSize, - frameSize, - 1, - value.resetOutputValue, - frameSize, - value.stateWeight, - frameSize, - 1, - value.gateValue + frameSize * 2, - frameSize * 3); +// CBLAS_GEMM(CblasNoTrans, +// CblasNoTrans, +// batchSize, +// frameSize, +// frameSize, +// 1, +// value.resetOutputValue, +// frameSize, +// value.stateWeight, +// frameSize, +// 1, +// value.gateValue + frameSize * 2, +// frameSize * 3); } forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node); @@ -538,34 +538,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad, frameSize, batchSize, active_node); if (value.prevOutValue && grad.prevOutGrad) { - CBLAS_GEMM(CblasNoTrans, - CblasTrans, - batchSize, - frameSize, - frameSize, - 1, - grad.gateGrad + frameSize * 2, - frameSize * 3, - value.stateWeight, - frameSize, - 0, - grad.resetOutputGrad, - frameSize); +// CBLAS_GEMM(CblasNoTrans, +// CblasTrans, +// batchSize, +// frameSize, +// frameSize, +// 1, +// grad.gateGrad + frameSize * 2, +// frameSize * 3, +// value.stateWeight, +// frameSize, +// 0, +// grad.resetOutputGrad, +// frameSize); if (grad.stateWeightGrad) { - CBLAS_GEMM(CblasTrans, - CblasNoTrans, - frameSize, - frameSize, - batchSize, - 1, - value.resetOutputValue, - frameSize, - grad.gateGrad + frameSize * 2, - frameSize * 3, - 1, - grad.stateWeightGrad, - frameSize); +// CBLAS_GEMM(CblasTrans, +// CblasNoTrans, +// frameSize, +// frameSize, +// batchSize, +// 1, +// value.resetOutputValue, +// frameSize, +// grad.gateGrad + frameSize * 2, +// frameSize * 3, +// 1, +// grad.stateWeightGrad, +// frameSize); } } @@ -573,34 +573,34 @@ void hl_cpu_gru_backward(OpStateGrad opStateGrad, frameSize, batchSize, active_gate); if (grad.prevOutGrad && value.prevOutValue) { - CBLAS_GEMM(CblasNoTrans, - CblasTrans, - batchSize, - frameSize, - 
frameSize * 2, - 1, - grad.gateGrad, - frameSize * 3, - value.gateWeight, - frameSize * 2, - 1, - grad.prevOutGrad, - frameSize); +// CBLAS_GEMM(CblasNoTrans, +// CblasTrans, +// batchSize, +// frameSize, +// frameSize * 2, +// 1, +// grad.gateGrad, +// frameSize * 3, +// value.gateWeight, +// frameSize * 2, +// 1, +// grad.prevOutGrad, +// frameSize); if (grad.gateWeightGrad) { - CBLAS_GEMM(CblasTrans, - CblasNoTrans, - frameSize, - frameSize * 2, - batchSize, - 1, - value.prevOutValue, - frameSize, - grad.gateGrad, - frameSize * 3, - 1, - grad.gateWeightGrad, - frameSize * 2); +// CBLAS_GEMM(CblasTrans, +// CblasNoTrans, +// frameSize, +// frameSize * 2, +// batchSize, +// 1, +// value.prevOutValue, +// frameSize, +// grad.gateGrad, +// frameSize * 3, +// 1, +// grad.gateWeightGrad, +// frameSize * 2); } } } diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp index 91b4b8ed91..25e41edad5 100644 --- a/paddle/function/MulOp.cpp +++ b/paddle/function/MulOp.cpp @@ -13,18 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "MulOp.h" -/// todo(tianbing), delete it -#include -#include "paddle/math/MathFunctions.h" +#include "GemmFunctor.h" #include "paddle/math/SIMDFunctions.h" #include "paddle/utils/ThreadLocal.h" -#ifndef PADDLE_TYPE_DOUBLE -#define GEMM paddle::gemm -#else -#define GEMM paddle::gemm -#endif - namespace { inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { for (unsigned int i = 0; i < len; ++i) { @@ -114,19 +106,20 @@ void MulOp(CpuMatrix& out, real scaleT, bool aTrans, bool bTrans) { - GEMM(aTrans ? CblasTrans : CblasNoTrans, - bTrans ? CblasTrans : CblasNoTrans, - out.getHeight(), - out.getWidth(), - !aTrans ? a.getWidth() : a.getHeight(), - scaleAB, - a.getData(), - a.getStride(), - b.getData(), - b.getStride(), - scaleT, - out.getData(), - out.getStride()); + BlasGemm::compute( + aTrans, + bTrans, + out.getHeight(), + out.getWidth(), + !aTrans ? 
a.getWidth() : a.getHeight(), + scaleAB, + a.getData(), + a.getStride(), + b.getData(), + b.getStride(), + scaleT, + out.getData(), + out.getStride()); } /// dense matrix (+)= sparse matrix * dense matrix diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index c8ba1074a1..c2f17beeb8 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -84,6 +84,7 @@ LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP) namespace paddle { +#ifndef PADDLE_USE_EIGEN_FOR_BLAS template <> void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, @@ -143,6 +144,7 @@ void gemm(const CBLAS_TRANSPOSE transA, C, ldc); } +#endif template <> int getrf(const CBLAS_ORDER order, @@ -182,6 +184,7 @@ int getri(const CBLAS_ORDER order, return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv); } +#ifndef PADDLE_USE_EIGEN_FOR_BLAS template <> void axpy(const int n, const float alpha, const float* x, float* y) { cblas_saxpy(n, alpha, x, 1, y, 1); @@ -201,6 +204,7 @@ template <> double dotProduct(const int n, const double* x, const double* y) { return cblas_ddot(n, x, 1, y, 1); } +#endif #if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML) diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index 637643838f..9297ae78c2 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -40,7 +40,14 @@ extern "C" { #ifndef LAPACK_FOUND extern "C" { +#ifndef PADDLE_USE_EIGEN_FOR_BLAS #include +#else +typedef enum CBLAS_ORDER { + CblasRowMajor = 101, + CblasColMajor = 102 +} CBLAS_ORDER; +#endif int LAPACKE_sgetrf( int matrix_layout, int m, int n, float* a, int lda, int* ipiv); int LAPACKE_dgetrf( @@ -56,6 +63,7 @@ int LAPACKE_dgetri( namespace paddle { +#ifndef PADDLE_USE_EIGEN_FOR_BLAS template void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, @@ -70,6 +78,7 @@ void gemm(const CBLAS_TRANSPOSE transA, const T beta, T* C, const int ldc); +#endif template int getrf(const CBLAS_ORDER Order, @@ -84,10 +93,20 @@ int getri( const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv); template -void axpy(const int n, const T alpha, const T* x, T* y); +void axpy(const int n, const T alpha, const T* x, T* y) { + /// y = y + alpha * x + for (int i = 0; i < n; i++) { + y[i] = y[i] + alpha * x[i]; + } +} template -T dotProduct(const int n, const T* x, const T* y); +T dotProduct(const int n, const T* x, const T* y) { + T result = static_cast(0); + for (int i = 0; i < n; i++) { + result += x[i] * y[i]; + } +} template void vExp(const int n, const T* a, T* r); diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 27f7d95b75..fbf3accc9a 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -28,6 +28,7 @@ limitations under the License. 
*/ #include "hl_top_k.h" #include "paddle/utils/Logging.h" +#include "paddle/function/GemmFunctor.h" #include "paddle/utils/ThreadLocal.h" #include "SIMDFunctions.h" @@ -2222,24 +2223,29 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { CHECK(!isTransposed()) << "Not supported"; size_t a_col, b_col, a_row, b_row; - CBLAS_TRANSPOSE a_trans, b_trans; + // CBLAS_TRANSPOSE a_trans, b_trans; + bool a_trans, b_trans; if (!a->isTransposed()) { a_col = a->getWidth(); a_row = a->getHeight(); - a_trans = CblasNoTrans; + // a_trans = CblasNoTrans; + a_trans = false; } else { a_col = a->getHeight(); a_row = a->getWidth(); - a_trans = CblasTrans; + // a_trans = CblasTrans; + a_trans = true; } if (!b->isTransposed()) { b_col = b->getWidth(); b_row = b->getHeight(); - b_trans = CblasNoTrans; + // b_trans = CblasNoTrans; + b_trans = false; } else { b_col = b->getHeight(); b_row = b->getWidth(); - b_trans = CblasTrans; + // b_trans = CblasTrans; + b_trans = true; } CHECK_EQ(a_col, b_row); @@ -2256,7 +2262,7 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { int lda = a->getStride(); int ldb = b->getStride(); int ldc = getStride(); - gemm( + BlasGemm::compute( a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc); } diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index 593ae28e49..a61c7c40e9 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -2,11 +2,31 @@ set -xe -mkdir -p /paddle/build_android/$ANDROID_ABI -cd /paddle/build_android/$ANDROID_ABI -rm -rf /paddle/install 2>/dev/null || true +COMPILER=gcc +USE_EIGEN=ON +if [ $COMPILER == clang ]; then + SUFFIX=_clang + C_COMPILER=clang + CXX_COMPILER=clang++ +else + SUFFIX=_gcc + C_COMPILER=gcc + CXX_COMPILER=g++ +fi +if [ $USE_EIGEN == ON ]; then + SUFFIX=${SUFFIX}_eigen +else + SUFFIX=${SUFFIX}_openblas +fi -THIRD_PARTY_PATH=/paddle/third_party_android/$ANDROID_ABI +BUILD_ROOT=/paddle/build_android$SUFFIX +DEST_ROOT=/paddle/install$SUFFIX + +rm -rf $BUILD_ROOT 2>/dev/null || true +mkdir -p $BUILD_ROOT +cd $BUILD_ROOT + +THIRD_PARTY_PATH=/paddle/third_party_android$SUFFIX/$ANDROID_ABI if [ $ANDROID_ABI == "armeabi-v7a" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ @@ -14,27 +34,34 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then -DANDROID_ABI=$ANDROID_ABI \ -DANDROID_ARM_NEON=ON \ -DANDROID_ARM_MODE=ON \ + -DCMAKE_C_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-${C_COMPILER} \ + -DCMAKE_CXX_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-${CXX_COMPILER} \ -DHOST_C_COMPILER=/usr/bin/gcc \ -DHOST_CXX_COMPILER=/usr/bin/g++ \ - -DCMAKE_INSTALL_PREFIX=/paddle/install \ + -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \ -DCMAKE_BUILD_TYPE=Release \ + -DUSE_EIGEN_FOR_BLAS=${USE_EIGEN} \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - /paddle -elif [ $ANDROID_ABI == "arm64-v7a" ]; then + -DWITH_STYLE_CHECK=OFF \ + .. 
+elif [ $ANDROID_ABI == "arm64-v8a" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM64_STANDALONE_TOOLCHAIN \ -DANDROID_ABI=$ANDROID_ABI \ -DANDROID_ARM_MODE=ON \ + -DCMAKE_C_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-${C_COMPILER} \ + -DCMAKE_CXX_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-${CXX_COMPILER} \ -DHOST_C_COMPILER=/usr/bin/gcc \ -DHOST_CXX_COMPILER=/usr/bin/g++ \ - -DCMAKE_INSTALL_PREFIX=/paddle/install \ + -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \ -DCMAKE_BUILD_TYPE=Release \ + -DUSE_EIGEN_FOR_BLAS=${USE_EIGEN} \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - /paddle + .. elif [ $ANDROID_ABI == "armeabi" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \ @@ -47,10 +74,10 @@ elif [ $ANDROID_ABI == "armeabi" ]; then -DCMAKE_BUILD_TYPE=Release \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ - /paddle + .. else echo "Invalid ANDROID_ABI: $ANDROID_ABI" fi -make -j `nproc` -make install -j `nproc` +make VERBOSE=1 -j2 +make install -j2 From c54c7d91a0c098bf22ba399aee15ebb421de1bfb Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Fri, 1 Sep 2017 16:01:53 +0800 Subject: [PATCH 04/11] Use template to deliver const argument instead, to remove the compiling error "argument to __builtin_neon_vgetq_lane_f32 must be a constant integer". --- paddle/function/neon/NeonDepthwiseConv.cpp | 100 ++++++++++----------- paddle/function/neon/neon_util.h | 4 +- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp index f09e98587d..14e5198e1b 100644 --- a/paddle/function/neon/NeonDepthwiseConv.cpp +++ b/paddle/function/neon/NeonDepthwiseConv.cpp @@ -116,15 +116,15 @@ struct DepthwiseConvKernel<3, 1> { float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); - tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]); + tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]); + tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]); + tmp2 = vmlaq_laneq_f32<0>(tmp2, input[1][0], k[1]); + tmp1 = vmlaq_laneq_f32<1>(tmp1, input[1][1], k[1]); + tmp2 = vmlaq_laneq_f32<2>(tmp2, input[1][2], k[1]); + tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]); + tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]); + tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]); tmp1 = vaddq_f32(tmp1, tmp2); vst1q_f32(outputData, tmp1); @@ -223,15 +223,15 @@ struct DepthwiseConvKernel<3, 2> { float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0); - tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); - 
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); + tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]); + tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]); + tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]); + tmp2 = vmlaq_laneq_f32<0>(tmp2, input[1][0], k[1]); + tmp1 = vmlaq_laneq_f32<1>(tmp1, input[1][1], k[1]); + tmp2 = vmlaq_laneq_f32<2>(tmp2, input[1][2], k[1]); + tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]); + tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]); + tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]); tmp1 = vaddq_f32(tmp1, tmp2); vst1q_f32(outputData, tmp1); @@ -316,22 +316,22 @@ struct DepthwiseConvKernel<4, 1> { float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); - tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); - tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); + tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]); + tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]); + tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]); + tmp2 = vmlaq_laneq_f32<3>(tmp2, input[0][3], k[0]); + tmp1 = vmlaq_laneq_f32<0>(tmp1, input[1][0], k[1]); + tmp2 = vmlaq_laneq_f32<1>(tmp2, input[1][1], k[1]); + tmp1 = vmlaq_laneq_f32<2>(tmp1, input[1][2], k[1]); + tmp2 = vmlaq_laneq_f32<3>(tmp2, input[1][3], k[1]); + tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]); + tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]); + tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]); + tmp2 = vmlaq_laneq_f32<3>(tmp2, input[2][3], k[2]); + tmp1 = vmlaq_laneq_f32<0>(tmp1, input[3][0], k[3]); + tmp2 = vmlaq_laneq_f32<1>(tmp2, input[3][1], k[3]); + tmp1 = vmlaq_laneq_f32<2>(tmp1, input[3][2], k[3]); + tmp2 = vmlaq_laneq_f32<3>(tmp2, input[3][3], k[3]); tmp1 = vaddq_f32(tmp1, tmp2); vst1q_f32(outputData, tmp1); @@ -431,22 +431,22 @@ struct DepthwiseConvKernel<4, 2> { float32x4_t tmp1 = vdupq_n_f32(0.f); float32x4_t tmp2 = vdupq_n_f32(0.f); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3); - tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3); - tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0); - tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1); - tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2); - tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3); + tmp1 = vmlaq_laneq_f32<0>(tmp1, 
input[0][0], k[0]); + tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]); + tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]); + tmp2 = vmlaq_laneq_f32<3>(tmp2, input[0][3], k[0]); + tmp1 = vmlaq_laneq_f32<0>(tmp1, input[1][0], k[1]); + tmp2 = vmlaq_laneq_f32<1>(tmp2, input[1][1], k[1]); + tmp1 = vmlaq_laneq_f32<2>(tmp1, input[1][2], k[1]); + tmp2 = vmlaq_laneq_f32<3>(tmp2, input[1][3], k[1]); + tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]); + tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]); + tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]); + tmp2 = vmlaq_laneq_f32<3>(tmp2, input[2][3], k[2]); + tmp1 = vmlaq_laneq_f32<0>(tmp1, input[3][0], k[3]); + tmp2 = vmlaq_laneq_f32<1>(tmp2, input[3][1], k[3]); + tmp1 = vmlaq_laneq_f32<2>(tmp1, input[3][2], k[3]); + tmp2 = vmlaq_laneq_f32<3>(tmp2, input[3][3], k[3]); tmp1 = vaddq_f32(tmp1, tmp2); vst1q_f32(outputData, tmp1); diff --git a/paddle/function/neon/neon_util.h b/paddle/function/neon/neon_util.h index 56b3febe2d..dbe017170b 100644 --- a/paddle/function/neon/neon_util.h +++ b/paddle/function/neon/neon_util.h @@ -33,10 +33,10 @@ inline float32_t vaddvq_f32(float32x4_t a) { return vget_lane_f32(vpadd_f32(v, v), 0); } +template inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, - float32x4_t v, - const int lane) { + float32x4_t v) { return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane)); } #endif From 8b15ac82fa831f95493c2bd218b93655db0d739e Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Fri, 1 Sep 2017 17:50:01 +0800 Subject: [PATCH 05/11] Move the definition of hl_cpu_gru_forward and hl_cpu_gru_backward to function/GruFunctor.h. --- paddle/cuda/include/hl_cpu_gru.cuh | 134 --------------------- paddle/function/GruFunctor.h | 160 +++++++++++++++++++++++++ paddle/gserver/layers/GruCompute.cpp | 32 ++--- paddle/scripts/docker/build_android.sh | 25 +--- 4 files changed, 181 insertions(+), 170 deletions(-) create mode 100644 paddle/function/GruFunctor.h diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/cuda/include/hl_cpu_gru.cuh index 732799a28b..347b038598 100644 --- a/paddle/cuda/include/hl_cpu_gru.cuh +++ b/paddle/cuda/include/hl_cpu_gru.cuh @@ -18,14 +18,6 @@ limitations under the License. 
*/ #ifndef __NVCC__ -#include "paddle/math/MathFunctions.h" - -// #ifndef PADDLE_TYPE_DOUBLE -// #define CBLAS_GEMM paddle::gemm -// #else -// #define CBLAS_GEMM paddle::gemm -// #endif - template void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput, real *gateValue, @@ -210,51 +202,6 @@ inline void forward_final_output(OpFinalOutput opFinalOutput, } } -template -void hl_cpu_gru_forward(OpResetOutput opResetOutput, - OpFinalOutput opFinalOutput, - hl_gru_value value, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate) { - if (value.prevOutValue) { -// CBLAS_GEMM(CblasNoTrans, -// CblasNoTrans, -// batchSize, -// 2 * frameSize, -// frameSize, -// 1, -// value.prevOutValue, -// frameSize, -// value.gateWeight, -// frameSize * 2, -// 1, -// value.gateValue, -// frameSize * 3); - } - - forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate); - - if (value.prevOutValue) { -// CBLAS_GEMM(CblasNoTrans, -// CblasNoTrans, -// batchSize, -// frameSize, -// frameSize, -// 1, -// value.resetOutputValue, -// frameSize, -// value.stateWeight, -// frameSize, -// 1, -// value.gateValue + frameSize * 2, -// frameSize * 3); - } - - forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node); -} - template void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, real *gateValue, @@ -524,87 +471,6 @@ inline void backward_reset_grad(OpResetGrad opResetGrad, } } } - -template -void hl_cpu_gru_backward(OpStateGrad opStateGrad, - OpResetGrad opResetGrad, - hl_gru_value value, - hl_gru_grad grad, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate) { - backward_state_grad(opStateGrad, value, grad, - frameSize, batchSize, active_node); - - if (value.prevOutValue && grad.prevOutGrad) { -// CBLAS_GEMM(CblasNoTrans, -// CblasTrans, -// batchSize, -// frameSize, -// frameSize, -// 1, -// grad.gateGrad + frameSize * 2, -// frameSize * 3, -// value.stateWeight, -// frameSize, -// 0, -// grad.resetOutputGrad, -// frameSize); - - if (grad.stateWeightGrad) { -// CBLAS_GEMM(CblasTrans, -// CblasNoTrans, -// frameSize, -// frameSize, -// batchSize, -// 1, -// value.resetOutputValue, -// frameSize, -// grad.gateGrad + frameSize * 2, -// frameSize * 3, -// 1, -// grad.stateWeightGrad, -// frameSize); - } - } - - backward_reset_grad(opResetGrad, value, grad, - frameSize, batchSize, active_gate); - - if (grad.prevOutGrad && value.prevOutValue) { -// CBLAS_GEMM(CblasNoTrans, -// CblasTrans, -// batchSize, -// frameSize, -// frameSize * 2, -// 1, -// grad.gateGrad, -// frameSize * 3, -// value.gateWeight, -// frameSize * 2, -// 1, -// grad.prevOutGrad, -// frameSize); - - if (grad.gateWeightGrad) { -// CBLAS_GEMM(CblasTrans, -// CblasNoTrans, -// frameSize, -// frameSize * 2, -// batchSize, -// 1, -// value.prevOutValue, -// frameSize, -// grad.gateGrad, -// frameSize * 3, -// 1, -// grad.gateWeightGrad, -// frameSize * 2); - } - } -} - #endif #endif // HL_CPU_GRU_CUH_ diff --git a/paddle/function/GruFunctor.h b/paddle/function/GruFunctor.h new file mode 100644 index 0000000000..11f6174dbd --- /dev/null +++ b/paddle/function/GruFunctor.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "GemmFunctor.h" +#include "GruFunctor.h" +#include "hl_cpu_gru.cuh" + +namespace paddle { + +template +struct GruFunctor { + template + static void compute(OpResetOutput opResetOutput, + OpFinalOutput opFinalOutput, + hl_gru_value value, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate) { +#ifndef __NVCC__ + if (value.prevOutValue) { + BlasGemm::compute(false, + false, + batchSize, + 2 * frameSize, + frameSize, + 1, + value.prevOutValue, + frameSize, + value.gateWeight, + frameSize * 2, + 1, + value.gateValue, + frameSize * 3); + } + + forward_reset_output( + opResetOutput, value, frameSize, batchSize, active_gate); + + if (value.prevOutValue) { + BlasGemm::compute(false, + false, + batchSize, + frameSize, + frameSize, + 1, + value.resetOutputValue, + frameSize, + value.stateWeight, + frameSize, + 1, + value.gateValue + frameSize * 2, + frameSize * 3); + } + + forward_final_output( + opFinalOutput, value, frameSize, batchSize, active_node); +#endif + } +}; + +template +struct GruGradFunctor { + template + static void compute(OpStateGrad opStateGrad, + OpResetGrad opResetGrad, + hl_gru_value value, + hl_gru_grad grad, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate) { +#ifndef __NVCC__ + backward_state_grad( + opStateGrad, value, grad, frameSize, batchSize, active_node); + + if (value.prevOutValue && grad.prevOutGrad) { + BlasGemm::compute(false, + true, + batchSize, + frameSize, + frameSize, + 1, + grad.gateGrad + frameSize * 2, + frameSize * 3, + value.stateWeight, + frameSize, + 0, + grad.resetOutputGrad, + frameSize); + + if (grad.stateWeightGrad) { + BlasGemm::compute(true, + false, + frameSize, + frameSize, + batchSize, + 1, + value.resetOutputValue, + frameSize, + grad.gateGrad + frameSize * 2, + frameSize * 3, + 1, + grad.stateWeightGrad, + frameSize); + } + } + + backward_reset_grad( + opResetGrad, value, grad, frameSize, batchSize, active_gate); + + if (grad.prevOutGrad && value.prevOutValue) { + BlasGemm::compute(false, + true, + batchSize, + frameSize, + frameSize * 2, + 1, + grad.gateGrad, + frameSize * 3, + value.gateWeight, + frameSize * 2, + 1, + grad.prevOutGrad, + frameSize); + + if (grad.gateWeightGrad) { + BlasGemm::compute(true, + false, + frameSize, + frameSize * 2, + batchSize, + 1, + value.prevOutValue, + frameSize, + grad.gateGrad, + frameSize * 3, + 1, + grad.gateWeightGrad, + frameSize * 2); + } + } +#endif + } +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/GruCompute.cpp b/paddle/gserver/layers/GruCompute.cpp index 06907768e9..148516391c 100644 --- a/paddle/gserver/layers/GruCompute.cpp +++ b/paddle/gserver/layers/GruCompute.cpp @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "GruCompute.h" #include "hl_recurrent_apply.cuh" +#include "paddle/function/GruFunctor.h" #include "paddle/utils/Util.h" namespace paddle { @@ -25,13 +26,13 @@ void GruCompute::init(LayerConfig &config) { template <> void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) { - hl_cpu_gru_forward(hppl::forward::gru_resetOutput(), - hppl::forward::gru_finalOutput(), - value, - frameSize, - batchSize, - activeNode_, - activeGate_); + GruFunctor::compute(hppl::forward::gru_resetOutput(), + hppl::forward::gru_finalOutput(), + value, + frameSize, + batchSize, + activeNode_, + activeGate_); } template <> @@ -39,14 +40,15 @@ void GruCompute::backward<0>(hl_gru_value value, hl_gru_grad grad, int frameSize, int batchSize) { - hl_cpu_gru_backward(hppl::backward::gru_stateGrad(), - hppl::backward::gru_resetGrad(), - value, - grad, - frameSize, - batchSize, - activeNode_, - activeGate_); + GruGradFunctor::compute( + hppl::backward::gru_stateGrad(), + hppl::backward::gru_resetGrad(), + value, + grad, + frameSize, + batchSize, + activeNode_, + activeGate_); } } // namespace paddle diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index a61c7c40e9..34e31f1394 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -2,25 +2,8 @@ set -xe -COMPILER=gcc -USE_EIGEN=ON -if [ $COMPILER == clang ]; then - SUFFIX=_clang - C_COMPILER=clang - CXX_COMPILER=clang++ -else - SUFFIX=_gcc - C_COMPILER=gcc - CXX_COMPILER=g++ -fi -if [ $USE_EIGEN == ON ]; then - SUFFIX=${SUFFIX}_eigen -else - SUFFIX=${SUFFIX}_openblas -fi - -BUILD_ROOT=/paddle/build_android$SUFFIX -DEST_ROOT=/paddle/install$SUFFIX +BUILD_ROOT=/paddle/build_android +DEST_ROOT=/paddle/install rm -rf $BUILD_ROOT 2>/dev/null || true mkdir -p $BUILD_ROOT @@ -41,7 +24,7 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \ -DCMAKE_BUILD_TYPE=Release \ - -DUSE_EIGEN_FOR_BLAS=${USE_EIGEN} \ + -DUSE_EIGEN_FOR_BLAS=ON \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ -DWITH_STYLE_CHECK=OFF \ @@ -58,7 +41,7 @@ elif [ $ANDROID_ABI == "arm64-v8a" ]; then -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \ -DCMAKE_BUILD_TYPE=Release \ - -DUSE_EIGEN_FOR_BLAS=${USE_EIGEN} \ + -DUSE_EIGEN_FOR_BLAS=OFF \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ .. From 7939705384751b7fbbcf6d9c334363b8f7fbd763 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Fri, 1 Sep 2017 18:34:18 +0800 Subject: [PATCH 06/11] Add the missing return statement. --- paddle/math/MathFunctions.h | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index 9297ae78c2..e8ea6e37ac 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -106,6 +106,7 @@ T dotProduct(const int n, const T* x, const T* y) { for (int i = 0; i < n; i++) { result += x[i] * y[i]; } + return result; } template From 8e5f54320fceca8e031d070d6a6f406f271845fe Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 4 Sep 2017 11:43:52 +0800 Subject: [PATCH 07/11] Refine the toolchain file of Android to use clang as default compiler. 
--- cmake/cross_compiling/android.cmake | 73 +++++++++++++++++++------- cmake/external/warpctc.cmake | 1 - paddle/math/Matrix.cpp | 5 -- paddle/scripts/docker/build_android.sh | 9 ---- 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index 5e3e437a8d..84219cfa55 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -20,6 +20,7 @@ # The supported variables are listed belows: # # ANDROID_STANDALONE_TOOLCHAIN +# ANDROID_TOOLCHAIN # ANDROID_ABI # ANDROID_NATIVE_API_LEVEL # ANDROID_ARM_MODE @@ -57,6 +58,10 @@ IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL) ENDIF() ENDIF() +IF(NOT DEFINED ANDROID_TOOLCHAIN) + SET(ANDROID_TOOLCHAIN clang) +ENDIF() + IF(NOT DEFINED ANDROID_ABI) SET(ANDROID_ABI "armeabi-v7a") ENDIF() @@ -82,6 +87,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") "${CMAKE_VERSION}), when cross-compiling for Android.") IF(ANDROID_STANDALONE_TOOLCHAIN) + # Use standalone toolchain SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot") IF(NOT CMAKE_SYSTEM_VERSION) @@ -96,26 +102,44 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ENDIF() # Toolchain - SET(ANDROID_TOOLCHAIN "gcc") SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN}) - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi) - IF(ANDROID_ABI STREQUAL "armeabi") - SET(CMAKE_SYSTEM_PROCESSOR armv5te) - ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") - SET(CMAKE_SYSTEM_PROCESSOR armv7-a) - ENDIF() - ENDIF() - IF(ANDROID_ABI STREQUAL "arm64-v8a") - SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) - SET(CMAKE_SYSTEM_PROCESSOR aarch64) + ELSE(ANDROID_NDK) + # TODO: use android ndk + ENDIF() + + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi) + IF(ANDROID_ABI STREQUAL "armeabi") + SET(CMAKE_SYSTEM_PROCESSOR armv5te) + SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi) + ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") + SET(CMAKE_SYSTEM_PROCESSOR armv7-a) + SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi) ENDIF() - SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) + SET(CMAKE_SYSTEM_PROCESSOR aarch64) + SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android) + ELSE() + MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.") + ENDIF() + SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") + + IF(ANDROID_TOOLCHAIN STREQUAL clang) + SET(ANDROID_C_COMPILER_NAME clang) + SET(ANDROID_CXX_COMPILER_NAME clang++) + SET(CMAKE_C_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) + SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) + ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc) + SET(ANDROID_C_COMPILER_NAME gcc) + SET(ANDROID_CXX_COMPILER_NAME g++) + ELSE() + MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}") ENDIF() # C compiler IF(NOT CMAKE_C_COMPILER) - SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc") + SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}") ELSE() GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) ENDIF() @@ -125,7 +149,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") # CXX compiler IF(NOT CMAKE_CXX_COMPILER) - SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++") + SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}") 
ELSE() GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) ENDIF() @@ -137,7 +161,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) # Toolchain and ABI specific flags. - SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections -finline-limit=64") + SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections") SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections") IF(ANDROID_ABI STREQUAL "armeabi") @@ -145,8 +169,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") -march=armv5te -mtune=xscale -msoft-float) - ENDIF() - IF(ANDROID_ABI STREQUAL "armeabi-v7a") + ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv7-a -mfloat-abi=softfp) @@ -156,6 +179,8 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16) ENDIF() LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8) + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) ENDIF() IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") @@ -164,10 +189,18 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ELSE() LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb) ENDIF() + IF(ANDROID_TOOLCHAIN STREQUAL clang) + # Disable integrated-as for better compatibility. + LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as) + ENDIF() ENDIF() - IF(ANDROID_ABI STREQUAL "arm64-v8a") - LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) + IF(ANDROID_TOOLCHAIN STREQUAL clang) + # CMake automatically forwards all compiler flags to the linker, + # and clang doesn't like having -Wa flags being used for linking. + # To prevent CMake from doing this would require meddling with + # the CMAKE__COMPILE_OBJECT rules, which would get quite messy. 
+ LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments) ENDIF() STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 3cc652bed5..2d7daed9bc 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -41,7 +41,6 @@ IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "App ELSE() SET(USE_OMP ON) ENDIF() -SET(USE_OMP OFF FORCE) ExternalProject_Add( extern_warpctc diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 82d598d885..4a2132c8d1 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -2774,28 +2774,23 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { CHECK(!isTransposed()) << "Not supported"; size_t a_col, b_col, a_row, b_row; - // CBLAS_TRANSPOSE a_trans, b_trans; bool a_trans, b_trans; if (!a->isTransposed()) { a_col = a->getWidth(); a_row = a->getHeight(); - // a_trans = CblasNoTrans; a_trans = false; } else { a_col = a->getHeight(); a_row = a->getWidth(); - // a_trans = CblasTrans; a_trans = true; } if (!b->isTransposed()) { b_col = b->getWidth(); b_row = b->getHeight(); - // b_trans = CblasNoTrans; b_trans = false; } else { b_col = b->getHeight(); b_row = b->getWidth(); - // b_trans = CblasTrans; b_trans = true; } diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index 98f66fa6f1..512a37166c 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -9,20 +9,15 @@ rm -rf $BUILD_ROOT 2>/dev/null || true mkdir -p $BUILD_ROOT cd $BUILD_ROOT -THIRD_PARTY_PATH=/paddle/third_party_android$SUFFIX/$ANDROID_ABI - if [ $ANDROID_ABI == "armeabi-v7a" ]; then cmake -DCMAKE_SYSTEM_NAME=Android \ -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \ -DANDROID_ABI=$ANDROID_ABI \ -DANDROID_ARM_NEON=ON \ -DANDROID_ARM_MODE=ON \ - -DCMAKE_C_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang \ - -DCMAKE_CXX_COMPILER=$ANDROID_ARM_STANDALONE_TOOLCHAIN/bin/arm-linux-androideabi-clang++ \ -DHOST_C_COMPILER=/usr/bin/gcc \ -DHOST_CXX_COMPILER=/usr/bin/g++ \ -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ - -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \ -DCMAKE_BUILD_TYPE=Release \ -DUSE_EIGEN_FOR_BLAS=ON \ -DWITH_C_API=ON \ @@ -34,12 +29,9 @@ elif [ $ANDROID_ABI == "arm64-v8a" ]; then -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM64_STANDALONE_TOOLCHAIN \ -DANDROID_ABI=$ANDROID_ABI \ -DANDROID_ARM_MODE=ON \ - -DCMAKE_C_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang \ - -DCMAKE_CXX_COMPILER=$ANDROID_ARM64_STANDALONE_TOOLCHAIN/bin/aarch64-linux-android-clang++ \ -DHOST_C_COMPILER=/usr/bin/gcc \ -DHOST_CXX_COMPILER=/usr/bin/g++ \ -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ - -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \ -DCMAKE_BUILD_TYPE=Release \ -DUSE_EIGEN_FOR_BLAS=OFF \ -DWITH_C_API=ON \ @@ -53,7 +45,6 @@ elif [ $ANDROID_ABI == "armeabi" ]; then -DHOST_C_COMPILER=/usr/bin/gcc \ -DHOST_CXX_COMPILER=/usr/bin/g++ \ -DCMAKE_INSTALL_PREFIX=/paddle/install \ - -DTHIRD_PARTY_PATH=$THIRD_PARTY_PATH \ -DCMAKE_BUILD_TYPE=Release \ -DWITH_C_API=ON \ -DWITH_SWIG_PY=OFF \ From 387b4e4eac4d7cd7d4f5f18b52d74f6b8f9601bd Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 4 Sep 2017 11:50:52 +0800 Subject: [PATCH 08/11] Change the declaration of EigenGemm from `class` to `struct`. 
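EigenBlasGemm is defined with the struct keyword, but the explicit
instantiations repeated it as a class. gcc accepts the mismatched class-key
silently, while clang emits a -Wmismatched-tags warning, so the
instantiations now use the tag that matches the definition. The pattern in
miniature, with a hypothetical type name:

    template <typename T>
    struct Gemm {
      T scale;
    };

    // template class Gemm<float>;  // clang warns: -Wmismatched-tags
    template struct Gemm<float>;    // class-key matches the definition
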
---
 paddle/function/EigenGemm.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp
index 674141ed39..b3e666e860 100644
--- a/paddle/function/EigenGemm.cpp
+++ b/paddle/function/EigenGemm.cpp
@@ -83,9 +83,9 @@ struct EigenBlasGemm {
 };
 
 #ifdef PADDLE_TYPE_DOUBLE
-template class EigenBlasGemm<double>;
+template struct EigenBlasGemm<double>;
 #else
-template class EigenBlasGemm<float>;
+template struct EigenBlasGemm<float>;
 #endif
 
 }  // namespace paddle

From 9293dc48179ae34d182f420c4500967d02238636 Mon Sep 17 00:00:00 2001
From: Liu Yiqun
Date: Mon, 4 Sep 2017 12:29:32 +0800
Subject: [PATCH 09/11] Move the third_party_android from cache directories in travis.

---
 .travis.yml                            | 1 -
 paddle/scripts/travis/build_android.sh | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index b4b83fcdbc..cc2036df5a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,7 +4,6 @@ cache:
     - $HOME/.ccache
     - $HOME/.cache/pip
     - $TRAVIS_BUILD_DIR/build/third_party
-    - $TRAVIS_BUILD_DIR/build_android/third_party
 sudo: required
 dist: trusty
 os:
diff --git a/paddle/scripts/travis/build_android.sh b/paddle/scripts/travis/build_android.sh
index 004067a8f5..9da71d1e8c 100755
--- a/paddle/scripts/travis/build_android.sh
+++ b/paddle/scripts/travis/build_android.sh
@@ -22,6 +22,7 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
       -DANDROID_ABI=armeabi-v7a \
       -DANDROID_ARM_NEON=ON \
       -DANDROID_ARM_MODE=ON \
+      -DUSE_EIGEN_FOR_BLAS=ON \
       -DWITH_C_API=ON \
       -DWITH_SWIG_PY=OFF \
       -DWITH_STYLE_CHECK=OFF \

From a98c9e6bbf27dba8377d4f709bfc0aa2e71b8148 Mon Sep 17 00:00:00 2001
From: Liu Yiqun
Date: Mon, 4 Sep 2017 13:25:00 +0800
Subject: [PATCH 10/11] Add third_party_android back to the cache directories to speedup travis.

---
 .travis.yml                   | 1 +
 cmake/cblas.cmake             | 4 ----
 cmake/external/openblas.cmake | 4 ++++
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index cc2036df5a..14a39c58de 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,7 @@ cache:
     - $HOME/.ccache
     - $HOME/.cache/pip
     - $TRAVIS_BUILD_DIR/build/third_party
+    - $TRAVIS_BUILD_DIR/build/third_party_android
 sudo: required
 dist: trusty
 os:
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index ab111eccc0..854066fd1d 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -13,10 +13,6 @@
 #    system paths.
 #
 
-if(USE_EIGEN_FOR_BLAS)
-  return()
-endif(USE_EIGEN_FOR_BLAS)
-
 set(CBLAS_FOUND OFF)
 
 ## Find MKLML First.
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 0002a470d9..f9e05af59f 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+IF(USE_EIGEN_FOR_BLAS)
+    return()
+ENDIF(USE_EIGEN_FOR_BLAS)
+
 INCLUDE(cblas)
 
 IF(NOT ${CBLAS_FOUND})

From 3360e9cdb8151baa33c3e82840fae2d105085a46 Mon Sep 17 00:00:00 2001
From: Liu Yiqun
Date: Thu, 7 Sep 2017 11:06:32 +0800
Subject: [PATCH 11/11] Change the definition of vmlaq_laneq_f32 from template function to macro.
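A note on the motivation, which the subject line leaves implicit: on arm64-v8a, vmlaq_laneq_f32 is a genuine AArch64 NEON intrinsic that takes the lane as a trailing integer-constant argument, while 32-bit ARM lacks it, so neon_util.h emulates it. The old emulation was a function template taking the lane as a template argument, which forces the `vmlaq_laneq_f32<0>(...)` spelling at every call site and therefore cannot be source-compatible with the real intrinsic. The macro form keeps the lane a literal constant for vgetq_lane_f32 after textual substitution and lets identical call sites compile against either the emulation or the intrinsic. A short C++ sketch of the fallback this patch introduces:

// armv7 fallback; on arm64-v8a the real AArch64 intrinsic of the same name
// and argument order is used instead of this macro.
#define vmlaq_laneq_f32(a, b, v, lane) \
  vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane))

// Call sites now read identically on both targets, e.g.:
//   tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);  // tmp1 += input * k[0][0]
// whereas the template version required vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]).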
---
 .travis.yml                                |   2 +-
 Dockerfile.android                         |   4 +-
 paddle/function/GruFunctor.h               |   1 -
 paddle/function/neon/NeonDepthwiseConv.cpp | 100 ++++++++++-----
 paddle/function/neon/neon_util.h           |   8 +-
 paddle/scripts/docker/build_android.sh     |   6 +-
 6 files changed, 59 insertions(+), 62 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 14a39c58de..b4b83fcdbc 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,7 +4,7 @@ cache:
     - $HOME/.ccache
     - $HOME/.cache/pip
     - $TRAVIS_BUILD_DIR/build/third_party
-    - $TRAVIS_BUILD_DIR/build/third_party_android
+    - $TRAVIS_BUILD_DIR/build_android/third_party
 sudo: required
 dist: trusty
 os:
diff --git a/Dockerfile.android b/Dockerfile.android
index 6013215d9d..452aa15745 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -11,8 +11,8 @@ ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
 
 ENV HOME=/root \
     ANDROID_NDK_HOME=/opt/android-ndk-linux \
-    ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain-gcc \
-    ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain-gcc
+    ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \
+    ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain
 
 RUN apt-get update && \
     apt-get install -y \
diff --git a/paddle/function/GruFunctor.h b/paddle/function/GruFunctor.h
index 11f6174dbd..9f6392198e 100644
--- a/paddle/function/GruFunctor.h
+++ b/paddle/function/GruFunctor.h
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 
 #include "GemmFunctor.h"
-#include "GruFunctor.h"
 #include "hl_cpu_gru.cuh"
 
 namespace paddle {
diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp
index 14e5198e1b..f09e98587d 100644
--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
@@ -116,15 +116,15 @@ struct DepthwiseConvKernel<3, 1> {
       float32x4_t tmp1 = vdupq_n_f32(0.f);
       float32x4_t tmp2 = vdupq_n_f32(0.f);
 
-      tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]);
-      tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]);
-      tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]);
-      tmp2 = vmlaq_laneq_f32<0>(tmp2, input[1][0], k[1]);
-      tmp1 = vmlaq_laneq_f32<1>(tmp1, input[1][1], k[1]);
-      tmp2 = vmlaq_laneq_f32<2>(tmp2, input[1][2], k[1]);
-      tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]);
-      tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]);
-      tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
 
       tmp1 = vaddq_f32(tmp1, tmp2);
       vst1q_f32(outputData, tmp1);
@@ -223,15 +223,15 @@ struct DepthwiseConvKernel<3, 2> {
       float32x4_t tmp1 = vdupq_n_f32(0.f);
       float32x4_t tmp2 = vdupq_n_f32(0.f);
 
-      tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]);
-      tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]);
-      tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]);
-      tmp2 = vmlaq_laneq_f32<0>(tmp2, input[1][0], k[1]);
-      tmp1 = vmlaq_laneq_f32<1>(tmp1, input[1][1], k[1]);
-      tmp2 = vmlaq_laneq_f32<2>(tmp2, input[1][2], k[1]);
-      tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]);
-      tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]);
-      tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
 
       tmp1 = vaddq_f32(tmp1, tmp2);
       vst1q_f32(outputData, tmp1);
@@ -316,22 +316,22 @@ struct DepthwiseConvKernel<4, 1> {
       float32x4_t tmp1 = vdupq_n_f32(0.f);
       float32x4_t tmp2 = vdupq_n_f32(0.f);
 
-      tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]);
-      tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]);
-      tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]);
-      tmp2 = vmlaq_laneq_f32<3>(tmp2, input[0][3], k[0]);
-      tmp1 = vmlaq_laneq_f32<0>(tmp1, input[1][0], k[1]);
-      tmp2 = vmlaq_laneq_f32<1>(tmp2, input[1][1], k[1]);
-      tmp1 = vmlaq_laneq_f32<2>(tmp1, input[1][2], k[1]);
-      tmp2 = vmlaq_laneq_f32<3>(tmp2, input[1][3], k[1]);
-      tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]);
-      tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]);
-      tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]);
-      tmp2 = vmlaq_laneq_f32<3>(tmp2, input[2][3], k[2]);
-      tmp1 = vmlaq_laneq_f32<0>(tmp1, input[3][0], k[3]);
-      tmp2 = vmlaq_laneq_f32<1>(tmp2, input[3][1], k[3]);
-      tmp1 = vmlaq_laneq_f32<2>(tmp1, input[3][2], k[3]);
-      tmp2 = vmlaq_laneq_f32<3>(tmp2, input[3][3], k[3]);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
 
       tmp1 = vaddq_f32(tmp1, tmp2);
       vst1q_f32(outputData, tmp1);
@@ -431,22 +431,22 @@ struct DepthwiseConvKernel<4, 2> {
      float32x4_t tmp1 = vdupq_n_f32(0.f);
       float32x4_t tmp2 = vdupq_n_f32(0.f);
 
-      tmp1 = vmlaq_laneq_f32<0>(tmp1, input[0][0], k[0]);
-      tmp2 = vmlaq_laneq_f32<1>(tmp2, input[0][1], k[0]);
-      tmp1 = vmlaq_laneq_f32<2>(tmp1, input[0][2], k[0]);
-      tmp2 = vmlaq_laneq_f32<3>(tmp2, input[0][3], k[0]);
-      tmp1 = vmlaq_laneq_f32<0>(tmp1, input[1][0], k[1]);
-      tmp2 = vmlaq_laneq_f32<1>(tmp2, input[1][1], k[1]);
-      tmp1 = vmlaq_laneq_f32<2>(tmp1, input[1][2], k[1]);
-      tmp2 = vmlaq_laneq_f32<3>(tmp2, input[1][3], k[1]);
-      tmp1 = vmlaq_laneq_f32<0>(tmp1, input[2][0], k[2]);
-      tmp2 = vmlaq_laneq_f32<1>(tmp2, input[2][1], k[2]);
-      tmp1 = vmlaq_laneq_f32<2>(tmp1, input[2][2], k[2]);
-      tmp2 = vmlaq_laneq_f32<3>(tmp2, input[2][3], k[2]);
-      tmp1 = vmlaq_laneq_f32<0>(tmp1, input[3][0], k[3]);
-      tmp2 = vmlaq_laneq_f32<1>(tmp2, input[3][1], k[3]);
-      tmp1 = vmlaq_laneq_f32<2>(tmp1, input[3][2], k[3]);
-      tmp2 = vmlaq_laneq_f32<3>(tmp2, input[3][3], k[3]);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
+      tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
+      tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
 
       tmp1 = vaddq_f32(tmp1, tmp2);
       vst1q_f32(outputData, tmp1);
diff --git a/paddle/function/neon/neon_util.h b/paddle/function/neon/neon_util.h
index dbe017170b..e2db045067 100644
--- a/paddle/function/neon/neon_util.h
+++ b/paddle/function/neon/neon_util.h
@@ -33,12 +33,8 @@ inline float32_t vaddvq_f32(float32x4_t a) {
   return vget_lane_f32(vpadd_f32(v, v), 0);
 }
 
-template <int lane>
-inline float32x4_t vmlaq_laneq_f32(float32x4_t a,
-                                   float32x4_t b,
-                                   float32x4_t v) {
-  return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));
-}
+#define vmlaq_laneq_f32(a, b, v, lane) \
+  vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane))
 
 #endif
 
 }  // namespace neon
diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh
index 512a37166c..aabd2da5e4 100644
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
@@ -36,6 +36,7 @@ elif [ $ANDROID_ABI == "arm64-v8a" ]; then
         -DUSE_EIGEN_FOR_BLAS=OFF \
         -DWITH_C_API=ON \
         -DWITH_SWIG_PY=OFF \
+        -DWITH_STYLE_CHECK=OFF \
         ..
 elif [ $ANDROID_ABI == "armeabi" ]; then
   cmake -DCMAKE_SYSTEM_NAME=Android \
@@ -48,10 +49,11 @@ elif [ $ANDROID_ABI == "armeabi" ]; then
         -DCMAKE_BUILD_TYPE=Release \
         -DWITH_C_API=ON \
         -DWITH_SWIG_PY=OFF \
+        -DWITH_STYLE_CHECK=OFF \
         ..
 else
   echo "Invalid ANDROID_ABI: $ANDROID_ABI"
 fi
 
-make VERBOSE=1 -j2
-make install -j2
+make -j `nproc`
+make install -j `nproc`
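For readers who want to try the primitive outside the full tree: a self-contained C++ sketch (hypothetical file, not part of this series) that exercises the same lane-wise multiply-accumulate, using the macro fallback on 32-bit ARM and the native intrinsic on AArch64. The file name and test values are invented for illustration:

// lane_mla_demo.cpp -- hypothetical standalone check, not from the patches.
// Cross-compile with a standalone-toolchain clang++ for ARM or AArch64.
#include <arm_neon.h>
#include <cstdio>

#ifndef __aarch64__
// Same shape as the neon_util.h fallback: 32-bit ARM has no vmlaq_laneq_f32,
// so emulate it; `lane` must still be a literal constant at each call site.
#define vmlaq_laneq_f32(a, b, v, lane) \
  vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane))
#endif

int main() {
  const float in[4] = {1.f, 2.f, 3.f, 4.f};
  const float kw[4] = {0.5f, 0.25f, 0.125f, 0.f};
  float32x4_t input = vld1q_f32(in);
  float32x4_t k = vld1q_f32(kw);

  // acc[i] += input[i] * k[1] -- the step the conv kernels above chain 9 or
  // 16 times per output vector before the final vaddq_f32/vst1q_f32.
  float32x4_t acc = vdupq_n_f32(0.f);
  acc = vmlaq_laneq_f32(acc, input, k, 1);

  float out[4];
  vst1q_f32(out, acc);
  printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);  // 0.25 0.5 0.75 1.0
  return 0;
}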