From dd7a79158b17f3613ff66b9c4db7691074fb6218 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Tue, 21 Aug 2018 19:54:10 +0800 Subject: [PATCH 01/21] add scope info in graphviz debug --- .../fluid/framework/details/multi_devices_graph_print_pass.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc index 69944a42b6..361c91dc78 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc @@ -54,7 +54,8 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_ << "\\n" << var_handle_ptr->place_ << "\\n" - << var_handle_ptr->version_ << "\"]" << std::endl; + << "scope: " << var_handle_ptr->scope_idx_ << "\\n" + << "v" << var_handle_ptr->version_ << "\"]" << std::endl; } else if (dummy_ptr) { sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl; } From f72ab8961e443c030a50f373f21eceac3800f528 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 11:43:20 +0800 Subject: [PATCH 02/21] refine blas gemm --- CMakeLists.txt | 6 -- paddle/fluid/operators/math/blas.h | 9 ++ paddle/fluid/operators/math/blas_impl.h | 116 +++++++++++------------ paddle/fluid/operators/math/fc_compute.h | 22 +++-- 4 files changed, 77 insertions(+), 76 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 48e52961a9..317f7f9eb4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,12 +138,6 @@ else() set(THIRD_PARTY_BUILD_TYPE Release) endif() -if(WITH_MKL) - option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF) - if (MKL_SPLIT_GEMM) - add_definitions(-DPADDLE_MKL_SPLIT_GEMM) - endif() -endif() set(WITH_MKLML ${WITH_MKL}) if (NOT DEFINED WITH_MKLDNN) if (WITH_MKL AND AVX2_FOUND) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 8dcf7c99f3..295431347a 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -90,6 +90,11 @@ class Blas { void GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T* A, int lda, const T* B, int ldb, T beta, T* C, int ldc) const; + template + void GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T* A, int lda, const T* B, int ldb, T beta, T* C, + int ldc) const; + #ifdef PADDLE_WITH_MKLML template T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, const int M, const int N, @@ -109,6 +114,10 @@ class Blas { void GEMM_FREE(T* data) const; #endif + template + void MatMul(const int M, const int N, const int K, const T* A, const T* B, + T* C) const; + template void MatMul(const framework::Tensor& mat_a, bool trans_a, const framework::Tensor& mat_b, bool trans_b, T alpha, diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index dc77b6d793..d39a3e7f6e 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -217,64 +217,6 @@ struct CBlas { #endif }; -template -inline bool UseXSMM(const int &m, const int &n, const int &k, bool transa, - bool transb, const T &alpha, const T &beta) { -#ifdef PADDLE_WITH_LIBXSMM - // Refer to https://github.com/hfp/libxsmm/blob/master/README.md - // But the threshold is custom - constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; - if (m * n * k > LIBXSMM_THRESHOLD || transa || transb || - std::abs(alpha 
- static_cast(1) > - std::numeric_limits::epsilon()) || - std::abs(beta) > std::numeric_limits::epsilon()) { - return false; - } else { - return true; - } -#endif - return false; -} - -template <> -inline bool UseXSMM(const int &m, const int &n, const int &k, - bool transa, bool transb, - const platform::float16 &alpha, - const platform::float16 &beta) { - return false; -} - -template -inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, - const T *A, int lda, const T *B, int ldb, T beta, T *C, - int ldc) { -#ifdef PADDLE_WITH_LIBXSMM - if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha, - beta)) { - // Note: SMM use ColMajor - const char transa = 'N'; - const char transb = 'N'; - CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda, - &beta, C, &ldc); - return; - } -#endif - -#ifdef PADDLE_MKL_SPLIT_GEMM - constexpr int bs = 2; - if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) { - for (int off = 0; off < M; off += bs) { - CBlas::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, bs, N, K, alpha, - A + off * lda, lda, B, ldb, beta, C + off * ldb, ldc); - } - return; - } -#endif - CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); -} - #ifdef PADDLE_WITH_MKLML template <> template @@ -319,8 +261,8 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; - GEMM_WARP(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); + CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); } template <> @@ -329,9 +271,20 @@ void Blas::GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T *A, int lda, const T *B, int ldb, T beta, T *C, int ldc) const { - GEMM_WARP(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, - transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, - lda, B, ldb, beta, C, ldc); + CBlas::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, + transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); +} + +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, + int N, int K, T alpha, const T *A, + int lda, const T *B, int ldb, + T beta, T *C, int ldc) const { + CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); } template @@ -440,6 +393,43 @@ void Blas::BatchedGEMM( #endif } +template +template +void Blas::MatMul(const int M, const int N, const int K, + const T *A, const T *B, T *C) const { + this->template GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, + static_cast(1), A, K, B, N, static_cast(0), C, + N); +} + +template <> +template +void Blas::MatMul(const int M, const int N, + const int K, const T *A, + const T *B, T *C) const { +#ifdef PADDLE_WITH_LIBXSMM + // Refer to https://github.com/hfp/libxsmm/blob/master/README.md + // But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; + + // Since the matrix is very small, + // so the unit of calculation is already very fast, + // and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead, + // use xsmm directly. 
+ // Note: SMM use ColMajor + const char transa = 'N'; + const char transb = 'N'; + const T alpha = static_cast(1); + const T beta = static_cast(0); + CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, + C, &N); + return; + +#endif + + CBlas::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, + static_cast(1), A, K, B, N, static_cast(0), C, N); +} + template template void Blas::MatMul(const framework::Tensor &mat_a, diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index 8600fa9e2c..1f5a49c0ab 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -25,17 +25,25 @@ namespace math { template inline void FCCompute(const BlasT& blas, const int M, const int N, const int K, const T* X, const T* W, T* Y, - const T* B = NULL) { - blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast(1), X, W, - static_cast(0), Y); - if (B) { + const T* B = NULL, bool relu = false) { + blas.MatMul(M, N, K, X, W, Y); + if (B == NULL) { + return; + } + #ifdef PADDLE_WITH_MKLML #pragma omp parallel for if (FLAGS_paddle_num_threads > 1) #endif - for (int i = 0; i < M; i++) { - blas.AXPY(N, static_cast(1), B, Y + i * N); - } + for (int i = 0; i < M; i++) { + blas.AXPY(N, static_cast(1), B, Y + i * N); } + + if (!relu) { + return; + } + + // TODO(TJ): fuse relu + LOG(FATAL) << "Not implemented!"; } } // namespace math From a2203d0466462fcde20bdd80d79a0f7964760eb8 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 12:08:31 +0800 Subject: [PATCH 03/21] add cblas dot --- paddle/fluid/operators/math/blas.h | 3 +++ paddle/fluid/operators/math/blas_impl.h | 27 ++++++++++++++++++++++++- paddle/fluid/platform/dynload/mklml.h | 2 ++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 295431347a..96d481f739 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -153,6 +153,9 @@ class Blas { void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta, T* C) const; + template + T DOT(int n, const T* x, const T* y) const; + template void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, const T* A, const T* B, T beta, T* C, diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index d39a3e7f6e..bbd9d4b60a 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -73,6 +73,11 @@ struct CBlas { platform::dynload::cblas_sgemv(args...); } + template + static float DOT(ARGS... args) { + return platform::dynload::cblas_sdot(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_sgemm_batch(args...); @@ -138,6 +143,11 @@ struct CBlas { platform::dynload::cblas_dgemv(args...); } + template + static double DOT(ARGS... args) { + return platform::dynload::cblas_ddot(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_dgemm_batch(args...); @@ -210,6 +220,7 @@ struct CBlas { PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); } static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } + static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) 
{ PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); @@ -352,6 +363,21 @@ void Blas::VMUL(int n, const T *x, const T *y, #endif } +template <> +template +T Blas::DOT(int n, const T *x, const T *y) const { +#ifdef PADDLE_WITH_MKLML + return CBlas::DOT(n, x, y); +#else + // try to find if openblas support cblas_dot + T sum = 0; + for (int i = 0; i < n; ++i) { + sum += x[i] * y[i]; + } + return sum; +#endif +} + template <> template void Blas::GEMV(bool trans_a, int M, int N, T alpha, @@ -423,7 +449,6 @@ void Blas::MatMul(const int M, const int N, CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, C, &N); return; - #endif CBlas::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index 15ad4a3b40..6efa160df0 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -66,6 +66,8 @@ extern void* mklml_dso_handle; __macro(cblas_dgemm_free); \ __macro(cblas_sgemm_batch); \ __macro(cblas_dgemm_batch); \ + __macro(cblas_sdot); \ + __macro(cblas_ddot); \ __macro(vsAdd); \ __macro(vdAdd); \ __macro(vsMul); \ From 0ec1f65cf110ee4e73a7bfa03456b52111426288 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 12:47:10 +0800 Subject: [PATCH 04/21] fix blas dot and add cblas scal --- paddle/fluid/operators/math/blas.h | 3 +++ paddle/fluid/operators/math/blas_impl.h | 27 ++++++++++++++++++++++++- paddle/fluid/platform/dynload/mklml.h | 2 ++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 96d481f739..fc02534a69 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -156,6 +156,9 @@ class Blas { template T DOT(int n, const T* x, const T* y) const; + template + void SCAL(int n, const T a, const T* x) const; + template void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, const T* A, const T* B, T beta, T* C, diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index bbd9d4b60a..b7c56e8df1 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -78,6 +78,11 @@ struct CBlas { return platform::dynload::cblas_sdot(args...); } + template + static void SCAL(ARGS... args) { + platform::dynload::cblas_sscal(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_sgemm_batch(args...); @@ -148,6 +153,11 @@ struct CBlas { return platform::dynload::cblas_ddot(args...); } + template + static void SCAL(ARGS... args) { + platform::dynload::cblas_dscal(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_dgemm_batch(args...); @@ -221,6 +231,7 @@ struct CBlas { } static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; + static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) 
{ PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); @@ -367,7 +378,7 @@ template <> template T Blas::DOT(int n, const T *x, const T *y) const { #ifdef PADDLE_WITH_MKLML - return CBlas::DOT(n, x, y); + return CBlas::DOT(n, x, 1, y, 1); #else // try to find if openblas support cblas_dot T sum = 0; @@ -378,6 +389,20 @@ T Blas::DOT(int n, const T *x, const T *y) const { #endif } +template <> +template +void Blas::SCAL(int n, const T a, + const T *x) const { +#ifdef PADDLE_WITH_MKLML + CBlas::SCAL(n, a, x, 1); +#else + // try to find if openblas support cblas_scal + for (int i = 0; i < n; ++i) { + x[i] = a * x[i]; + } +#endif +} + template <> template void Blas::GEMV(bool trans_a, int M, int N, T alpha, diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index 6efa160df0..e50ea6740a 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -68,6 +68,8 @@ extern void* mklml_dso_handle; __macro(cblas_dgemm_batch); \ __macro(cblas_sdot); \ __macro(cblas_ddot); \ + __macro(cblas_sscal); \ + __macro(cblas_dscal); \ __macro(vsAdd); \ __macro(vdAdd); \ __macro(vsMul); \ From 3dd66390b2702fe3083fee5e84f2ad6d5322b76b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 13:13:58 +0800 Subject: [PATCH 05/21] add blas vexp --- paddle/fluid/operators/math/blas.h | 3 +++ paddle/fluid/operators/math/blas_impl.h | 24 ++++++++++++++++++++++++ paddle/fluid/platform/dynload/mklml.h | 2 ++ 3 files changed, 29 insertions(+) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index fc02534a69..5aba170221 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -149,6 +149,9 @@ class Blas { template void VCOPY(int n, const T* x, T* y) const; + template + void VEXP(int n, const T* x, T* y) const; + template void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta, T* C) const; diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index b7c56e8df1..eaad83ba18 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -97,6 +97,11 @@ struct CBlas { static void VMUL(ARGS... args) { platform::dynload::vsMul(args...); } + + template + static void VEXP(ARGS... args) { + platform::dynload::vsExp(args...); + } }; template <> @@ -172,6 +177,11 @@ struct CBlas { static void VMUL(ARGS... args) { platform::dynload::vdMul(args...); } + + template + static void VEXP(ARGS... args) { + platform::dynload::vdExp(args...); + } }; #else @@ -230,6 +240,7 @@ struct CBlas { PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); } static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } + static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; static void SCAL(...) 
{ PADDLE_THROW("float16 SCAL not supported on CPU"); }; #ifdef PADDLE_WITH_MKLML @@ -374,6 +385,19 @@ void Blas::VMUL(int n, const T *x, const T *y, #endif } +template <> +template +void Blas::VEXP(int n, const T *x, T *y) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VEXP(n, x, y); +#else + // try to find if openblas support vexp + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +#endif +} + template <> template T Blas::DOT(int n, const T *x, const T *y) const { diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index e50ea6740a..aa20553cef 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -74,6 +74,8 @@ extern void* mklml_dso_handle; __macro(vdAdd); \ __macro(vsMul); \ __macro(vdMul); \ + __macro(vsExp); \ + __macro(vdExp); \ __macro(MKL_Set_Num_Threads) MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); From 9affc36c89c2df4e26d00b1a081db0eabfd8e4fe Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 20 Aug 2018 21:15:39 +0800 Subject: [PATCH 06/21] init attention lstm --- paddle/fluid/operators/attention_lstm_op.cc | 354 ++++++++++++++++++++ paddle/fluid/operators/attention_lstm_op.h | 42 +++ 2 files changed, 396 insertions(+) create mode 100644 paddle/fluid/operators/attention_lstm_op.cc create mode 100644 paddle/fluid/operators/attention_lstm_op.h diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc new file mode 100644 index 0000000000..087df06ad5 --- /dev/null +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -0,0 +1,354 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/attention_lstm_op.h" +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { + +void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightX"), + "Input(WeightX) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightH"), + "Input(WeightH) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("XX"), + "Output(XX) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(Hidden) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Output(Cell) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"), + "Output(BatchedGate) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), + "Output(BatchedGate) of LSTM should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(Cell) and Input(Hidden) of LSTM should not " + "be null at the same time."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + + auto wx_dims = ctx->GetInputDim("WeightX"); + PADDLE_ENFORCE_EQ(wx_dims.size(), 2, + "The rank of Input(WeightX) should be 2."); + PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], + "The first dimension of Input(WeightX) " + "should be %d.", + x_dims[1]); + + int frame_size = wx_dims[1] / 4; + auto wh_dims = ctx->GetInputDim("WeightH"); + PADDLE_ENFORCE_EQ(wh_dims.size(), 2, + "The rank of Input(WeightH) should be 2."); + PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, + "The first dimension of Input(WeightH) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size, + "The second dimension of Input(WeightH) " + "should be 4 * %d.", + frame_size); + + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + + PADDLE_ENFORCE(!ctx->Attrs().Get("use_peepholes"), + "Do not support peephole yet."); + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable peepholes connection", + frame_size); + + framework::DDim out_dims({x_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]}); + ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->ShareLoD("X", "Hidden"); + ctx->ShareLoD("X", "Cell"); + + int xx_width = x_dims[1] > wx_dims[1] ? 
wx_dims[1] : x_dims[1]; + ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->ShareLoD("X", "XX"); +} + +framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); +} + +void FusionLSTMOpMaker::Make() { + AddInput("X", + "(LoDTensor) the input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + AddInput("WeightX", + "(Tensor) the learnable weights of X." + " - The shape is (M x 4D), where M is the dim size of x, D is the " + "hidden size. " + " - Weight = {W_cx, W_ix, W_fx, W_ox}"); + AddInput("WeightH", + "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput("Bias", + "(Tensor) the learnable weights. Almost same as LSTMOp" + "Note: we should add the fc bias into this (1x4D) in bias." + "input-hidden bias weight and peephole connections weight if " + "setting `use_peepholes` True. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddInput("H0", + "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `H0` and `C0` can be NULL but only at the same time.") + .AsDispensable(); + AddOutput("Hidden", + "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. 
" + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("XX", + "(LoDTensor) the result after X * WeightX (size is T x 4D)" + " or batched_X (size is T x M), this will be automatically chosen," + " where T is the total time steps in this mini-batch," + " D is the hidden size, M is the dim size of x input.") + .AsIntermediate(); + AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate(); + AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).") + .AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTM.") + .SetDefault(false); + AddAttr("gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Fusion Long-Short Term Memory (LSTM) Operator. +This operator fuse the X into LSTM, more details can refer to LSTM op. +)DOC"); +} + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index_lod, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + // TODO(TJ): check mem copy perf + row_shuffle(ctx, src, index_lod, dst, indexed_src); +} + +template +class FuisonLSTMKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* wx = ctx.Input("WeightX"); + auto* wh = ctx.Input("WeightH"); + auto* bias = ctx.Input("Bias"); + auto* hidden_t0 = ctx.Input("H0"); + auto* cell_t0 = ctx.Input("C0"); + + auto* xx = ctx.Output("XX"); + auto* batched_gate = ctx.Output("BatchedGate"); + auto* hidden_out = ctx.Output("Hidden"); + auto* cell_out = ctx.Output("Cell"); + bool is_reverse = ctx.Attr("is_reverse"); + + T* xx_data = xx->mutable_data(ctx.GetPlace()); + T* batched_gate_data = batched_gate->mutable_data(ctx.GetPlace()); + hidden_out->mutable_data(ctx.GetPlace()); + cell_out->mutable_data(ctx.GetPlace()); + + const T* x_data = x->data(); + const T* wx_data = wx->data(); + auto x_dims = x->dims(); + auto wx_dims = wx->dims(); + + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + if (x_dims[1] > wx_dims[1]) { + math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], + x_data, wx_data, xx_data, + bias->data()); + to_batch(dev_ctx, *xx, batched_gate, true, is_reverse); + } else { + to_batch(dev_ctx, *x, xx, true, is_reverse); + batched_gate->set_lod(xx->lod()); + math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], + xx_data, wx_data, batched_gate_data, + bias->data()); + } + + int frame_size = static_cast(wx_dims[1] / 4); + framework::DDim out_dims({x_dims[0], frame_size}); + math::LstmMetaValue lstm_value; + // no peephole + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + 
lstm_value.check_og = nullptr; + lstm_value.prev_state_value = nullptr; + Tensor ordered_c0; + + framework::Vector order(batched_gate->lod()[2]); + + if (cell_t0) { + // Since the batch computing for LSTM reorders the input sequence + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState(dev_ctx, *cell_t0, order, &ordered_c0, + true); + lstm_value.prev_state_value = ordered_c0.data(); + } + + // Use the local variable as here. + LoDTensor batch_hidden, batch_cell; + auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); + batch_hidden.mutable_data(out_dims, ctx.GetPlace()); + batch_cell.mutable_data(out_dims, ctx.GetPlace()); + batch_cell_pre_act->mutable_data(out_dims, ctx.GetPlace()); + + auto batch_starts = batched_gate->lod()[0]; + size_t max_seq_len = batch_starts.size() - 1; + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + + for (size_t n = 0; n < max_seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate_t = batched_gate->Slice(bstart, bend); + Tensor out_t = batch_hidden.Slice(bstart, bend); + Tensor cell_t = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + + int cur_batch_size = bend - bstart; + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); + // TODO(TJ): use gemm directly + blas.MatMul(pre_hidden_t, false, *wh, false, static_cast(1.0), + &gate_t, static_cast(1.0)); + } else if (hidden_t0) { + // TODO(TJ): move h0 outside for + // If n == 0 and there is no initialized hidden state, that is to say + // the H0 is zeros, the calculation W_h * H0 will be skiped. + // If n == 0 and there is initialized hidden state, calculate W_h * H0. + + // Since the batch computing for LSTM reorders the input sequence + // according to their length. The initialized hidden state also needs + // to reorder. 
+ Tensor ordered_h0; + ReorderInitState(dev_ctx, *hidden_t0, order, + &ordered_h0, true); + // TODO(TJ): use gemm directly + blas.MatMul(ordered_h0, false, *wh, false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } + + lstm_value.gate_value = gate_t.data(); + lstm_value.output_value = out_t.data(); + lstm_value.state_value = cell_t.data(); + lstm_value.state_active_value = cell_pre_act_t.data(); + math::LstmUnitFunctor::compute( + dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act, + cand_act); + lstm_value.prev_state_value = lstm_value.state_value; + } + + math::Batch2LoDTensorFunctor to_seq; + batch_hidden.set_lod(batched_gate->lod()); + // restore the output hidden in LoDTensor from the batch hidden + to_seq(dev_ctx, batch_hidden, hidden_out); + + batch_cell.set_lod(batched_gate->lod()); + // restore the output cell state in LoDTensor from the batch cell + to_seq(dev_ctx, batch_cell, cell_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL( + fusion_lstm, + ops::FuisonLSTMKernel, + ops::FuisonLSTMKernel); diff --git a/paddle/fluid/operators/attention_lstm_op.h b/paddle/fluid/operators/attention_lstm_op.h new file mode 100644 index 0000000000..39dc09b4d1 --- /dev/null +++ b/paddle/fluid/operators/attention_lstm_op.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +// #include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusionLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusionLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle From 508548f897028bb93847f33705a30c4765fe0181 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 00:17:23 +0800 Subject: [PATCH 07/21] implement attention lstm cpu forward --- paddle/fluid/operators/attention_lstm_op.cc | 466 ++++++++++++-------- paddle/fluid/operators/attention_lstm_op.h | 5 +- paddle/fluid/operators/fusion_lstm_op.h | 1 - 3 files changed, 278 insertions(+), 194 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 087df06ad5..178a1c19a9 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -20,10 +20,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/operators/math/cpu_vec.h" + namespace paddle { namespace operators { -void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { +void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); PADDLE_ENFORCE(ctx->HasInput("WeightX"), "Input(WeightX) of LSTM should not be null."); @@ -57,6 +59,9 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "should be the same."); } + // fc_out , shape (maxseqlen,1) + int max_seq_len = 0; + auto wx_dims = ctx->GetInputDim("WeightX"); PADDLE_ENFORCE_EQ(wx_dims.size(), 2, "The rank of Input(WeightX) should be 2."); @@ -103,241 +108,321 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ctx->ShareLoD("X", "XX"); } -framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( +framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.device_context()); } -void FusionLSTMOpMaker::Make() { +void AttentionLSTMOpMaker::Make() { AddInput("X", "(LoDTensor) the input is a LodTensor, which support " "variable-time length input sequence. The underlying tensor in " "this LoDTensor is a matrix with shape (T X M), where T is the " "total time steps in this mini-batch, M is the dim size of x."); - AddInput("WeightX", - "(Tensor) the learnable weights of X." - " - The shape is (M x 4D), where M is the dim size of x, D is the " - "hidden size. " - " - Weight = {W_cx, W_ix, W_fx, W_ox}"); - AddInput("WeightH", - "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." - " - The shape is (D x 4D), where D is the hidden size. " - " - Weight = {W_ch, W_ih, W_fh, W_oh}"); - AddInput("Bias", - "(Tensor) the learnable weights. Almost same as LSTMOp" - "Note: we should add the fc bias into this (1x4D) in bias." - "input-hidden bias weight and peephole connections weight if " - "setting `use_peepholes` True. " - "1. `use_peepholes = False` " - " - The shape is (1 x 4D). " - " - Bias = {b_c, b_i, b_f, b_o}." - "2. `use_peepholes = True` " - " - The shape is (1 x 7D). " - " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddInput("C0", + "(Tensor) LSTM C0" + "This is a tensor with shape (N x D), where N is the batch size, D " + "is the gate size." + "C0 is necessary because of attention."); AddInput("H0", - "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " - "optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size and D is the hidden size.") + "(Tensor, optional) LSTM H0" + "This is a tensor with shape (N x D), where N is the " + "batch size and D is the gate size.") .AsDispensable(); - AddInput("C0", - "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " - "optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size. `H0` and `C0` can be NULL but only at the same time.") + AddInput("AttentionWeight", + "(Tensor) the weights of attention fc. Always relu the fc result." + "The shape is ((M+D) x 1), where M is the dim size of x, D is the " + "gate size of LSTM."); + AddInput("AttentionBias, optional", + "(Tensor) the bias of attention fc." 
+ "The shape is (1 x 1)") + .AsDispensable(); + AddInput("AttentionScalar", + "(Tensor, optional) the scalar on the result of attentioned fc. " + "Always relu the Scalar." + "The shape is (1 x 1)") + .AsDispensable(); + AddInput("AttentionScalarBias", + "(Tensor, optional) the scalar bias of attention fc." + "The shape is (1 x 1)") .AsDispensable(); + AddInput("LSTMWeight", + "(Tensor) the combined weight of LSTM" + " - The shape is ((D+M) x 4D), where D is the hidden gate size, M " + "is the dim size of x" + " - Weight = {W_forget, W_input, W_output, W_cell}"); + AddInput("LSTMBias", + "(Tensor) the combined bias of LSTM, shape (1x4D)." + "Note: we should add the bias of hidden and context accorindg to " + "the same gate: " + "{B_forget, B_input, B_output, B_cell}"); AddOutput("Hidden", "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " "The shape is (T x D), and lod is the same with the `Input`."); AddOutput("Cell", "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. " "The shape is (T x D), and lod is the same with the `Input`."); - AddOutput("XX", - "(LoDTensor) the result after X * WeightX (size is T x 4D)" - " or batched_X (size is T x M), this will be automatically chosen," - " where T is the total time steps in this mini-batch," - " D is the hidden size, M is the dim size of x input.") + AddOutput( + "AttentionedX", + "(LodTensor) shape is (T x 1), the result after X * AttentionWeight," + " where T is the total time steps in this mini-batch," + " D is the hidden size.") .AsIntermediate(); - AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate(); - AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).") + AddOutput("AttentionFCOut", + "(Tensor) (max_seq_len, 1), compute at each step.") .AsIntermediate(); - AddAttr("use_peepholes", - "(bool, defalut: True) " - "whether to enable diagonal/peephole connections.") - .SetDefault(true); - AddAttr("is_reverse", - "(bool, defalut: False) " - "whether to compute reversed LSTM.") - .SetDefault(false); + AddOutput("LSTMX", + "(Tensor) the input X of LSTM for each step." + "Shape is (1 x M), where M is the x frame size") + .AsIntermediate(); + AddOutput( + "LSTMOUT", + "(Tensor) the output of LSTM X(1*(D+M))* weight((D+M)*4D) for each step." + "Shape is (1 x 4D), where M is the x frame size") + .AsIntermediate(); + // TODO(TJ): InEnum({"sigmoid", "tanh", "relu", "identity"}); AddAttr("gate_activation", "(string, default: sigmoid)" "The activation for input gate, forget gate and output " "gate, `sigmoid` by default.") .SetDefault("sigmoid") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); + .InEnum({"sigmoid"}); AddAttr("cell_activation", "(string, default: tanh)" "The activation for cell output, `tanh` by defalut.") .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); + .InEnum({"tanh"}); AddAttr("candidate_activation", "(string, default: tanh)" "The activation for candidate hidden state, " "`tanh` by default.") .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); + .InEnum({"tanh"}); AddComment(R"DOC( -Fusion Long-Short Term Memory (LSTM) Operator. -This operator fuse the X into LSTM, more details can refer to LSTM op. +Attention Long-Short Term Memory (LSTM) Operator. 
+ +Attention part: +concat( x(seqlen * M), expand( cell_t-1(1,D) ) ) => tmp(seqlen*(M+D)) + +tmp(seqlen*(M+D)) * fc((M+D)*1) => fcout(seqlen*1) with bias, relu + +fcout(seqlen*1) * scalar => fcout(seqlen*1) with bias, relu + +dotmul and sum pool ( fcout(seqlen*1), x(seqlen * M) ) => lstm_x_t(1, M) + +LSTM part: +use lstm_x_t as input and compute as standard LSTM. + )DOC"); } +// y[i] = (x[i] + bias[0]) > 0 ? (x[i] + bias[0]) : 0; +template +inline void bias_relu(const int n, const T* x, const T* bias, T* y) { + if (bias) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] + bias[0]; + } + vec_relu(n, y, y); + } else { + vec_relu(n, x, y); + } +} + template -inline void ReorderInitState(const DeviceContext& ctx, - const framework::Tensor& src, - framework::Vector index_lod, - framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; - dst->mutable_data(src.dims(), ctx.GetPlace()); - // TODO(TJ): check mem copy perf - row_shuffle(ctx, src, index_lod, dst, indexed_src); +inline void vec_softmax(const BlasT& blas, const int n, + const T* x, T* y) { + T scalar = x[0]; + // max + for (int i = 1; i < n; ++i) { + scalar = scalar < x[i] ? x[i] : scalar; + } + + // sub + for (int i = 0; i < n; ++i) { + y[c] = x[c] - alpha; + } + + // exp + blas.VEXP(n, y, y); + + // sum + scalar = T(0); + for (int i = 0; i < n; ++i) { + scalar += y[i]; + } + + // scale + blas.VSCAL(n, static_cast(1) / scalar, y); +} + +__m256 exp(__m256 a) { return exp256_ps(a); } + +__m256 log(__m256 a) { return log256_ps(a); } + +__m256 sin(__m256 a) { return sin256_ps(a); } + +__m256 cos(__m256 a) { return cos256_ps(a); } + +__m256 relu(const __m256 a) { + __m256 tmp = _mm256_set1_ps(0.0f); + return _mm256_max_ps(a, tmp); +} + +__m256 sigmoid(const __m256 a) { + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); + __m256 tmp = _mm256_max_ps(a, min); + tmp = _mm256_min_ps(tmp, max); + tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); + tmp = exp(tmp); + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); + tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); + return tmp; +} + +__m256 tanh(const __m256 a) { + __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); + __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); + tmp = _mm256_min_ps(tmp, max); + tmp = exp(tmp); + return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), + _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), + _mm256_set1_ps(1.0f)); +} + +__m256 linear(const __m256 a) { return a; } + +inline void vec_sigmoid(const T* x, T* y) { + const real min = SIGMOID_THRESHOLD_MIN; + const real max = SIGMOID_THRESHOLD_MAX; + real tmp = (a < min) ? min : ((a > max) ? 
max : a); + return 1.0 / (1.0 + exp(-tmp)); } template -class FuisonLSTMKernel : public framework::OpKernel { +class AttentionLSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* wx = ctx.Input("WeightX"); - auto* wh = ctx.Input("WeightH"); - auto* bias = ctx.Input("Bias"); - auto* hidden_t0 = ctx.Input("H0"); - auto* cell_t0 = ctx.Input("C0"); - - auto* xx = ctx.Output("XX"); - auto* batched_gate = ctx.Output("BatchedGate"); - auto* hidden_out = ctx.Output("Hidden"); - auto* cell_out = ctx.Output("Cell"); - bool is_reverse = ctx.Attr("is_reverse"); - - T* xx_data = xx->mutable_data(ctx.GetPlace()); - T* batched_gate_data = batched_gate->mutable_data(ctx.GetPlace()); - hidden_out->mutable_data(ctx.GetPlace()); - cell_out->mutable_data(ctx.GetPlace()); + auto* x = ctx.Input("X"); // T x M + auto* h0 = ctx.Input("H0"); // N x D + auto* c0 = ctx.Input("C0"); // N x D + auto* atten_w = ctx.Input("AttentionWeight"); // (M+D) x 1 + auto* atten_b = ctx.Input("AttentionBias"); // 1x1 + auto* atten_scalar = ctx.Input("AttentionScalar"); // 1x1 + auto* atten_scalar_bias = ctx.Input("AttentionScalar"); // 1x1 + auto* lstm_w = ctx.Input("LSTMWeight"); // (D+M) x D*4 + auto* lstm_b = ctx.Input("LSTMBias"); // 1 x D*4 + + auto* hidden_out = ctx.Output("Hidden"); // TxD + auto* cell_out = ctx.Output("Cell"); // TxD + auto* atted_x = ctx.Output("AttentionedX"); // T x 1 + auto* fc_out = ctx.Output('AttentionFCOut'); // max_seq_len x 1 + auto* lstm_x = ctx.Output("LSTMX"); // 1 x M + auto* lstm_out = ctx.Output("LSTMOUT"); // 1 x 4D const T* x_data = x->data(); - const T* wx_data = wx->data(); - auto x_dims = x->dims(); - auto wx_dims = wx->dims(); - - math::LoDTensor2BatchFunctor to_batch; - auto& dev_ctx = ctx.template device_context(); - auto blas = math::GetBlas(dev_ctx); - if (x_dims[1] > wx_dims[1]) { - math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], - x_data, wx_data, xx_data, - bias->data()); - to_batch(dev_ctx, *xx, batched_gate, true, is_reverse); - } else { - to_batch(dev_ctx, *x, xx, true, is_reverse); - batched_gate->set_lod(xx->lod()); - math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], - xx_data, wx_data, batched_gate_data, - bias->data()); - } - - int frame_size = static_cast(wx_dims[1] / 4); - framework::DDim out_dims({x_dims[0], frame_size}); - math::LstmMetaValue lstm_value; - // no peephole - lstm_value.check_ig = nullptr; - lstm_value.check_fg = nullptr; - lstm_value.check_og = nullptr; - lstm_value.prev_state_value = nullptr; - Tensor ordered_c0; - - framework::Vector order(batched_gate->lod()[2]); - - if (cell_t0) { - // Since the batch computing for LSTM reorders the input sequence - // according to their length. The initialized cell state also needs - // to reorder. - ReorderInitState(dev_ctx, *cell_t0, order, &ordered_c0, - true); - lstm_value.prev_state_value = ordered_c0.data(); - } - - // Use the local variable as here. 
- LoDTensor batch_hidden, batch_cell; - auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); - batch_hidden.mutable_data(out_dims, ctx.GetPlace()); - batch_cell.mutable_data(out_dims, ctx.GetPlace()); - batch_cell_pre_act->mutable_data(out_dims, ctx.GetPlace()); - - auto batch_starts = batched_gate->lod()[0]; - size_t max_seq_len = batch_starts.size() - 1; - auto gate_act = math::detail::GetActivationType( - ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( - ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( - ctx.Attr("candidate_activation")); - - for (size_t n = 0; n < max_seq_len; n++) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - - Tensor gate_t = batched_gate->Slice(bstart, bend); - Tensor out_t = batch_hidden.Slice(bstart, bend); - Tensor cell_t = batch_cell.Slice(bstart, bend); - Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); - - int cur_batch_size = bend - bstart; - - if (n > 0) { - int pre_h_start = static_cast(batch_starts[n - 1]); - int pre_h_end = pre_h_start + cur_batch_size; - auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); - // TODO(TJ): use gemm directly - blas.MatMul(pre_hidden_t, false, *wh, false, static_cast(1.0), - &gate_t, static_cast(1.0)); - } else if (hidden_t0) { - // TODO(TJ): move h0 outside for - // If n == 0 and there is no initialized hidden state, that is to say - // the H0 is zeros, the calculation W_h * H0 will be skiped. - // If n == 0 and there is initialized hidden state, calculate W_h * H0. - - // Since the batch computing for LSTM reorders the input sequence - // according to their length. The initialized hidden state also needs - // to reorder. - Tensor ordered_h0; - ReorderInitState(dev_ctx, *hidden_t0, order, - &ordered_h0, true); - // TODO(TJ): use gemm directly - blas.MatMul(ordered_h0, false, *wh, false, static_cast(1.0), &gate_t, - static_cast(1.0)); + const T* h0_data = h0->data(); + const T* c0_data = c0->data(); + const T* lstm_w_data = lstm_w->data(); + const T* lstm_b_data = lstm_b->data(); + const T* atten_w_data = atten_w->data(); + const T* atten_b_data = atten_b ? atten_b->data() : NULL; + const T* atten_scalar_data = atten_scalar ? atten_scalar->data() : NULL; + const T* atten_scalar_bias_data = + atten_scalar_bias ? atten_scalar_bias->data() : NULL; + + T* hidden_out_data = hidden_out->mutable_data(); + T* cell_out_data = cell_out->mutable_data(); + T* atted_x_data = atted_x->mutable_data(); + T* fc_out_data = fc_out->mutable_data(); + T* lstm_x_data = lstm_x->mutable_data(); + T* lstm_out_data = lstm_out->mutable_data(); + + auto x_lod = x->lod(); + auto x_dims = x->dims(); // T x M + auto w_dims = w->dims(); // (D+M) x 4D + const int M = x_dims[1]; // x frame size + const int D = w_dims[1] / 4; // gate frame size + const int D2 = D * 2; + const int D3 = D * 3; + const int D4 = w_dims[1]; + const int batch_size = x_lod[0].size() - 1; // assert lod.size() == 1 + + // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1 + auto blas = math::GetBlas(ctx); + math::FCCompute(blas, T, 1, M, x_data, atten_w_data, + atted_x_data, atten_b_data); + + const T* cur_x_data = x_data; + const T* prev_cell_data = NULL; + const T* prev_hidden_data = NULL; + T* cur_cell_out_data = cell_out_data; + T* cur_hidden_out_data = hidden_out_data; + for (int i = 0; i < batch_size; ++i) { + int seq_len = x_lod[0][i + 1]; + prev_cell_data = c0_data + i * D; + prev_hidden_data = h0 ? 
h0_data + i * D : NULL; + + for (int step = 0; step < seq_len; ++step) { + /// compute attention vector + // prev_cell(1xD) * fc(D) rest part of atten_wgt + // T = cblas_dot(); + T prev_cell_bias = blas.VDOT(D, prev_cell_data, atten_w_data + M); + // add cell bias and relu + bias_relu(seq_len, atted_x_data, &prev_cell_bias, fc_out_data); + // fc2: scalar + if (atten_scalar_data) { + // x = a*x + blas.VSCAL(seq_len, atten_scalar_data, fc_out_data); + bias_relu(seq_len, fc_out_data, atten_scalar_bias_data, + fc_out_data); + } + vec_softmax(blas, seq_len, fc_out_data, fc_out_data); + // mul x(seq_len*M) and sum pool + math::FCCompute(blas, 1, M, seq_len, fc_out_data, + cur_x_data, lstm_x_data); + + /// compute LSTM step + // lstm weight : concat[forget , input , output , tilde] + // shape : (D + M) x (4 * D) + // fc inputX(1xM) * weightX(M*(4D)) => 1 x 4D + blas.MatMul(1, D4, M, lstm_x_data, lstm_w_data + D * D4, lstm_out_data); + if (prev_hidden_data) { + blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast(1), + prev_hidden_data, D, lstm_w_data, D4, static_cast(1), + lstm_out_data, D4); + } + // since input is 1xM, so can use add bias + blas.VADD(D4, lstm_b_data, lstm_out_data, lstm_out_data); + + // gate act: sigmoid + vec_sigmoid(D3, lstm_out_data, lstm_out_data); + // candicate act: tanh + vec_tanh(D, lstm_out_data + D3, lstm_out_data + D3); + + // a = forget * prev_cell + blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data); + + // b = input * tilde + blas.VMUL(D, lstm_out_data + D, lstm_out + D3, lstm_out_data + D); + + // cell_out = a + b + blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data); + + // state act tanh(cell_out) * output_gate + vec_tanh(D, cur_cell_out_data, lstm_out_data); + blas.VMUL(D, lstm_out_data, lstm_out + D2, cur_hidden_out_data); + + prev_hidden_data = hidden_out + i * gate_size; + prev_cell_data = cur_cell_out_data; + cur_cell_out_data = cur_cell_out_data + D; + cur_hidden_out_data = cur_hidden_out_data + D; } - - lstm_value.gate_value = gate_t.data(); - lstm_value.output_value = out_t.data(); - lstm_value.state_value = cell_t.data(); - lstm_value.state_active_value = cell_pre_act_t.data(); - math::LstmUnitFunctor::compute( - dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act, - cand_act); - lstm_value.prev_state_value = lstm_value.state_value; + cur_x_data = cur_x_data + seq_len * M; } - - math::Batch2LoDTensorFunctor to_seq; - batch_hidden.set_lod(batched_gate->lod()); - // restore the output hidden in LoDTensor from the batch hidden - to_seq(dev_ctx, batch_hidden, hidden_out); - - batch_cell.set_lod(batched_gate->lod()); - // restore the output cell state in LoDTensor from the batch cell - to_seq(dev_ctx, batch_cell, cell_out); } }; @@ -345,10 +430,11 @@ class FuisonLSTMKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker, +REGISTER_OPERATOR(attention_lstm, ops::AttentionLSTMOp, + ops::AttentionLSTMOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OP_CPU_KERNEL( - fusion_lstm, - ops::FuisonLSTMKernel, - ops::FuisonLSTMKernel); + attention_lstm, + ops::AttentionLSTMKernel, + ops::AttentionLSTMKernel); diff --git a/paddle/fluid/operators/attention_lstm_op.h b/paddle/fluid/operators/attention_lstm_op.h index 39dc09b4d1..6ede3a7f3c 100644 --- a/paddle/fluid/operators/attention_lstm_op.h +++ b/paddle/fluid/operators/attention_lstm_op.h @@ -13,7 +13,6 @@ See the License for the specific language 
governing permissions and limitations under the License. */ #pragma once -// #include #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -22,7 +21,7 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -class FusionLSTMOp : public framework::OperatorWithKernel { +class AttentionLSTMOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -33,7 +32,7 @@ class FusionLSTMOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override; }; -class FusionLSTMOpMaker : public framework::OpProtoAndCheckerMaker { +class AttentionLSTMOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override; }; diff --git a/paddle/fluid/operators/fusion_lstm_op.h b/paddle/fluid/operators/fusion_lstm_op.h index 39dc09b4d1..7f79601602 100644 --- a/paddle/fluid/operators/fusion_lstm_op.h +++ b/paddle/fluid/operators/fusion_lstm_op.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -// #include #include "paddle/fluid/framework/op_registry.h" namespace paddle { From 6ed20474d47a2577159a3799549c457e9f38f420 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 10:17:47 +0800 Subject: [PATCH 08/21] refine attention lstm infershape --- paddle/fluid/operators/attention_lstm_op.cc | 198 +++++++++++--------- 1 file changed, 111 insertions(+), 87 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 178a1c19a9..636deb04a1 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -26,86 +26,102 @@ namespace paddle { namespace operators { void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("WeightX"), - "Input(WeightX) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("WeightH"), - "Input(WeightH) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Bias"), - "Input(Bias) of LSTM should not be null."); - - PADDLE_ENFORCE(ctx->HasOutput("XX"), - "Output(XX) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(C0) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"), + "Input(LSTMWeight) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LSTMBias"), + "Input(LSTMBias) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"), + "Input(AttentionWeight) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), - "Output(Hidden) of LSTM should not be null."); + "Output(Hidden) of AttentionLSTM should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Cell"), - "Output(Cell) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"), - "Output(BatchedGate) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), - "Output(BatchedGate) of LSTM should not be null."); + "Output(Cell) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"), + "Output(AttentionedX) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"), + "Output(AttentionFCOut) of AttentionLSTM should not be 
null."); + PADDLE_ENFORCE(ctx->HasOutput("LSTMX"), + "Output(LSTMX) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"), + "Output(LSTMOUT) of AttentionLSTM should not be null."); auto x_dims = ctx->GetInputDim("X"); + const int M = x_dims[1]; PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + auto w_dims = ctx->GetInputDim("LSTMWeight"); + const int D = w_dims[1] / 4; + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(LSTMWeight)'s rank must be 2."); + PADDLE_ENFORCE_EQ(w_dims[0], D + M, + "LSTMWeight dims should be (%d + %d) * %d.", D + M, 4 * D); + + auto b_dims = ctx->GetInputDim("LSTMBias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(LSTMBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, "LSTMBias dims should be 1 x (%d + %d).", M, + D); + PADDLE_ENFORCE_EQ(b_dims[1], M + D, "LSTMBias dims should be 1 x (%d + %d).", + M, D); + + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2."); + PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D); if (ctx->HasInput("H0")) { - PADDLE_ENFORCE(ctx->HasInput("C0"), - "Input(Cell) and Input(Hidden) of LSTM should not " - "be null at the same time."); auto h_dims = ctx->GetInputDim("H0"); - auto c_dims = ctx->GetInputDim("C0"); PADDLE_ENFORCE(h_dims == c_dims, "The dimension of Input(H0) and Input(C0) " "should be the same."); } - // fc_out , shape (maxseqlen,1) - int max_seq_len = 0; - - auto wx_dims = ctx->GetInputDim("WeightX"); - PADDLE_ENFORCE_EQ(wx_dims.size(), 2, - "The rank of Input(WeightX) should be 2."); - PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], - "The first dimension of Input(WeightX) " - "should be %d.", - x_dims[1]); - - int frame_size = wx_dims[1] / 4; - auto wh_dims = ctx->GetInputDim("WeightH"); - PADDLE_ENFORCE_EQ(wh_dims.size(), 2, - "The rank of Input(WeightH) should be 2."); - PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, - "The first dimension of Input(WeightH) " - "should be %d.", - frame_size); - PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size, - "The second dimension of Input(WeightH) " - "should be 4 * %d.", - frame_size); - - auto b_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); - PADDLE_ENFORCE_EQ(b_dims[0], 1, - "The first dimension of Input(Bias) should be 1."); - - PADDLE_ENFORCE(!ctx->Attrs().Get("use_peepholes"), - "Do not support peephole yet."); - PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, - "The second dimension of Input(Bias) should be " - "4 * %d if disable peepholes connection", - frame_size); - - framework::DDim out_dims({x_dims[0], frame_size}); + auto atten_w_dims = ctx->GetInputDim("AttentionWeight"); + PADDLE_ENFORCE_EQ(atten_w_dims.size(), 2, + "Input(AttentionWeight)'s rank must be 2."); + PADDLE_ENFORCE_EQ(atten_w_dims[0], M + D, + "AttentionWeight shapes must be (%d + %d) * 1.", M, D); + PADDLE_ENFORCE_EQ(atten_w_dims[1], 1, + "AttentionWeight shapes must be (%d + %d) * 1.", M, D); + if (ctx->HasInput("AttentionBias")) { + auto atten_b_dims = ctx->GetInputDim("AttentionBias"); + PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2, + "Input(AttentionBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(atten_b_dims[0], 1, + "AttentionBias shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(atten_b_dims[1], 1, + "AttentionBias shapes must be 1 * 1."); + } + + if (ctx->HasInput("AttentionScalar")) { + auto dims = ctx->GetInputDim("AttentionScalar"); + PADDLE_ENFORCE_EQ(dims.size(), 2, + "Input(AttentionScalar)'s rank must be 2."); + 
PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalar shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalar shapes must be 1 * 1."); + } + + if (ctx->HasInput("AttentionScalarBias")) { + auto dims = ctx->GetInputDim("AttentionScalarBias"); + PADDLE_ENFORCE( + ctx->HasInput("AttentionScalar"), + "AttentionScalar should not be null when have AttentionScalarBias."); + PADDLE_ENFORCE_EQ(dims.size(), 2, + "Input(AttentionScalarBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalarBias shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalarBias shapes must be 1 * 1."); + } + + framework::DDim out_dims({x_dims[0], D}); ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Cell", out_dims); - ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]}); - ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->SetOutputDim("AttentionedX", {x_dims[0], 1}); + ctx->SetOutputDim("LSTMX", {1, M}); + ctx->SetOutputDim("LSTMOUT", {1, 4 * D}); + // AttentionFCOut should be reshape as (maxseqlen,1) in runtime ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Cell"); - - int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; - ctx->SetOutputDim("XX", {x_dims[0], xx_width}); - ctx->ShareLoD("X", "XX"); } framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType( @@ -164,11 +180,10 @@ void AttentionLSTMOpMaker::Make() { AddOutput("Cell", "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. " "The shape is (T x D), and lod is the same with the `Input`."); - AddOutput( - "AttentionedX", - "(LodTensor) shape is (T x 1), the result after X * AttentionWeight," - " where T is the total time steps in this mini-batch," - " D is the hidden size.") + AddOutput("AttentionedX", + "(Tensor) shape is (T x 1), the result after X * AttentionWeight," + " where T is the total time steps in this mini-batch," + " D is the hidden size.") .AsIntermediate(); AddOutput("AttentionFCOut", "(Tensor) (max_seq_len, 1), compute at each step.") @@ -316,12 +331,31 @@ class AttentionLSTMKernel : public framework::OpKernel { auto* lstm_w = ctx.Input("LSTMWeight"); // (D+M) x D*4 auto* lstm_b = ctx.Input("LSTMBias"); // 1 x D*4 - auto* hidden_out = ctx.Output("Hidden"); // TxD - auto* cell_out = ctx.Output("Cell"); // TxD - auto* atted_x = ctx.Output("AttentionedX"); // T x 1 - auto* fc_out = ctx.Output('AttentionFCOut'); // max_seq_len x 1 - auto* lstm_x = ctx.Output("LSTMX"); // 1 x M - auto* lstm_out = ctx.Output("LSTMOUT"); // 1 x 4D + auto* hidden_out = ctx.Output("Hidden"); // TxD + auto* cell_out = ctx.Output("Cell"); // TxD + auto* atted_x = ctx.Output("AttentionedX"); // T x 1 + auto* fc_out = ctx.Output('AttentionFCOut'); // max_seq_len x 1 + auto* lstm_x = ctx.Output("LSTMX"); // 1 x M + auto* lstm_out = ctx.Output("LSTMOUT"); // 1 x 4D + + // some shape should be reshape here since infershape can not get lod info + auto x_lod = x->lod(); + const int N = x_lod[0].size() - 1; // batch size + auto x_dims = x->dims(); // T x M + auto w_dims = w->dims(); // (D+M) x 4D + const int M = x_dims[1]; // x frame size + const int D = w_dims[1] / 4; // gate frame size + const int D2 = D * 2; + const int D3 = D * 3; + const int D4 = w_dims[1]; + int max_seq_len = x_lod[0][1]; + for (int i = 1; i < N; ++i) { + int len = x_lod[0][i + 1] - x_lod[0][i]; + max_seq_len = max_seq_len < len ? 
len : max_seq_len; + } + PADDLE_ENFORCE_EQ(x_lod.size(), 1, "Input(X)'s lod size must be 1."); + PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D); + fc_out->Resize({max_seq_len, 1}); const T* x_data = x->data(); const T* h0_data = h0->data(); @@ -341,16 +375,6 @@ class AttentionLSTMKernel : public framework::OpKernel { T* lstm_x_data = lstm_x->mutable_data(); T* lstm_out_data = lstm_out->mutable_data(); - auto x_lod = x->lod(); - auto x_dims = x->dims(); // T x M - auto w_dims = w->dims(); // (D+M) x 4D - const int M = x_dims[1]; // x frame size - const int D = w_dims[1] / 4; // gate frame size - const int D2 = D * 2; - const int D3 = D * 3; - const int D4 = w_dims[1]; - const int batch_size = x_lod[0].size() - 1; // assert lod.size() == 1 - // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1 auto blas = math::GetBlas(ctx); math::FCCompute(blas, T, 1, M, x_data, atten_w_data, @@ -361,7 +385,7 @@ class AttentionLSTMKernel : public framework::OpKernel { const T* prev_hidden_data = NULL; T* cur_cell_out_data = cell_out_data; T* cur_hidden_out_data = hidden_out_data; - for (int i = 0; i < batch_size; ++i) { + for (int i = 0; i < N; ++i) { int seq_len = x_lod[0][i + 1]; prev_cell_data = c0_data + i * D; prev_hidden_data = h0 ? h0_data + i * D : NULL; @@ -370,13 +394,13 @@ class AttentionLSTMKernel : public framework::OpKernel { /// compute attention vector // prev_cell(1xD) * fc(D) rest part of atten_wgt // T = cblas_dot(); - T prev_cell_bias = blas.VDOT(D, prev_cell_data, atten_w_data + M); + T prev_cell_bias = blas.DOT(D, prev_cell_data, atten_w_data + M); // add cell bias and relu bias_relu(seq_len, atted_x_data, &prev_cell_bias, fc_out_data); // fc2: scalar if (atten_scalar_data) { // x = a*x - blas.VSCAL(seq_len, atten_scalar_data, fc_out_data); + blas.SCAL(seq_len, atten_scalar_data, fc_out_data); bias_relu(seq_len, fc_out_data, atten_scalar_bias_data, fc_out_data); } From cf5ea925c3eea2f63b099513b85eaf5032db38fa Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 16:10:55 +0800 Subject: [PATCH 09/21] fix bugs --- paddle/fluid/operators/attention_lstm_op.cc | 123 +++++++++----------- paddle/fluid/operators/math/blas.h | 17 ++- paddle/fluid/operators/math/blas_impl.h | 3 +- 3 files changed, 75 insertions(+), 68 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 636deb04a1..87fda12ea6 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -15,12 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/fc_compute.h" -#include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" - -#include "paddle/fluid/operators/math/cpu_vec.h" +// #include "paddle/fluid/operators/math/detail/activation_functions.h" +// #include "paddle/fluid/operators/math/cpu_vec.h" namespace paddle { namespace operators { @@ -233,6 +230,13 @@ use lstm_x_t as input and compute as standard LSTM. )DOC"); } +template +inline void vec_relu(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + // y[i] = (x[i] + bias[0]) > 0 ? 
(x[i] + bias[0]) : 0; template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { @@ -240,14 +244,14 @@ inline void bias_relu(const int n, const T* x, const T* bias, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] + bias[0]; } - vec_relu(n, y, y); + vec_relu(n, y, y); } else { - vec_relu(n, x, y); + vec_relu(n, x, y); } } template -inline void vec_softmax(const BlasT& blas, const int n, +inline void vec_softmax(const math::BlasT& blas, const int n, const T* x, T* y) { T scalar = x[0]; // max @@ -257,7 +261,7 @@ inline void vec_softmax(const BlasT& blas, const int n, // sub for (int i = 0; i < n; ++i) { - y[c] = x[c] - alpha; + y[i] = x[i] - scalar; } // exp @@ -270,57 +274,45 @@ inline void vec_softmax(const BlasT& blas, const int n, } // scale - blas.VSCAL(n, static_cast(1) / scalar, y); + blas.SCAL(n, static_cast(1) / scalar, y); } -__m256 exp(__m256 a) { return exp256_ps(a); } +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 -__m256 log(__m256 a) { return log256_ps(a); } - -__m256 sin(__m256 a) { return sin256_ps(a); } - -__m256 cos(__m256 a) { return cos256_ps(a); } - -__m256 relu(const __m256 a) { - __m256 tmp = _mm256_set1_ps(0.0f); - return _mm256_max_ps(a, tmp); +template +inline T sigmoid(T x) { + return 1. / (1. + exp(-x)); } -__m256 sigmoid(const __m256 a) { - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); - __m256 tmp = _mm256_max_ps(a, min); - tmp = _mm256_min_ps(tmp, max); - tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); - tmp = exp(tmp); - tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); - tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); - return tmp; +template +inline T tanh(T x) { + return 2. * sigmoid(2. * x) - 1.; } -__m256 tanh(const __m256 a) { - __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); - __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); - tmp = _mm256_min_ps(tmp, max); - tmp = exp(tmp); - return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), - _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), - _mm256_set1_ps(1.0f)); +template +inline void vec_sigmoid(const int n, const T* x, T* y) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = 1.0 / (1.0 + std::exp(-tmp)); + } } -__m256 linear(const __m256 a) { return a; } - -inline void vec_sigmoid(const T* x, T* y) { - const real min = SIGMOID_THRESHOLD_MIN; - const real max = SIGMOID_THRESHOLD_MAX; - real tmp = (a < min) ? min : ((a > max) ? 
max : a); - return 1.0 / (1.0 + exp(-tmp)); +template +inline void vec_tanh(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = tanh(x[i]); + } } -template +template class AttentionLSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using DeviceContext = paddle::platform::CPUDeviceContext; auto* x = ctx.Input("X"); // T x M auto* h0 = ctx.Input("H0"); // N x D auto* c0 = ctx.Input("C0"); // N x D @@ -334,7 +326,7 @@ class AttentionLSTMKernel : public framework::OpKernel { auto* hidden_out = ctx.Output("Hidden"); // TxD auto* cell_out = ctx.Output("Cell"); // TxD auto* atted_x = ctx.Output("AttentionedX"); // T x 1 - auto* fc_out = ctx.Output('AttentionFCOut'); // max_seq_len x 1 + auto* fc_out = ctx.Output("AttentionFCOut"); // max_seq_len x 1 auto* lstm_x = ctx.Output("LSTMX"); // 1 x M auto* lstm_out = ctx.Output("LSTMOUT"); // 1 x 4D @@ -342,9 +334,10 @@ class AttentionLSTMKernel : public framework::OpKernel { auto x_lod = x->lod(); const int N = x_lod[0].size() - 1; // batch size auto x_dims = x->dims(); // T x M - auto w_dims = w->dims(); // (D+M) x 4D - const int M = x_dims[1]; // x frame size - const int D = w_dims[1] / 4; // gate frame size + auto w_dims = lstm_w->dims(); // (D+M) x 4D + const int total_T = x_dims[0]; + const int M = x_dims[1]; // x frame size + const int D = w_dims[1] / 4; // gate frame size const int D2 = D * 2; const int D3 = D * 3; const int D4 = w_dims[1]; @@ -357,6 +350,8 @@ class AttentionLSTMKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D); fc_out->Resize({max_seq_len, 1}); + // TODO(TJ): act functor init here + const T* x_data = x->data(); const T* h0_data = h0->data(); const T* c0_data = c0->data(); @@ -368,16 +363,16 @@ class AttentionLSTMKernel : public framework::OpKernel { const T* atten_scalar_bias_data = atten_scalar_bias ? 
atten_scalar_bias->data() : NULL; - T* hidden_out_data = hidden_out->mutable_data(); - T* cell_out_data = cell_out->mutable_data(); - T* atted_x_data = atted_x->mutable_data(); - T* fc_out_data = fc_out->mutable_data(); - T* lstm_x_data = lstm_x->mutable_data(); - T* lstm_out_data = lstm_out->mutable_data(); + T* hidden_out_data = hidden_out->mutable_data(ctx.GetPlace()); + T* cell_out_data = cell_out->mutable_data(ctx.GetPlace()); + T* atted_x_data = atted_x->mutable_data(ctx.GetPlace()); + T* fc_out_data = fc_out->mutable_data(ctx.GetPlace()); + T* lstm_x_data = lstm_x->mutable_data(ctx.GetPlace()); + T* lstm_out_data = lstm_out->mutable_data(ctx.GetPlace()); // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1 auto blas = math::GetBlas(ctx); - math::FCCompute(blas, T, 1, M, x_data, atten_w_data, + math::FCCompute(blas, total_T, 1, M, x_data, atten_w_data, atted_x_data, atten_b_data); const T* cur_x_data = x_data; @@ -400,7 +395,7 @@ class AttentionLSTMKernel : public framework::OpKernel { // fc2: scalar if (atten_scalar_data) { // x = a*x - blas.SCAL(seq_len, atten_scalar_data, fc_out_data); + blas.SCAL(seq_len, *atten_scalar_data, fc_out_data); bias_relu(seq_len, fc_out_data, atten_scalar_bias_data, fc_out_data); } @@ -431,16 +426,16 @@ class AttentionLSTMKernel : public framework::OpKernel { blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data); // b = input * tilde - blas.VMUL(D, lstm_out_data + D, lstm_out + D3, lstm_out_data + D); + blas.VMUL(D, lstm_out_data + D, lstm_out_data + D3, lstm_out_data + D); // cell_out = a + b blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data); // state act tanh(cell_out) * output_gate vec_tanh(D, cur_cell_out_data, lstm_out_data); - blas.VMUL(D, lstm_out_data, lstm_out + D2, cur_hidden_out_data); + blas.VMUL(D, lstm_out_data, lstm_out_data + D2, cur_hidden_out_data); - prev_hidden_data = hidden_out + i * gate_size; + prev_hidden_data = cur_hidden_out_data; prev_cell_data = cur_cell_out_data; cur_cell_out_data = cur_cell_out_data + D; cur_hidden_out_data = cur_hidden_out_data + D; @@ -458,7 +453,5 @@ REGISTER_OPERATOR(attention_lstm, ops::AttentionLSTMOp, ops::AttentionLSTMOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OP_CPU_KERNEL( - attention_lstm, - ops::AttentionLSTMKernel, - ops::AttentionLSTMKernel); +REGISTER_OP_CPU_KERNEL(attention_lstm, ops::AttentionLSTMKernel, + ops::AttentionLSTMKernel); diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 5aba170221..da185d93c0 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -160,7 +160,7 @@ class Blas { T DOT(int n, const T* x, const T* y) const; template - void SCAL(int n, const T a, const T* x) const; + void SCAL(int n, const T a, T* x) const; template void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, @@ -233,11 +233,26 @@ class BlasT : private Blas { Base()->template VCOPY(args...); } + template + void VEXP(ARGS... args) const { + Base()->template VEXP(args...); + } + template void GEMV(ARGS... args) const { Base()->template GEMV(args...); } + template + T DOT(ARGS... args) const { + return Base()->template DOT(args...); + } + + template + void SCAL(ARGS... args) const { + Base()->template SCAL(args...); + } + template void BatchedGEMM(ARGS... 
args) const { Base()->template BatchedGEMM(args...); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index eaad83ba18..e1df78d11e 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -415,8 +415,7 @@ T Blas::DOT(int n, const T *x, const T *y) const { template <> template -void Blas::SCAL(int n, const T a, - const T *x) const { +void Blas::SCAL(int n, const T a, T *x) const { #ifdef PADDLE_WITH_MKLML CBlas::SCAL(n, a, x, 1); #else From ec59f0d454569ef536c9ac0f7224bc7062b110ce Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 16:40:37 +0800 Subject: [PATCH 10/21] add cpu vec --- paddle/fluid/operators/attention_lstm_op.cc | 56 +++----------- paddle/fluid/operators/math/cpu_vec.h | 81 +++++++++++++++++++++ paddle/fluid/platform/cpu_info.cc | 2 + paddle/fluid/platform/cpu_info.h | 1 + 4 files changed, 95 insertions(+), 45 deletions(-) create mode 100644 paddle/fluid/operators/math/cpu_vec.h diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 87fda12ea6..14985a3f74 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" #include #include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" -// #include "paddle/fluid/operators/math/detail/activation_functions.h" -// #include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -230,13 +230,6 @@ use lstm_x_t as input and compute as standard LSTM. )DOC"); } -template -inline void vec_relu(const int n, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? x[i] : 0; - } -} - // y[i] = (x[i] + bias[0]) > 0 ? (x[i] + bias[0]) : 0; template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { @@ -244,9 +237,9 @@ inline void bias_relu(const int n, const T* x, const T* bias, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] + bias[0]; } - vec_relu(n, y, y); + math::vec_relu(n, y, y); } else { - vec_relu(n, x, y); + math::vec_relu(n, x, y); } } @@ -277,37 +270,6 @@ inline void vec_softmax(const math::BlasT& blas, const int n, blas.SCAL(n, static_cast(1) / scalar, y); } -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 - -template -inline T sigmoid(T x) { - return 1. / (1. + exp(-x)); -} - -template -inline T tanh(T x) { - return 2. * sigmoid(2. * x) - 1.; -} - -template -inline void vec_sigmoid(const int n, const T* x, T* y) { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - T tmp = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); - y[i] = 1.0 / (1.0 + std::exp(-tmp)); - } -} - -template -inline void vec_tanh(const int n, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = tanh(x[i]); - } -} - template class AttentionLSTMKernel : public framework::OpKernel { public: @@ -351,6 +313,10 @@ class AttentionLSTMKernel : public framework::OpKernel { fc_out->Resize({max_seq_len, 1}); // TODO(TJ): act functor init here + // if (platform::jit::MayIUse(platform::jit::avx2)) { + // } else if (platform::jit::MayIUse(platform::jit::avx)) { + // } else { + // } const T* x_data = x->data(); const T* h0_data = h0->data(); @@ -418,9 +384,9 @@ class AttentionLSTMKernel : public framework::OpKernel { blas.VADD(D4, lstm_b_data, lstm_out_data, lstm_out_data); // gate act: sigmoid - vec_sigmoid(D3, lstm_out_data, lstm_out_data); + math::vec_sigmoid(D3, lstm_out_data, lstm_out_data); // candicate act: tanh - vec_tanh(D, lstm_out_data + D3, lstm_out_data + D3); + math::vec_tanh(D, lstm_out_data + D3, lstm_out_data + D3); // a = forget * prev_cell blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data); @@ -432,7 +398,7 @@ class AttentionLSTMKernel : public framework::OpKernel { blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data); // state act tanh(cell_out) * output_gate - vec_tanh(D, cur_cell_out_data, lstm_out_data); + math::vec_tanh(D, cur_cell_out_data, lstm_out_data); blas.VMUL(D, lstm_out_data, lstm_out_data + D2, cur_hidden_out_data); prev_hidden_data = cur_hidden_out_data; diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h new file mode 100644 index 0000000000..29476fce70 --- /dev/null +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace math { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +template +inline T sigmoid(T x) { + return 1. / (1. + exp(-x)); +} + +template +inline T tanh(T x) { + return 2. * sigmoid(2. * x) - 1.; +} + +template +inline void vec_sigmoid(const int n, const T* x, T* y) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = 1.0 / (1.0 + std::exp(-tmp)); + } +} + +template +inline void vec_tanh(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = tanh(x[i]); + } +} + +template +inline void vec_relu(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +template <> +inline void vec_relu(const int n, const float* x, + float* y) { + // TODO(TJ): complete me + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? 
x[i] : 0; + } +} + +template <> +inline void vec_relu(const int n, const float* x, + float* y) { + // TODO(TJ): complete me + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 7d53a684d6..79a924434b 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -112,6 +112,8 @@ bool MayIUse(const cpu_isa_t cpu_isa) { switch (cpu_isa) { case sse42: return cpu.has(Cpu::tSSE42); + case avx: + return cpu.has(Cpu::tAVX); case avx2: return cpu.has(Cpu::tAVX2); case avx512_common: diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index f5f6766759..2baa21c1bd 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -43,6 +43,7 @@ namespace jit { typedef enum { isa_any, sse42, + avx, avx2, avx512_common, avx512_core, From 93cc29abc09ac3d9cc85d4490f878da46431cdda Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 16:54:57 +0800 Subject: [PATCH 11/21] init attention lstm op test --- .../tests/unittests/test_attention_lstm_op.py | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_attention_lstm_op.py diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py new file mode 100644 index 0000000000..cd555a022b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py @@ -0,0 +1,149 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +from test_fusion_lstm_op import fc, ACTIVATION + + +def attention_lstm( + x, # T x M + lod, # 1 x N + h0, # N x D + c0, # N x D + fcws, # (M+D) x 1, 1x1 + fcbs, # 1 x 1, 1x1 + w, # (M+D) x 4D + b, # 1 x 4D + act_gate, + act_cell, + act_cand): + hidden + cell + return hidden, cell + + +class TestAttentionLSTMOp(OpTest): + def set_conf(self): + self.lod = [[3]] + + def setUp(self): + self.op_type = 'attention_lstm' + self.lod = [[3]] + self.M = 30 + self.D = 15 + self.has_initial_hidden = True + self.act_gate = 'sigmoid' + self.act_cell = 'tanh' + self.act_cand = 'tanh' + self.set_conf() + + T = sum(self.lod[0]) + bs = len(self.lod[0]) + + x = np.random.normal(size=(T, self.M)).astype('float32') + c0 = np.random.normal(size=(bs, self.D)).astype('float32') + if self.has_initial_hidden: + h0 = np.random.normal(size=(bs, self.D)).astype('float32') + else: + h0 = np.zeros((bs, self.D)).astype('float32') + + fcw1 = np.random.normal(size=(self.M + self.D, 1)).astype('float32') + fcb1 = np.random.normal(size=(1, 1)).astype('float32') + fcw2 = np.random.normal(size=(1, 1)).astype('float32') + fcb2 = np.random.normal(size=(1, 1)).astype('float32') + + # lstm weight and bias + w = np.random.normal(size=(self.M + self.D, + self.D * 4)).astype('float32') + b = np.random.normal(size=(1, self.D * 4)).astype('float32') + + h, c = attention_lstm(x, self.lod, h0, c0, [fcw1, fcw2], [fcb1, fcb2], + ACTIVATION[self.act_gate], + ACTIVATION[self.act_cell], + ACTIVATION[self.act_cand]) + + self.inputs = { + 'X': (x, self.lod), + 'C0': c0, + 'AttentionWeight': fcw1, + 'AttentionBias': fcb1, + 'AttentionScalar': fcw2, + 'AttentionScalarBias': fcb2, + 'LSTMWeight': w, + 'LSTMBias': b + } + + if self.has_initial_hidden: + self.inputs['H0'] = h0 + + self.outputs = { + 'Hidden': (h, self.lod), + 'Cell': (c, self.lod), + } + self.attrs = { + 'gate_activation': self.act_gate, + 'cell_activation': self.act_cell, + 'candidate_activation': self.act_cand + } + + def test_check_output(self): + self.check_output() + + +class TestAttentionOpNonInit(TestAttentionLSTMOp): + def set_conf(self): + self.has_initial_hidden = False + + +class TestAttentionOpMD1(TestAttentionLSTMOp): + def set_conf(self): + self.M = 36 + self.D = 8 + + +class TestAttentionOpMD2(TestAttentionLSTMOp): + def set_conf(self): + self.M = 8 + self.D = 8 + + +class TestAttentionOpMD3(TestAttentionLSTMOp): + def set_conf(self): + self.M = 15 + self.D = 30 + + +class TestAttentionOpBS1(TestAttentionLSTMOp): + def set_conf(self): + self.lod = [[5]] + self.M = 16 + self.D = 32 + + +class TestAttentionOpBS2(TestAttentionLSTMOp): + def set_conf(self): + self.lod = [[3, 6]] + + +class TestAttentionOpBS5(TestAttentionLSTMOp): + def set_conf(self): + self.lod = [[3, 2, 4, 7, 5]] + + +if __name__ == '__main__': + unittest.main() From 522b3e411f33400ae2735e81c4bc65ca26438445 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 19:40:59 +0800 Subject: [PATCH 12/21] complete attention lstm op test --- .../tests/unittests/test_attention_lstm_op.py | 55 ++++++++++++++++++- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py index cd555a022b..dea6ec7668 100644 --- a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py @@ -18,6 +18,7 @@ import 
unittest import numpy as np from op_test import OpTest from test_fusion_lstm_op import fc, ACTIVATION +from test_softmax_op import stable_softmax def attention_lstm( @@ -32,8 +33,56 @@ def attention_lstm( act_gate, act_cell, act_cand): - hidden - cell + + T = sum(lod[0]) + N = len(lod[0]) + M = x.shape[1] + D = b.shape[1] / 4 + assert T == x.shape[0] + assert len(fcws) == len(fcbs) + + hidden = [] + cell = [] + + start_offset = 0 + for bid in range(N): + seq_len = lod[0][bid] + xi = np.copy(x[start_offset:seq_len, :]).reshape(seq_len, M) + prev_cell = np.copy(c0[bid]).reshape([1, D]) + prev_hidden = np.copy(h0[bid]).reshape([1, D]) + for step in range(seq_len): + expanded_cell = np.repeat(prev_cell, seq_len, axis=0) + tmp = np.concatenate((xi, expanded_cell), axis=1) + assert tmp.shape[1] == M + D + for fcid in range(len(fcbs)): + tmp = fc(tmp, fcws[fcid], fcbs[fcid]) + tmp = ACTIVATION['relu'](tmp) + tmp = np.reshape(tmp, (1, seq_len)) + tmp = stable_softmax(tmp).reshape(seq_len, 1) + lstmx = xi * tmp # seq * M + lstmx = np.sum(lstmx.reshape(seq_len, M), axis=0).reshape([1, M]) + lstmin = np.concatenate((prev_hidden, lstmx), axis=1) + lstmout = np.dot(lstmin, w).reshape([1, 4 * D]) + + g_f, g_i, g_o, cand = np.split(lstmout, 4, axis=1) + g_f = act_gate(g_f).reshape([1, D]) + g_i = act_gate(g_i).reshape([1, D]) + g_o = act_gate(g_o).reshape([1, D]) + cand = act_cand(cand).reshape([1, D]) + + cell_t = (prev_cell * g_f) + (g_i * cand) + hidden_t = g_o * act_cell(cell_t) + + hidden.append(hidden_t.flatten()) + cell.append(cell_t.flatten()) + + prev_cell = cell_t.reshape([1, D]) + prev_hidden = hidden_t.reshape([1, D]) + + start_offset += seq_len + + hidden = np.array(hidden).astype('float32').reshape([T, D]) + cell = np.array(cell).astype('float32').reshape([T, D]) return hidden, cell @@ -73,7 +122,7 @@ class TestAttentionLSTMOp(OpTest): b = np.random.normal(size=(1, self.D * 4)).astype('float32') h, c = attention_lstm(x, self.lod, h0, c0, [fcw1, fcw2], [fcb1, fcb2], - ACTIVATION[self.act_gate], + w, b, ACTIVATION[self.act_gate], ACTIVATION[self.act_cell], ACTIVATION[self.act_cand]) From dd938d0b948cca5f968411704a023efc8b2971f4 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 22 Aug 2018 22:00:45 +0800 Subject: [PATCH 13/21] fix bugs and pass op test --- paddle/fluid/operators/attention_lstm_op.cc | 36 +++++++++---------- .../tests/unittests/test_attention_lstm_op.py | 9 ++--- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 14985a3f74..5d57703c0b 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -59,10 +59,8 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { auto b_dims = ctx->GetInputDim("LSTMBias"); PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(LSTMBias)'s rank must be 2."); - PADDLE_ENFORCE_EQ(b_dims[0], 1, "LSTMBias dims should be 1 x (%d + %d).", M, - D); - PADDLE_ENFORCE_EQ(b_dims[1], M + D, "LSTMBias dims should be 1 x (%d + %d).", - M, D); + PADDLE_ENFORCE_EQ(b_dims[0], 1, "LSTMBias dims should be 1 x %d.", 4 * D); + PADDLE_ENFORCE_EQ(b_dims[1], 4 * D, "LSTMBias dims should be 1 x %d.", 4 * D); auto c_dims = ctx->GetInputDim("C0"); PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2."); @@ -148,8 +146,8 @@ void AttentionLSTMOpMaker::Make() { "(Tensor) the weights of attention fc. Always relu the fc result." 
"The shape is ((M+D) x 1), where M is the dim size of x, D is the " "gate size of LSTM."); - AddInput("AttentionBias, optional", - "(Tensor) the bias of attention fc." + AddInput("AttentionBias", + "(Tensor, optional) the bias of attention fc." "The shape is (1 x 1)") .AsDispensable(); AddInput("AttentionScalar", @@ -281,7 +279,7 @@ class AttentionLSTMKernel : public framework::OpKernel { auto* atten_w = ctx.Input("AttentionWeight"); // (M+D) x 1 auto* atten_b = ctx.Input("AttentionBias"); // 1x1 auto* atten_scalar = ctx.Input("AttentionScalar"); // 1x1 - auto* atten_scalar_bias = ctx.Input("AttentionScalar"); // 1x1 + auto* atten_scalar_bias = ctx.Input("AttentionScalarBias"); // 1x1 auto* lstm_w = ctx.Input("LSTMWeight"); // (D+M) x D*4 auto* lstm_b = ctx.Input("LSTMBias"); // 1 x D*4 @@ -319,7 +317,7 @@ class AttentionLSTMKernel : public framework::OpKernel { // } const T* x_data = x->data(); - const T* h0_data = h0->data(); + const T* h0_data = h0 ? h0->data() : NULL; const T* c0_data = c0->data(); const T* lstm_w_data = lstm_w->data(); const T* lstm_b_data = lstm_b->data(); @@ -341,36 +339,35 @@ class AttentionLSTMKernel : public framework::OpKernel { math::FCCompute(blas, total_T, 1, M, x_data, atten_w_data, atted_x_data, atten_b_data); + const T* cur_atten_x_data = atted_x_data; const T* cur_x_data = x_data; const T* prev_cell_data = NULL; const T* prev_hidden_data = NULL; T* cur_cell_out_data = cell_out_data; T* cur_hidden_out_data = hidden_out_data; for (int i = 0; i < N; ++i) { - int seq_len = x_lod[0][i + 1]; + int seq_len = x_lod[0][i + 1] - x_lod[0][i]; prev_cell_data = c0_data + i * D; - prev_hidden_data = h0 ? h0_data + i * D : NULL; - + prev_hidden_data = h0_data ? h0_data + i * D : NULL; for (int step = 0; step < seq_len; ++step) { - /// compute attention vector - // prev_cell(1xD) * fc(D) rest part of atten_wgt - // T = cblas_dot(); + /// 1. compute attention vector + // 1a. prev_cell(1xD) * fc(D) rest part of atten_wgt T prev_cell_bias = blas.DOT(D, prev_cell_data, atten_w_data + M); - // add cell bias and relu - bias_relu(seq_len, atted_x_data, &prev_cell_bias, fc_out_data); - // fc2: scalar + // 1b. add cell bias and relu + bias_relu(seq_len, cur_atten_x_data, &prev_cell_bias, fc_out_data); + // 1c. fc scalar if (atten_scalar_data) { - // x = a*x blas.SCAL(seq_len, *atten_scalar_data, fc_out_data); bias_relu(seq_len, fc_out_data, atten_scalar_bias_data, fc_out_data); } + // 1d. softmax vec_softmax(blas, seq_len, fc_out_data, fc_out_data); // mul x(seq_len*M) and sum pool math::FCCompute(blas, 1, M, seq_len, fc_out_data, cur_x_data, lstm_x_data); - /// compute LSTM step + /// 2. 
compute LSTM step // lstm weight : concat[forget , input , output , tilde] // shape : (D + M) x (4 * D) // fc inputX(1xM) * weightX(M*(4D)) => 1 x 4D @@ -407,6 +404,7 @@ class AttentionLSTMKernel : public framework::OpKernel { cur_hidden_out_data = cur_hidden_out_data + D; } cur_x_data = cur_x_data + seq_len * M; + cur_atten_x_data = cur_atten_x_data + seq_len; } } }; diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py index dea6ec7668..cb02c7e586 100644 --- a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py @@ -40,19 +40,20 @@ def attention_lstm( D = b.shape[1] / 4 assert T == x.shape[0] assert len(fcws) == len(fcbs) - hidden = [] cell = [] start_offset = 0 for bid in range(N): seq_len = lod[0][bid] - xi = np.copy(x[start_offset:seq_len, :]).reshape(seq_len, M) + xi = np.copy(x[start_offset:start_offset + seq_len, :]).reshape(seq_len, + M) prev_cell = np.copy(c0[bid]).reshape([1, D]) prev_hidden = np.copy(h0[bid]).reshape([1, D]) for step in range(seq_len): expanded_cell = np.repeat(prev_cell, seq_len, axis=0) tmp = np.concatenate((xi, expanded_cell), axis=1) + assert tmp.shape[0] == seq_len assert tmp.shape[1] == M + D for fcid in range(len(fcbs)): tmp = fc(tmp, fcws[fcid], fcbs[fcid]) @@ -62,7 +63,7 @@ def attention_lstm( lstmx = xi * tmp # seq * M lstmx = np.sum(lstmx.reshape(seq_len, M), axis=0).reshape([1, M]) lstmin = np.concatenate((prev_hidden, lstmx), axis=1) - lstmout = np.dot(lstmin, w).reshape([1, 4 * D]) + lstmout = fc(lstmin, w, b).reshape([1, 4 * D]) g_f, g_i, g_o, cand = np.split(lstmout, 4, axis=1) g_f = act_gate(g_f).reshape([1, D]) @@ -88,7 +89,7 @@ def attention_lstm( class TestAttentionLSTMOp(OpTest): def set_conf(self): - self.lod = [[3]] + pass def setUp(self): self.op_type = 'attention_lstm' From ba168bd2d23f763f1b4c6357943da01890fc6421 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 22 Aug 2018 12:14:26 +0000 Subject: [PATCH 14/21] modify API.spec --- paddle/fluid/API.spec | 1 + paddle/fluid/operators/stack_op.h | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 9250cde1b2..c03df86e0f 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -162,6 +162,7 @@ paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)) +paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) diff --git 
a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index b139f48d87..c777d5feae 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -154,17 +154,22 @@ class StackKernel : public framework::OpKernel { if (std::is_same::value || n > kMaxThreshold) { #ifdef __NVCC__ + VLOG(10) << "Stack more than " << kMaxThreshold + << " tensors on GPU may be slow."; thrust::device_vector device_x_vec(x_datas); auto x_data_arr = device_x_vec.data().get(); #else auto x_data_arr = x_datas.data(); #endif StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); +#ifdef __NVCC__ + // Wait() must be called because device_x_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#endif } #ifdef __NVCC__ else { // NOLINT - VLOG(10) << "Stack more than " << kMaxThreshold - << " tensors on GPU may be slow."; framework::Array x_data_arr; for (int i = 0; i < n; ++i) x_data_arr[i] = x_datas[i]; StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); @@ -243,6 +248,8 @@ class StackGradKernel : public framework::OpKernel { if (std::is_same::value || n > kMaxThreshold) { #ifdef __NVCC__ + VLOG(10) << "Stack more than " << kMaxThreshold + << " tensors on GPU may be slow."; thrust::device_vector device_dx_vec(dx_datas); auto dx_data_arr = device_dx_vec.data().get(); #else @@ -250,11 +257,14 @@ class StackGradKernel : public framework::OpKernel { #endif StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post); +#ifdef __NVCC__ + // Wait() must be called because device_dx_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#endif } #ifdef __NVCC__ else { // NOLINT - VLOG(10) << "Stack more than " << kMaxThreshold - << " tensors on GPU may be slow."; framework::Array dx_data_arr; for (int i = 0; i < n; ++i) dx_data_arr[i] = dx_datas[i]; StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, From 5ca0bb9aadd50b10dc0e20bbc528604b8937e2c1 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 23 Aug 2018 00:01:45 +0800 Subject: [PATCH 15/21] support more activation type and remove some comments --- paddle/fluid/operators/attention_lstm_op.cc | 57 ++++++++++--------- paddle/fluid/operators/math/cpu_vec.h | 26 ++++++++- .../tests/unittests/test_attention_lstm_op.py | 9 +++ 3 files changed, 63 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 5d57703c0b..1cb65346ee 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" +#include #include #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" @@ -192,24 +193,23 @@ void AttentionLSTMOpMaker::Make() { "(Tensor) the output of LSTM X(1*(D+M))* weight((D+M)*4D) for each step." 
"Shape is (1 x 4D), where M is the x frame size") .AsIntermediate(); - // TODO(TJ): InEnum({"sigmoid", "tanh", "relu", "identity"}); AddAttr("gate_activation", "(string, default: sigmoid)" "The activation for input gate, forget gate and output " "gate, `sigmoid` by default.") .SetDefault("sigmoid") - .InEnum({"sigmoid"}); + .InEnum({"sigmoid", "tanh", "relu", "identity"}); AddAttr("cell_activation", "(string, default: tanh)" "The activation for cell output, `tanh` by defalut.") .SetDefault("tanh") - .InEnum({"tanh"}); + .InEnum({"sigmoid", "tanh", "relu", "identity"}); AddAttr("candidate_activation", "(string, default: tanh)" "The activation for candidate hidden state, " "`tanh` by default.") .SetDefault("tanh") - .InEnum({"tanh"}); + .InEnum({"sigmoid", "tanh", "relu", "identity"}); AddComment(R"DOC( Attention Long-Short Term Memory (LSTM) Operator. @@ -273,22 +273,23 @@ class AttentionLSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { using DeviceContext = paddle::platform::CPUDeviceContext; - auto* x = ctx.Input("X"); // T x M - auto* h0 = ctx.Input("H0"); // N x D - auto* c0 = ctx.Input("C0"); // N x D - auto* atten_w = ctx.Input("AttentionWeight"); // (M+D) x 1 - auto* atten_b = ctx.Input("AttentionBias"); // 1x1 - auto* atten_scalar = ctx.Input("AttentionScalar"); // 1x1 - auto* atten_scalar_bias = ctx.Input("AttentionScalarBias"); // 1x1 - auto* lstm_w = ctx.Input("LSTMWeight"); // (D+M) x D*4 - auto* lstm_b = ctx.Input("LSTMBias"); // 1 x D*4 - - auto* hidden_out = ctx.Output("Hidden"); // TxD - auto* cell_out = ctx.Output("Cell"); // TxD - auto* atted_x = ctx.Output("AttentionedX"); // T x 1 - auto* fc_out = ctx.Output("AttentionFCOut"); // max_seq_len x 1 - auto* lstm_x = ctx.Output("LSTMX"); // 1 x M - auto* lstm_out = ctx.Output("LSTMOUT"); // 1 x 4D + + auto* x = ctx.Input("X"); + auto* h0 = ctx.Input("H0"); + auto* c0 = ctx.Input("C0"); + auto* atten_w = ctx.Input("AttentionWeight"); + auto* atten_b = ctx.Input("AttentionBias"); + auto* atten_scalar = ctx.Input("AttentionScalar"); + auto* atten_scalar_bias = ctx.Input("AttentionScalarBias"); + auto* lstm_w = ctx.Input("LSTMWeight"); + auto* lstm_b = ctx.Input("LSTMBias"); + + auto* hidden_out = ctx.Output("Hidden"); + auto* cell_out = ctx.Output("Cell"); + auto* atted_x = ctx.Output("AttentionedX"); + auto* fc_out = ctx.Output("AttentionFCOut"); + auto* lstm_x = ctx.Output("LSTMX"); + auto* lstm_out = ctx.Output("LSTMOUT"); // some shape should be reshape here since infershape can not get lod info auto x_lod = x->lod(); @@ -310,11 +311,11 @@ class AttentionLSTMKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D); fc_out->Resize({max_seq_len, 1}); - // TODO(TJ): act functor init here - // if (platform::jit::MayIUse(platform::jit::avx2)) { - // } else if (platform::jit::MayIUse(platform::jit::avx)) { - // } else { - // } + math::VecActivations act_functor; + std::function act_gate, act_cell, act_cand; + act_gate = act_functor(ctx.Attr("gate_activation")); + act_cell = act_functor(ctx.Attr("cell_activation")); + act_cand = act_functor(ctx.Attr("candidate_activation")); const T* x_data = x->data(); const T* h0_data = h0 ? 
h0->data() : NULL; @@ -381,9 +382,9 @@ class AttentionLSTMKernel : public framework::OpKernel { blas.VADD(D4, lstm_b_data, lstm_out_data, lstm_out_data); // gate act: sigmoid - math::vec_sigmoid(D3, lstm_out_data, lstm_out_data); + act_gate(D3, lstm_out_data, lstm_out_data); // candicate act: tanh - math::vec_tanh(D, lstm_out_data + D3, lstm_out_data + D3); + act_cand(D, lstm_out_data + D3, lstm_out_data + D3); // a = forget * prev_cell blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data); @@ -395,7 +396,7 @@ class AttentionLSTMKernel : public framework::OpKernel { blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data); // state act tanh(cell_out) * output_gate - math::vec_tanh(D, cur_cell_out_data, lstm_out_data); + act_cell(D, cur_cell_out_data, lstm_out_data); blas.VMUL(D, lstm_out_data, lstm_out_data + D2, cur_hidden_out_data); prev_hidden_data = cur_hidden_out_data; diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 29476fce70..48c0da0e36 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - +#include #include "paddle/fluid/platform/cpu_info.h" namespace paddle { @@ -34,6 +34,12 @@ inline T tanh(T x) { return 2. * sigmoid(2. * x) - 1.; } +template +inline void vec_identity(const int n, const T* x, T* y) { + // do nothing + return; +} + template inline void vec_sigmoid(const int n, const T* x, T* y) { const T min = SIGMOID_THRESHOLD_MIN; @@ -76,6 +82,24 @@ inline void vec_relu(const int n, const float* x, } } +template +class VecActivations { + public: + std::function operator()( + const std::string& type) { + if (type == "sigmoid") { + return vec_sigmoid; + } else if (type == "relu") { + return vec_relu; + } else if (type == "tanh") { + return vec_tanh; + } else if (type == "identity" || type == "") { + return vec_identity; + } + PADDLE_THROW("Not support type %s.", type); + } +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py index cb02c7e586..a7382c2244 100644 --- a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py @@ -160,6 +160,15 @@ class TestAttentionOpNonInit(TestAttentionLSTMOp): self.has_initial_hidden = False +class TestAttentionOpAct(TestAttentionLSTMOp): + def set_conf(self): + self.M = 3 + self.D = 2 + self.act_gate = 'relu' + self.act_cell = 'tanh' + self.act_cand = 'sigmoid' + + class TestAttentionOpMD1(TestAttentionLSTMOp): def set_conf(self): self.M = 36 From 4e538db14d56af761d8adb8936a7f4f7435b7187 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 23 Aug 2018 00:04:03 +0800 Subject: [PATCH 16/21] refine jit space --- paddle/fluid/platform/cpu_info.cc | 13 ++++++++++--- paddle/fluid/platform/cpu_info.h | 3 --- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 79a924434b..fcd658d67c 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -103,9 +103,8 @@ size_t CUDAPinnedMaxChunkSize() { return CUDAPinnedMaxAllocSize() / 256; } -#ifdef PADDLE_WITH_XBYAK namespace jit { - +#ifdef PADDLE_WITH_XBYAK static Xbyak::util::Cpu cpu; bool MayIUse(const cpu_isa_t cpu_isa) { using 
namespace Xbyak::util; // NOLINT @@ -136,8 +135,16 @@ bool MayIUse(const cpu_isa_t cpu_isa) { } return false; } +#else +bool MayIUse(const cpu_isa_t cpu_isa) { + if (cpu_isa == isa_any) { + return true; + } else { + return false; + } +} +#endif } // namespace jit -#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 2baa21c1bd..5d17978dd7 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -37,9 +37,7 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); -#ifdef PADDLE_WITH_XBYAK namespace jit { - typedef enum { isa_any, sse42, @@ -56,7 +54,6 @@ typedef enum { inline bool MayIUse(const cpu_isa_t cpu_isa); } // namespace jit -#endif } // namespace platform } // namespace paddle From b8da70c37098beff9b5ccf3b13ac4eb6091e0f3f Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Thu, 23 Aug 2018 13:47:16 +0800 Subject: [PATCH 17/21] Resovle multi gpu async deps (#12828) * dist transpiler add control dependency var between send and recv * fix async deps * follow comments and refine * fix deps connect for rpc ops --- .../details/multi_devices_graph_pass.cc | 26 ++++++++++++++++--- paddle/fluid/framework/ir/node.cc | 2 +- paddle/fluid/framework/ir/node.h | 2 +- paddle/fluid/pybind/const_value.cc | 5 +++- python/paddle/fluid/framework.py | 6 +++++ .../fluid/transpiler/distribute_transpiler.py | 18 +++++++++++-- 6 files changed, 50 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index c5a13e7e1f..bc61b0eacb 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -763,6 +763,8 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, // Create RPC related op handles that connects its in ops and out ops. void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { + // FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode + // put them into transpiler. int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. @@ -771,26 +773,42 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by // split_byref op - // so that we can balance the variable blocks to all the pserver - // instances. 
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce && node->inputs[0]->Name().find(".block") == std::string::npos) { std::vector input_var_names; for (ir::Node *n : node->inputs) { input_var_names.push_back(n->Name()); } - op_dev_id = GetAppropriateDeviceID(input_var_names); + auto send_param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U); + op_dev_id = GetAppropriateDeviceID({send_param_grad[1]}); + VLOG(10) << "send grad " << input_var_names[0] << " origin " + << send_param_grad[1] << " place: " << op_dev_id; for (auto &varname : input_var_names) { result->Get(kShardedVarDevice) .emplace(varname, op_dev_id); } + result->Get(kShardedVarDevice) + .emplace(send_param_grad[1], op_dev_id); } } else if (node->Op()->Type() == "recv") { std::vector output_var_names; for (ir::Node *n : node->outputs) { output_var_names.push_back(n->Name()); } - op_dev_id = GetAppropriateDeviceID(output_var_names); + auto recv_param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + // FIXME(typhoonzero): assume each recv op output one param + // Use the same place as send. + if (recv_param_grad.size() == 2U) { + op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]); + VLOG(10) << "recv param " << recv_param_grad[0] + << " get grad place: " << recv_param_grad[1] + << " place: " << op_dev_id; + } else { + op_dev_id = GetAppropriateDeviceID(output_var_names); + } for (auto &varname : output_var_names) { result->Get(kShardedVarDevice) .emplace(varname, op_dev_id); diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index aca77da8d6..65c45c7d20 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { -const char Node::kControlDepVarName[] = "__control_var"; +constexpr char Node::kControlDepVarName[]; } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 63277d2d01..aab3180e7e 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -27,7 +27,7 @@ namespace ir { class Node { public: enum class Type { kOperation, kVariable }; - static const char kControlDepVarName[]; + static constexpr char kControlDepVarName[] = "__control_var"; explicit Node(const std::string& name, Type type) : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 9094f6051c..a81715c3b3 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/pybind/const_value.h" -#include +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" namespace paddle { @@ -24,6 +25,8 @@ void BindConstValue(pybind11::module* m) { m->def("kTempVarName", [] { return framework::kTempVarName; }); m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); + m->def("kControlDepVarName", + [] { return framework::ir::Node::kControlDepVarName; }); auto op_proto_and_checker_maker = m->def_submodule("op_proto_and_checker_maker"); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 389fce1874..e0ddd3b5ff 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -50,6 +50,12 @@ EMPTY_VAR_NAME = core.kEmptyVarName() TEMP_VAR_NAME = core.kTempVarName() GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() +CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() + + +def generate_control_dev_var_name(): + import random + return CONTROL_DEP_VAR_PREFIX + "@" + str(random.random()) def grad_var_name(var_name): diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 540eb8c833..80d9758b3d 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -212,8 +212,10 @@ class DistributeTranspiler(object): ps_dispatcher = self.config.split_method(self.pserver_endpoints) self.has_distributed_lookup_table = self._has_distributed_lookup_table() self.param_name_to_grad_name = dict() + self.grad_name_to_param_name = dict() for param_var, grad_var in self.params_grads: self.param_name_to_grad_name[param_var.name] = grad_var.name + self.grad_name_to_param_name[grad_var.name] = param_var.name # add distributed attrs to program self.origin_program._is_distributed = True @@ -262,8 +264,10 @@ class DistributeTranspiler(object): AssertionError("Can not insert the send op by original " "variable name :", splited_grad_varname) - dummy_output = program.global_block().create_var() + dummy_output = program.global_block().create_var( + name=framework.generate_control_dev_var_name()) grad_name_to_send_dummy_out[grad_varname] = dummy_output + program.global_block()._insert_op( index=index + 1, type="send", @@ -272,6 +276,8 @@ class DistributeTranspiler(object): attrs={ "epmap": eplist, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + OP_ROLE_VAR_ATTR_NAME: + [self.grad_name_to_param_name[grad_varname], grad_varname], "sync_mode": not self.sync_mode, }) for _, var in enumerate(splited_vars): @@ -313,6 +319,10 @@ class DistributeTranspiler(object): attrs={ "epmap": eps, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + OP_ROLE_VAR_ATTR_NAME: [ + param_varname, + self.param_name_to_grad_name[param_varname] + ], "sync_mode": not self.sync_mode }) @@ -971,7 +981,11 @@ class DistributeTranspiler(object): attrs={ "sync_mode": True, "epmap": pserver_endpoints, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + OP_ROLE_VAR_ATTR_NAME: [ + self.grad_name_to_param_name[table_grad_name], + table_grad_name + ] }) break From 8ad90558047fcd844db270a5744c78bf772242fb Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 23 Aug 2018 14:00:58 +0800 Subject: [PATCH 18/21] Add is_test for while_op (#12874) * add is_test for while_op * Change API --- 
paddle/fluid/API.spec | 2 +- paddle/fluid/operators/while_op.cc | 7 +++++++ python/paddle/fluid/layers/control_flow.py | 7 +++++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 9250cde1b2..bbf1623c39 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -191,7 +191,7 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 48e37796e1..65a3bc928e 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -58,11 +58,15 @@ class WhileOp : public framework::OperatorBase { PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), "Condition of while op must in CPU memory."); + bool is_test = Attr("is_test"); auto ctx = executor.Prepare(*program, block->ID()); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); executor.RunPreparedContext(ctx.get(), ¤t_scope, false); + if (is_test) { + scope.DeleteScope(¤t_scope); + } } } }; @@ -88,6 +92,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "variables generated in the i'th step."); AddAttr(kStepBlock, "The step block inside WhileOp"); + AddAttr("is_test", "True if in test phase.").SetDefault(false); AddComment(R"DOC( )DOC"); } @@ -103,6 +108,8 @@ class WhileGradOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { + PADDLE_ENFORCE(!Attr("is_test"), + "GradOp is only callable when is_test is false"); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 8bfe11916b..d2954c4c22 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -661,6 +661,7 @@ class While(object): Args: cond (Variable): condition used to compare. + is_test(bool): A flag indicating whether execution is in test phase. name (str): The name of this layer. 
Examples: @@ -683,7 +684,7 @@ class While(object): IN_WHILE_BLOCK = 1 AFTER_WHILE_BLOCK = 2 - def __init__(self, cond, name=None): + def __init__(self, cond, is_test=False, name=None): self.helper = LayerHelper("while", name=name) self.status = While.BEFORE_WHILE_BLOCK if not isinstance(cond, Variable): @@ -694,6 +695,7 @@ class While(object): if reduce(lambda a, b: a * b, cond.shape, 1) != 1: raise TypeError("condition should be a bool scalar") self.cond_var = cond + self.is_test = is_test def block(self): return WhileGuard(self) @@ -735,7 +737,8 @@ class While(object): }, outputs={'Out': out_vars, 'StepScopes': [step_scope]}, - attrs={'sub_block': while_block}) + attrs={'sub_block': while_block, + "is_test": self.is_test}) def lod_rank_table(x, level=0): From 9c7fde45a7fec127e3f7dc7e1c161ec647e5683b Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 23 Aug 2018 13:32:02 +0800 Subject: [PATCH 19/21] enhance test_analyzer to profile ditu inference demo --- .../ir/graph_pattern_detecter_tester.cc | 4 +- paddle/fluid/framework/selected_rows.cc | 4 +- .../inference/analysis/analyzer_tester.cc | 48 +++++++++++-------- paddle/fluid/operators/sampling_id_op.h | 2 +- paddle/scripts/paddle_build.sh | 2 - 5 files changed, 32 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc index 993c885a81..06f9df5546 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc @@ -163,8 +163,8 @@ TEST(GraphPatternDetecter, MultiSubgraph) { // 3. Detect op2 -> var2 -> op4 // 4. Detect op2 -> var3 -> op5 // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2 - ASSERT_GE(count, 1UL); - ASSERT_LE(count, 2UL); + ASSERT_GE(count, 1); + ASSERT_LE(count, 2); } } // namespace ir diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index c202b0a5be..a4319ffabb 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -139,7 +139,7 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) { } auto write_iter = id_to_index_.find(key); if (write_iter == id_to_index_.end()) { - size_t row_num = rows_.size(); + int row_num = rows_.size(); if (row_num == value_->dims()[0]) { rwlock_->UNLock(); PADDLE_THROW("selected rows is full, then length exceed %d", row_num); @@ -182,7 +182,7 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0], "output tensor should have the same shape with table " "except the dims[0]."); - for (size_t i = 0; i < ids.numel(); ++i) { + for (int i = 0; i < ids.numel(); ++i) { int64_t index = AutoGrownIndex(ids.data()[i], auto_grown); framework::VisitDataType( framework::ToDataType(value_->type()), diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 52f5c4f5ae..baa7600283 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -23,6 +23,8 @@ DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN"); DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN"); +DEFINE_int32(batch_size, 10, "batch size."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); namespace paddle { namespace inference { @@ -92,7 +94,7 @@ struct DataRecord { size_t 
batch_iter{0}; size_t batch_size{1}; DataRecord() = default; - DataRecord(const std::string &path, int batch_size = 1) + explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { Load(path); } @@ -165,7 +167,6 @@ struct DataRecord { }; void PrepareInputs(std::vector *input_slots, DataRecord *data, int batch_size) { - // DataRecord data(FLAGS_datapath, batch_size); PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, week_tensor, minute_tensor; lod_attention_tensor.name = "data_lod_attention"; @@ -174,28 +175,33 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, week_tensor.name = "week"; minute_tensor.name = "minute"; auto one_batch = data->NextBatch(); - // clang-format off - std::vector rnn_link_data_shape - ({static_cast(one_batch.rnn_link_data.size()), static_cast(one_batch.rnn_link_data.front().size())}); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); lod_attention_tensor.shape.assign({1, 2}); lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2}); init_zero_tensor.shape.assign({batch_size, 15}); init_zero_tensor.lod.assign({one_batch.lod3}); lod_tensor_tensor.shape = rnn_link_data_shape; lod_tensor_tensor.lod.assign({one_batch.lod1}); - week_tensor.shape.assign({(int) one_batch.rnn_week_datas.size(), (int) one_batch.rnn_week_datas.front().size()}); + // clang-format off + week_tensor.shape.assign( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); week_tensor.lod.assign({one_batch.lod3}); - minute_tensor.shape.assign({(int) one_batch.rnn_minute_datas.size(), - (int) one_batch.rnn_minute_datas.front().size()}); + minute_tensor.shape.assign( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); minute_tensor.lod.assign({one_batch.lod3}); + // clang-format on // assign data - TensorAssignData(&lod_attention_tensor, std::vector>({{0, 0}})); + TensorAssignData(&lod_attention_tensor, + std::vector>({{0, 0}})); std::vector tmp_zeros(batch_size * 15, 0.); TensorAssignData(&init_zero_tensor, {tmp_zeros}); TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data); TensorAssignData(&week_tensor, one_batch.rnn_week_datas); TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas); - // clang-format on // Set inputs. 
auto init_zero_tensor1 = init_zero_tensor; init_zero_tensor1.name = "hidden_init"; @@ -231,12 +237,9 @@ std::string DescribeTensor(const PaddleTensor &tensor) { os << "\n"; os << " - data: "; - // clang-format off - int dim = std::accumulate(tensor.shape.begin(), - tensor.shape.end(), - 1, - [](int a, int b) { return a * b; }); // clang-format on - for (size_t i = 0; i < dim; i++) { + int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1, + [](int a, int b) { return a * b; }); + for (int i = 0; i < dim; i++) { os << static_cast(tensor.data.data())[i] << " "; } os << '\n'; @@ -300,13 +303,16 @@ void TestDituRNNPrediction(const std::string &model_path, for (int i = 0; i < num_times; i++) { predictor->Run(input_slots, &outputs); } - LOG(INFO) << "time/batch: " << timer.toc() / num_times; + LOG(INFO) << "===========profile result==========="; + LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times + << ", latency: " << timer.toc() / num_times << "ms"; + LOG(INFO) << "====================================="; for (auto &out : outputs) { size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, [](int a, int b) { return a * b; }); float *data = static_cast(out.data.data()); - for (int i = 0; + for (size_t i = 0; i < std::min(sizeof(ditu_rnn_target_data) / sizeof(float), size); i++) { EXPECT_NEAR(data[i], ditu_rnn_target_data[i], 1e-3); @@ -336,7 +342,7 @@ TEST(Analyzer, SupportIRPass) { // Directly infer with the original model. TEST(Analyzer, DituRNN_without_analysis) { TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, - 10, false, false); + FLAGS_batch_size, false, false, FLAGS_repeat); } // Inference with the original model with the analysis turned on, the analysis @@ -344,14 +350,14 @@ TEST(Analyzer, DituRNN_without_analysis) { TEST(Analyzer, DituRNN_with_analysis) { LOG(INFO) << "ditu rnn with analysis"; TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, - 10, true, false, 1); + FLAGS_batch_size, true, false, FLAGS_repeat); } // Inference with analysis and IR. The IR module will fuse some large kernels. 
TEST(Analyzer, DituRNN_with_analysis_with_IR) { LOG(INFO) << "ditu rnn with analysis and IR fuse"; TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, - 10, true, true, 1); + FLAGS_batch_size, true, true, FLAGS_repeat); } } // namespace analysis diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h index f730a9746d..e1dd4539b3 100644 --- a/paddle/fluid/operators/sampling_id_op.h +++ b/paddle/fluid/operators/sampling_id_op.h @@ -54,7 +54,7 @@ class SamplingIdKernel : public framework::OpKernel { static_cast(context.Attr("max"))); std::vector ids(batch_size); - for (size_t i = 0; i < batch_size; ++i) { + for (int i = 0; i < batch_size; ++i) { T r = dist(engine); int idx = width - 1; for (int j = 0; j < width; ++j) { diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8460f93b84..f2a9a6b3b9 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -116,7 +116,6 @@ function cmake_gen() { -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} - -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} -DPY_VERSION=${PY_VERSION:-2.7} ======================================== EOF @@ -146,7 +145,6 @@ EOF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ - -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} \ -DPY_VERSION=${PY_VERSION:-2.7} } From acdd95d5caf92f38a995bc6d2edf20a56520d799 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 23 Aug 2018 16:47:12 +0800 Subject: [PATCH 20/21] bug fix --- paddle/fluid/operators/sampling_id_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h index f730a9746d..3f7860e1fa 100644 --- a/paddle/fluid/operators/sampling_id_op.h +++ b/paddle/fluid/operators/sampling_id_op.h @@ -63,7 +63,7 @@ class SamplingIdKernel : public framework::OpKernel { break; } } - ids[i] = ins_vector[i * width + idx]; + ids[i] = ins_vector[idx]; } std::vector out_dim; From 405d6d09e1b2199711818540cfcfb87494999852 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 23 Aug 2018 19:26:28 +0800 Subject: [PATCH 21/21] Fix doc typo. (#12863) --- doc/fluid/dev/new_op_cn.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md index c00f73be95..ff7408111f 100644 --- a/doc/fluid/dev/new_op_cn.md +++ b/doc/fluid/dev/new_op_cn.md @@ -36,19 +36,19 @@ OpProtoMake定义 -`.cc`文件,Backward Op不需要定义OpProtoMake +.cc 文件,Backward Op不需要定义OpProtoMake Op定义 - `.cc`文件 + .cc 文件 Kernel实现 - CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 + CPU、CUDA共享Kernel实现在.h 文件中,否则,CPU 实现在.cc 文件中,CUDA 实现在.cu 文件中。 注册Op - Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 + Op注册实现在.cc 文件;Kernel注册CPU实现在.cc 文件中,CUDA实现在.cu 文件中 @@ -391,7 +391,7 @@ PADDLE_ENFORCE(ctx->HasInput("X"), ""); ``` 问题示例2 :提示信息过于简单 ``` -PADDLE_ENFORCE(i != nullptr, "I must be set"); // I是什么? +PADDLE_ENFORCE(i != nullptr, "i must be set"); // i是什么? ``` 2. 在报错信息中使用开发人员定义的变量缩写,不易理解!
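[Reviewer note on PATCH 18/21, the while_op `is_test` flag] At the Python layer the new flag is just an extra constructor argument on `fluid.layers.While`; when it is set, WhileOp deletes each step scope as soon as the step finishes instead of keeping all step scopes for the backward pass, and WhileGradOp refuses to run. Below is a minimal usage sketch assembled from the standard counter-loop pattern; treat it as an illustration of the new argument rather than a snippet taken from the patch.

```python
import paddle.fluid as fluid

i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
limit = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)

cond = fluid.layers.less_than(x=i, y=limit)
# is_test=True makes WhileOp drop every step scope right after the step runs;
# use it only for inference, since the gradient op is disabled when it is set.
while_op = fluid.layers.While(cond=cond, is_test=True)
with while_op.block():
    i = fluid.layers.increment(x=i, in_place=True)
    fluid.layers.less_than(x=i, y=limit, cond=cond)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
exe.run(fluid.default_main_program())
```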