From 2139b9f6773b6370e7c48d66e8897d259130e06e Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 1 Nov 2018 02:12:08 +0000 Subject: [PATCH 1/4] add jit gencode --- paddle/fluid/operators/math/CMakeLists.txt | 4 +- paddle/fluid/operators/math/jit_gen.cc | 90 ++++++++++++++++++++++ paddle/fluid/operators/math/jit_gen.h | 80 +++++++++++++++++++ paddle/fluid/operators/math/jit_kernel.h | 1 + 4 files changed, 173 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/operators/math/jit_gen.cc create mode 100644 paddle/fluid/operators/math/jit_gen.h diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 17b675fba8..d24b6fc6a2 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -76,6 +76,6 @@ endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) cc_library(jit_kernel - SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc - DEPS cpu_info cblas) + SRCS jit_kernel.cc jit_gen.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc + DEPS cpu_info cblas gflags) cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/jit_gen.cc b/paddle/fluid/operators/math/jit_gen.cc new file mode 100644 index 0000000000..6af39518ed --- /dev/null +++ b/paddle/fluid/operators/math/jit_gen.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/jit_gen.h" +#include +#include +#include +#include "paddle/fluid/platform/cpu_info.h" + +DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { +namespace gen { + +constexpr Xbyak::Operand::Code g_abi_regs[] = { + Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12, + Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15}; + +constexpr int num_g_abi_regs = sizeof(g_abi_regs) / sizeof(g_abi_regs[0]); + +void JitCode::preCode() { + for (int i = 0; i < num_g_abi_regs; ++i) { + push(Xbyak::Reg64(g_abi_regs[i])); + } + if (platform::jit::MayIUse(platform::jit::avx512f)) { + mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); + } +} + +void JitCode::postCode() { + for (int i = 0; i < num_g_abi_regs; ++i) { + pop(Xbyak::Reg64(g_abi_regs[num_g_abi_regs - 1 - i])); + } + ret(); +} + +void JitCode::dumpCode(const Xbyak::uint8 *code) const { + if (code) { + static int counter = 0; + std::ostringstream filename; + filename << "paddle_jitcode_" << name() << "." << counter << ".bin"; + counter++; + std::ofstream fout(filename.str(), std::ios::out); + if (fout.is_open()) { + fout.write(reinterpret_cast(code), getSize()); + fout.close(); + } + } +} + +Xbyak::Address JitCode::EVEX_compress_addr(Xbyak::Reg64 base, int offt, + bool bcast) { + int scale = 0; + if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) { + offt = offt - 2 * EVEX_max_8b_offt; + scale = 1; + } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) { + offt = offt - 4 * EVEX_max_8b_offt; + scale = 2; + } + auto re = Xbyak::RegExp() + base + offt; + if (scale) { + re = re + reg_EVEX_max_8b_offt * scale; + } + if (bcast) { + return zword_b[re]; + } else { + return zword[re]; + } +} + +} // namespace gen +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_gen.h b/paddle/fluid/operators/math/jit_gen.h new file mode 100644 index 0000000000..6abf3434cc --- /dev/null +++ b/paddle/fluid/operators/math/jit_gen.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/platform/macros.h" + +#define XBYAK_USE_MMAP_ALLOCATOR +#include "xbyak/xbyak.h" +#include "xbyak/xbyak_util.h" + +DECLARE_bool(dump_jitcode); + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { +namespace gen { + +#define DECLARE_JIT_CODE(codename) \ + const char *name() const override { return #codename; } + +// Application Binary Interface +constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI), + abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX), + abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX); + +class JitCode : public Xbyak::CodeGenerator { + public: + explicit JitCode(size_t code_size = 256 * 1024, void *code_ptr = nullptr) + : Xbyak::CodeGenerator(code_size, code_ptr) {} + + virtual ~JitCode() {} + virtual const char *name() const = 0; + virtual void generate() = 0; + + template + const FUNC getCode() { + this->generate(); + const Xbyak::uint8 *code = CodeGenerator::getCode(); + if (FLAGS_dump_jitcode) { + this->dumpCode(code); + } + return reinterpret_cast(code); + } + DISABLE_COPY_AND_ASSIGN(JitCode); + + protected: + Xbyak::Reg64 param1{abi_param1}; + const int EVEX_max_8b_offt = 0x200; + const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp; + + void preCode(); + void postCode(); + void dumpCode(const Xbyak::uint8 *code) const; + void L(const char *label) { Xbyak::CodeGenerator::L(label); } + void L(const Xbyak::Label &label) { Xbyak::CodeGenerator::L(label); } + // Enhanced vector extension + Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt, + bool bcast = false); +}; + +} // namespace gen +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 48e180b1fd..dff05ae6f6 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -40,6 +40,7 @@ class Kernel { Kernel() = default; virtual ~Kernel() = default; int num_{0}; + // TODO(TJ): below two should be reomved. int end_{0}; int rest_{0}; DISABLE_COPY_AND_ASSIGN(Kernel); From a53b1b0b1b8751839c7d34da7883bc31abe8c0a8 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 1 Nov 2018 02:13:04 +0000 Subject: [PATCH 2/4] refine and init jitkernel vmul --- paddle/fluid/operators/math/CMakeLists.txt | 2 +- paddle/fluid/operators/math/jit_kernel.h | 4 +- .../fluid/operators/math/jit_kernel_blas.cc | 141 +++++++++++------- .../operators/math/jit_kernel_crf_decode.cc | 2 +- paddle/fluid/operators/math/jit_kernel_exp.cc | 6 +- .../fluid/operators/math/jit_kernel_macro.h | 125 ++++++++++++---- paddle/fluid/operators/math/jit_kernel_rnn.cc | 40 ++--- .../fluid/operators/math/jit_kernel_test.cc | 14 +- 8 files changed, 215 insertions(+), 119 deletions(-) diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index d24b6fc6a2..7f79974248 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -77,5 +77,5 @@ cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) cc_library(jit_kernel SRCS jit_kernel.cc jit_gen.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc - DEPS cpu_info cblas gflags) + DEPS cpu_info cblas gflags enforce) cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index dff05ae6f6..7b6027aa26 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -39,8 +39,8 @@ class Kernel { public: Kernel() = default; virtual ~Kernel() = default; + // TODO(TJ): below members should be deprecated. int num_{0}; - // TODO(TJ): below two should be reomved. int end_{0}; int rest_{0}; DISABLE_COPY_AND_ASSIGN(Kernel); @@ -65,7 +65,7 @@ class KernelPool { template class VMulKernel : public Kernel { public: - virtual void Compute(const T *x, const T *y, T *z) const = 0; + void (*Compute)(const T *, const T *, T *, int); }; template diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index c88b17b012..7f92043b6f 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -14,7 +14,10 @@ limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" #include +#include "paddle/fluid/operators/math/jit_gen.h" #include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/platform/enforce.h" + #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" #endif @@ -28,64 +31,97 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; +namespace jit = platform::jit; // remove me + +using namespace platform::jit; // NOLINT /* VMUL JitKernel */ -template -class VMulKernelImpl : public VMulKernel { - public: - explicit VMulKernelImpl(int d) : VMulKernel() { this->num_ = d; } - void Compute(const T* x, const T* y, T* z) const override { - for (int i = 0; i < this->num_; ++i) { - z[i] = x[i] * y[i]; +struct VMulJitCode : public gen::JitCode { + DECLARE_JIT_CODE(VMulJitCode); + explicit VMulJitCode(size_t code_size = 256 * 1024, void* code_ptr = nullptr) + : gen::JitCode(code_size, code_ptr) {} + static bool init(int d) { + if (MayIUse(avx) || MayIUse(avx2)) { + return d % AVX_FLOAT_BLOCK == 0; + } else if (MayIUse(avx512f)) { + return d % AVX512_FLOAT_BLOCK == 0; + } else { + return false; } } + void generate() override { + preCode(); + postCode(); + } }; -#ifdef PADDLE_WITH_MKLML -#define MKL_FLOAT(isa, block) \ - template <> \ - void VMulKernelImpl::Compute( \ - const float* x, const float* y, float* z) const { \ - platform::dynload::vsMul(this->num_, x, y, z); \ +template +void VMulRefer(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; } +} -#define MKL_DOUBLE(isa, block) \ - template <> \ - void VMulKernelImpl::Compute( \ - const double* x, const double* y, double* z) const { \ - platform::dynload::vdMul(this->num_, x, y, z); \ - } - -FOR_EACH_ISA(MKL_FLOAT, kGT16); -FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#ifdef PADDLE_WITH_MKLML +template +void VMulMKL(const T* x, const T* y, T* z, int n); + +template <> +void VMulMKL(const float* x, const float* y, float* z, int n) { + platform::dynload::vsMul(n, x, y, z); +} +template <> +void VMulMKL(const double* x, const double* y, double* z, int n) { + platform::dynload::vdMul(n, x, y, z); +} #endif -#define INTRI8_FLOAT(isa) \ - template <> \ - void VMulKernelImpl::Compute( \ - const float* x, const float* y, float* z) const { \ - __m256 tmpx, tmpy; \ - tmpx = _mm256_loadu_ps(x); \ - tmpy = _mm256_loadu_ps(y); \ - tmpx = _mm256_mul_ps(tmpx, tmpy); \ - _mm256_storeu_ps(z, tmpx); \ +template +class VMulKernelImpl : public VMulKernel { + public: + static inline std::string name(int d) { + PADDLE_THROW("DType should be either float or double"); } - -// avx > for > mkl -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -#endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f); + static inline bool useJIT(int d) { return false; } + static inline bool useMKL(int d) { return false; } + + explicit VMulKernelImpl(int d) : VMulKernel() { + if (useJIT(d)) { + constexpr size_t sz = 256 * 1024; // TODO(TJ): should be related with d + jitcode_.reset(new VMulJitCode(sz)); + this->Compute = + jitcode_->getCode(); + return; + } +#ifdef PADDLE_WITH_MKLML + if (useMKL(d)) { + this->Compute = VMulMKL; + return; + } #endif -// TODO(TJ): eq16 test and complete avx512 -#undef INTRI8_FLOAT -#undef MKL_FLOAT -#undef MKL_DOUBLE + this->Compute = VMulRefer; + } + + private: + std::unique_ptr jitcode_{nullptr}; +}; + +template <> +bool VMulKernelImpl::useJIT(int d) { + return VMulJitCode::init(d); +} + +template <> +bool VMulKernelImpl::useMKL(int d) { + return jit::MayIUse(jit::avx512f) && d > 512; +} + +template <> +bool VMulKernelImpl::useMKL(int d) { + return true; +} + +REGISTER_JITKERNEL(vmul, VMulKernel); /* VADD JitKernel */ template @@ -465,13 +501,12 @@ INTRI_COMMON_FLOAT(jit::avx512f, kGT16); #undef INTRI16_FLOAT #undef INTRI_COMMON_FLOAT -REGISTER_JITKERNEL(vmul, VMulKernel); -REGISTER_JITKERNEL(vadd, VAddKernel); -REGISTER_JITKERNEL(vscal, VScalKernel); -REGISTER_JITKERNEL(vaddb, VAddBiasKernel); -REGISTER_JITKERNEL(vrelu, VReluKernel); -REGISTER_JITKERNEL(vaddrelu, VAddReluKernel); -REGISTER_JITKERNEL(videntity, VIdentityKernel); +REGISTER_JITKERNEL_DEPRECATED(vadd, VAddKernel); +REGISTER_JITKERNEL_DEPRECATED(vscal, VScalKernel); +REGISTER_JITKERNEL_DEPRECATED(vaddb, VAddBiasKernel); +REGISTER_JITKERNEL_DEPRECATED(vrelu, VReluKernel); +REGISTER_JITKERNEL_DEPRECATED(vaddrelu, VAddReluKernel); +REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel); } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc index e481d1921a..a4861c347e 100644 --- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc +++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc @@ -288,7 +288,7 @@ INTRIAVX512_FLOAT(kGT16); #undef INIT_ALPHA #undef UPDATE_ALPHA -REGISTER_JITKERNEL(crf_decode, CRFDecodeKernel); +REGISTER_JITKERNEL_DEPRECATED(crf_decode, CRFDecodeKernel); } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index c4247580f4..d7c177e678 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -250,7 +250,7 @@ INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2); #undef MKL_FLOAT #undef MKL_DOUBLE -REGISTER_JITKERNEL(vexp, VExpKernel); +REGISTER_JITKERNEL_DEPRECATED(vexp, VExpKernel); /* VSigmoid JitKernel */ template @@ -396,7 +396,7 @@ INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2); #undef INTRI_GT16_FLOAT #undef INTRI_VSIGMOID -REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel); +REGISTER_JITKERNEL_DEPRECATED(vsigmoid, VSigmoidKernel); /* VTanh JitKernel */ template @@ -531,7 +531,7 @@ INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2); #undef INTRI_GT16_FLOAT #undef INTRI_VTANH -REGISTER_JITKERNEL(vtanh, VTanhKernel); +REGISTER_JITKERNEL_DEPRECATED(vtanh, VTanhKernel); #undef JITKERNEL_NEW_ACT_IMPL diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h index d8e55f2673..a8169ea48a 100644 --- a/paddle/fluid/operators/math/jit_kernel_macro.h +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -21,8 +21,71 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; +#define JITKERNEL_DEFINE_NAME(ker_key, ker_class) \ + template <> \ + std::string ker_class##Impl::name(int d) { \ + std::string key(#ker_key "f"); \ + if (useJIT(d)) { \ + /* only jit code need record d*/ \ + return key + "jit" + std::to_string(d); \ + } else if (useMKL(d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } \ + template <> \ + std::string ker_class##Impl::name(int d) { \ + std::string key(#ker_key "d"); \ + /* jit code do not support double yet*/ \ + if (useMKL(d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } + +#define JITKERNEL_DECLARE(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, int>(int d) + +#define JITKERNEL_FIND_KEY(ker_class, ker_dtype) \ + std::string key = ker_class##Impl::name(d) + +#define JITKERNEL_IMPL(ker_class, ker_dtype) \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(d)) + +#define REGISTER_JITKERNEL_WITH_DTYPE(ker_class, ker_dtype, marco_declare, \ + macro_find_key, macro_impl) \ + marco_declare(ker_class, ker_dtype) { \ + macro_find_key(ker_class, ker_dtype); \ + if (kers_.find(key) == kers_.end()) { \ + std::shared_ptr> p; \ + macro_impl(ker_class, ker_dtype); \ + kers_.insert({key, std::dynamic_pointer_cast(p)}); \ + return p; \ + } \ + return std::dynamic_pointer_cast>( \ + kers_.at(key)); \ + } +#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_define_name, \ + marco_declare, macro_find_key, macro_impl) \ + marco_define_name(ker_key, ker_class); \ + REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, JITKERNEL_DECLARE, \ + JITKERNEL_FIND_KEY, JITKERNEL_IMPL); \ + REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, JITKERNEL_DECLARE, \ + JITKERNEL_FIND_KEY, JITKERNEL_IMPL) + +#define REGISTER_JITKERNEL(ker_key, ker_class) \ + REGISTER_JITKERNEL_ARGS(ker_key, ker_class, JITKERNEL_DEFINE_NAME, \ + JITKERNEL_DECLARE, JITKERNEL_FIND_KEY, \ + JITKERNEL_IMPL) + +namespace jit = platform::jit; +// TODO(TJ): below defines are deprecated, would be remove recently #define SEARCH_BLOCK(macro_, ker, dtype, isa) \ if (d < AVX_FLOAT_BLOCK) { \ macro_(ker, dtype, isa, kLT8); \ @@ -47,44 +110,42 @@ namespace jit = platform::jit; SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \ } -#define JITKERNEL_DECLARE(ker_class, ker_dtype) \ - template <> \ - std::shared_ptr> \ - KernelPool::Get, int>(int d) - #define JITKERNEL_KEY(ker_key, dtype_key) \ #ker_key #dtype_key + std::to_string(d) -#define JITKERNEL_NEW_IMPL(ker, dtype, isa, k) \ - p = std::dynamic_pointer_cast>( \ +#define JITKERNEL_NEW_IMPL_DEPRECATED(ker, dtype, isa, k) \ + p = std::dynamic_pointer_cast>( \ std::make_shared>(d)) -#define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key, \ - marco_declare, macro_key, macro_impl) \ - marco_declare(ker_class, ker_dtype) { \ - std::string key = macro_key(ker_key, dtype_key); \ - if (kers_.find(key) == kers_.end()) { \ - std::shared_ptr> p; \ - SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype); \ - kers_.insert({key, std::dynamic_pointer_cast(p)}); \ - return p; \ - } \ - return std::dynamic_pointer_cast>( \ - kers_.at(key)); \ +#define JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, ker_dtype, \ + dtype_key, marco_declare, macro_key, \ + macro_impl) \ + marco_declare(ker_class, ker_dtype) { \ + std::string key = macro_key(ker_key, dtype_key); \ + if (kers_.find(key) == kers_.end()) { \ + std::shared_ptr> p; \ + SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype); \ + kers_.insert({key, std::dynamic_pointer_cast(p)}); \ + return p; \ + } \ + return std::dynamic_pointer_cast>( \ + kers_.at(key)); \ } -#define REGISTER_JITKERNEL(ker_key, ker_class) \ - JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, JITKERNEL_DECLARE, \ - JITKERNEL_KEY, JITKERNEL_NEW_IMPL); \ - JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, JITKERNEL_DECLARE, \ - JITKERNEL_KEY, JITKERNEL_NEW_IMPL) - -#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_declare, macro_key, \ - macro_impl) \ - JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, marco_declare, macro_key, \ - macro_impl); \ - JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, marco_declare, \ - macro_key, macro_impl) +#define REGISTER_JITKERNEL_DEPRECATED(ker_key, ker_class) \ + JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, float, f, \ + JITKERNEL_DECLARE, JITKERNEL_KEY, \ + JITKERNEL_NEW_IMPL_DEPRECATED); \ + JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, double, d, \ + JITKERNEL_DECLARE, JITKERNEL_KEY, \ + JITKERNEL_NEW_IMPL_DEPRECATED) + +#define REGISTER_JITKERNEL_ARGS_DEPRECATED(ker_key, ker_class, marco_declare, \ + macro_key, macro_impl) \ + JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, float, f, marco_declare, \ + macro_key, macro_impl); \ + JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, double, d, \ + marco_declare, macro_key, macro_impl) #define FOR_EACH_ISA(macro_, block) \ macro_(jit::avx512f, block); \ diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index fab293f7d0..d0932a37bb 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -179,23 +179,23 @@ class LSTMKernelImpl : public LSTMKernel { /* C_t = C_t-1 * fgated + cand_gated * igated */ act_cand_d_->Compute(gates, gates); - vmul_d_->Compute(gates, gates + d_, gates + d_); - vmul_d_->Compute(ct_1, gates + d2_, gates + d2_); + vmul_d_->Compute(gates, gates + d_, gates + d_, d_); + vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vadd_d_->Compute(gates + d_, gates + d2_, ct); /* H_t = act_cell(C_t) * ogated */ act_cell_d_->Compute(ct, gates + d2_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht); + vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { /* C_t = igated * cgated*/ act_gate_d_->Compute(gates + d_, gates + d_); act_cand_d_->Compute(gates, gates); - vmul_d_->Compute(gates, gates + d_, ct); + vmul_d_->Compute(gates, gates + d_, ct, d_); /* H_t = act_cell(C_t) * ogated */ act_gate_d_->Compute(gates + d3_, gates + d3_); act_cell_d_->Compute(ct, gates + d2_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht); + vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } private: @@ -289,36 +289,36 @@ class PeepholeKernelImpl : public LSTMKernel { void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, T* checked) const override { /* get fgated and igated*/ - vmul_d_->Compute(wp_data, ct_1, checked); - vmul_d_->Compute(wp_data + d_, ct_1, checked + d_); + vmul_d_->Compute(wp_data, ct_1, checked, d_); + vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_); vadd_d2_->Compute(checked, gates + d_, gates + d_); act_gate_d2_->Compute(gates + d_, gates + d_); /* C_t = C_t-1 * fgated + cand_gated * igated*/ act_cand_d_->Compute(gates, gates); - vmul_d_->Compute(gates, gates + d_, gates + d_); - vmul_d_->Compute(ct_1, gates + d2_, gates + d2_); + vmul_d_->Compute(gates, gates + d_, gates + d_, d_); + vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vadd_d_->Compute(gates + d_, gates + d2_, ct); /* get ogated*/ - vmul_d_->Compute(wp_data + d2_, ct, gates + d_); + vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); act_gate_d_->Compute(gates + d3_, gates + d3_); /* H_t = act_cell(C_t) * ogated */ act_cell_d_->Compute(ct, gates + d2_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht); + vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { /* C_t = igated * cgated*/ act_gate_d_->Compute(gates + d_, gates + d_); act_cand_d_->Compute(gates, gates); - vmul_d_->Compute(gates, gates + d_, ct); + vmul_d_->Compute(gates, gates + d_, ct, d_); /* get outgated, put W_oc * C_t on igated */ - vmul_d_->Compute(wp_data + d2_, ct, gates + d_); + vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); /* H_t = act_cell(C_t) * ogated */ act_gate_d_->Compute(gates + d3_, gates + d3_); act_cell_d_->Compute(ct, gates + d2_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht); + vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } private: @@ -352,8 +352,8 @@ class PeepholeKernelImpl : public LSTMKernel { act_cell, d)); \ } -REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM, - JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL); +REGISTER_JITKERNEL_ARGS_DEPRECATED(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM, + JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL); #undef INTRI8_FLOAT #undef JITKERNEL_DECLARE_LSTM @@ -378,13 +378,13 @@ class GRUKernelImpl : public GRUKernel { void ComputeH1(T* gates, T* ht) const override { act_gate_d_->Compute(gates, gates); act_state_d_->Compute(gates + d2_, gates + d2_); - vmul_d_->Compute(gates, gates + d2_, ht); + vmul_d_->Compute(gates, gates + d2_, ht, d_); } void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override { // W: {W_update, W_reset; W_state} act_gate_d2_->Compute(gates, gates); - vmul_d_->Compute(ht_1, gates + d_, ht); + vmul_d_->Compute(ht_1, gates + d_, ht, d_); } void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override { @@ -472,8 +472,8 @@ INTRI8_FLOAT(jit::avx512f); p = std::dynamic_pointer_cast>( \ std::make_shared>(act_gate, act_state, d)); -REGISTER_JITKERNEL_ARGS(gru, GRUKernel, JITKERNEL_DECLARE_GRU, - JITKERNEL_KEY_GRU, JITKERNEL_NEW_GRU_IMPL); +REGISTER_JITKERNEL_ARGS_DEPRECATED(gru, GRUKernel, JITKERNEL_DECLARE_GRU, + JITKERNEL_KEY_GRU, JITKERNEL_NEW_GRU_IMPL); #undef INTRI8_FLOAT #undef JITKERNEL_NEW_GRU_IMPL diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index c9e6ab740d..cf0d6c60d1 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -369,12 +369,12 @@ void lstm_ctht_better( int d2 = d * 2; vsigmoid_3d->Compute(gates + d, gates + d); vtanh_d->Compute(gates, gates); - vmul_d->Compute(gates, gates + d, gates + d); - vmul_d->Compute(ct_1, gates + d2, gates + d2); + vmul_d->Compute(gates, gates + d, gates + d, d); + vmul_d->Compute(ct_1, gates + d2, gates + d2, d); vadd_d->Compute(gates + d, gates + d2, ct); /* H_t = act_cell(C_t) * ogated */ vtanh_d->Compute(ct, gates + d2); - vmul_d->Compute(gates + d2, gates + d * 3, ht); + vmul_d->Compute(gates + d2, gates + d * 3, ht, d); } TEST(JitKernel, lstm) { @@ -578,7 +578,7 @@ void vmul_mkl(const int n, const float* x, const float* y, float* z) { TEST(JitKernel, vmul) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 15, 16, 30, 256, 512}) { + for (int d : {7, 8, 15, 16, 30, 256, 512, 1000, 1024}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); RandomVec(d, x.data()); @@ -616,7 +616,7 @@ TEST(JitKernel, vmul) { auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(x_data, y_data, ztgt_data); + ker->Compute(x_data, y_data, ztgt_data, d); } auto ttgte = GetCurrentUS(); @@ -800,8 +800,8 @@ TEST(JitKernel, pool) { EXPECT_TRUE(std::dynamic_pointer_cast(pvmul_f) != std::dynamic_pointer_cast(pvmul_d)); - const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulf4"); + const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulfany"); EXPECT_EQ(pvmul_f, pvmul_from_key); - const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulf5"); + const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulfjit"); EXPECT_TRUE(pvmul_from_key2 == nullptr); } From a3377f7b0abe3c5678ba12258edfe33a7dcd8600 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 1 Nov 2018 08:05:01 +0000 Subject: [PATCH 3/4] refine jitcode and add vmul jitcode implementation --- paddle/fluid/operators/math/CMakeLists.txt | 2 +- paddle/fluid/operators/math/jit_code.cc | 53 ++++++++++++++++ paddle/fluid/operators/math/jit_code.h | 63 +++++++++++++++++++ .../fluid/operators/math/jit_kernel_blas.cc | 34 ++-------- 4 files changed, 123 insertions(+), 29 deletions(-) create mode 100644 paddle/fluid/operators/math/jit_code.cc create mode 100644 paddle/fluid/operators/math/jit_code.h diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 7f79974248..c1d4cc1b88 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -76,6 +76,6 @@ endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) cc_library(jit_kernel - SRCS jit_kernel.cc jit_gen.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc + SRCS jit_kernel.cc jit_gen.cc jit_code.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc DEPS cpu_info cblas gflags enforce) cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc new file mode 100644 index 0000000000..29a89bca98 --- /dev/null +++ b/paddle/fluid/operators/math/jit_code.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/jit_code.h" +#include "paddle/fluid/operators/math/jit_kernel.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { +namespace gen { + +using namespace platform::jit; // NOLINT + +bool VMulJitCode::init(int d) { + // TODO(TJ): maybe one AVX is enough, AVX above would slow down freq + // try more with avx2 or avx512 + if (MayIUse(avx) || MayIUse(avx2)) { + return d % AVX_FLOAT_BLOCK == 0; + } else { + return false; + } +} + +void VMulJitCode::generate() { + preCode(); + int stride = sizeof(float) * AVX_FLOAT_BLOCK; + for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) { + vmovups(ymm_src1, ptr[param1 + i * stride]); + vmovups(ymm_src2, ptr[param2 + i * stride]); + vmulps(ymm_dst, ymm_src1, ymm_src2); + vmovups(ptr[param3 + stride * i], ymm_dst); + } + postCode(); +} + +} // namespace gen +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h new file mode 100644 index 0000000000..db1a0cd095 --- /dev/null +++ b/paddle/fluid/operators/math/jit_code.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/math/jit_gen.h" + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { +namespace gen { + +using reg64_t = const Xbyak::Reg64; +using reg32_t = const Xbyak::Reg32; +using xmm_t = const Xbyak::Xmm; +using ymm_t = const Xbyak::Ymm; +using zmm_t = const Xbyak::Zmm; +using Label = Xbyak::Label; + +class VMulJitCode : public JitCode { + public: + DECLARE_JIT_CODE(VMulJitCode); + explicit VMulJitCode(int d, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), num_(d) {} + static bool init(int d); + void generate() override; + + private: + int num_; + reg64_t param1{abi_param1}; + reg64_t param2{abi_param2}; + reg64_t param3{abi_param3}; + + xmm_t xmm_src1 = xmm_t(0); + ymm_t ymm_src1 = ymm_t(0); + zmm_t zmm_src1 = zmm_t(0); + xmm_t xmm_src2 = xmm_t(1); + ymm_t ymm_src2 = ymm_t(1); + zmm_t zmm_src2 = zmm_t(1); + + xmm_t xmm_dst = xmm_t(2); + ymm_t ymm_dst = ymm_t(2); + zmm_t zmm_dst = zmm_t(2); +}; + +} // namespace gen +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index 7f92043b6f..cef21348e4 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" #include -#include "paddle/fluid/operators/math/jit_gen.h" +#include "paddle/fluid/operators/math/jit_code.h" #include "paddle/fluid/operators/math/jit_kernel_macro.h" #include "paddle/fluid/platform/enforce.h" @@ -30,30 +30,7 @@ namespace paddle { namespace operators { namespace math { namespace jitkernel { - -namespace jit = platform::jit; // remove me - -using namespace platform::jit; // NOLINT - -/* VMUL JitKernel */ -struct VMulJitCode : public gen::JitCode { - DECLARE_JIT_CODE(VMulJitCode); - explicit VMulJitCode(size_t code_size = 256 * 1024, void* code_ptr = nullptr) - : gen::JitCode(code_size, code_ptr) {} - static bool init(int d) { - if (MayIUse(avx) || MayIUse(avx2)) { - return d % AVX_FLOAT_BLOCK == 0; - } else if (MayIUse(avx512f)) { - return d % AVX512_FLOAT_BLOCK == 0; - } else { - return false; - } - } - void generate() override { - preCode(); - postCode(); - } -}; +namespace jit = platform::jit; template void VMulRefer(const T* x, const T* y, T* z, int n) { @@ -76,6 +53,7 @@ void VMulMKL(const double* x, const double* y, double* z, int n) { } #endif +/* VMUL JitKernel */ template class VMulKernelImpl : public VMulKernel { public: @@ -88,7 +66,7 @@ class VMulKernelImpl : public VMulKernel { explicit VMulKernelImpl(int d) : VMulKernel() { if (useJIT(d)) { constexpr size_t sz = 256 * 1024; // TODO(TJ): should be related with d - jitcode_.reset(new VMulJitCode(sz)); + jitcode_.reset(new gen::VMulJitCode(d, sz)); this->Compute = jitcode_->getCode(); return; @@ -103,12 +81,12 @@ class VMulKernelImpl : public VMulKernel { } private: - std::unique_ptr jitcode_{nullptr}; + std::unique_ptr jitcode_{nullptr}; }; template <> bool VMulKernelImpl::useJIT(int d) { - return VMulJitCode::init(d); + return gen::VMulJitCode::init(d); } template <> From 85bcb286f5645ad81f67a86ada916ed8d0f8931b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 1 Nov 2018 15:19:17 +0000 Subject: [PATCH 4/4] refine vmul jitcode test=develop --- paddle/fluid/operators/math/jit_code.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 29a89bca98..06cf82513d 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -35,7 +35,7 @@ bool VMulJitCode::init(int d) { } void VMulJitCode::generate() { - preCode(); + // do not need push stack, and do not need save avx512reg if do not use avx512 int stride = sizeof(float) * AVX_FLOAT_BLOCK; for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) { vmovups(ymm_src1, ptr[param1 + i * stride]); @@ -43,7 +43,7 @@ void VMulJitCode::generate() { vmulps(ymm_dst, ymm_src1, ymm_src2); vmovups(ptr[param3 + stride * i], ymm_dst); } - postCode(); + ret(); } } // namespace gen