add jit kernel hsum, hmax and softmax refer code

test=develop
revert-15296-async_double_buffered_py_reader
tensor-tang 7 years ago
parent 67e4450c34
commit 8117725852

@@ -158,7 +158,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
 using Tensor = paddle::framework::Tensor;

-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchXYZNKernel() {
   for (int d : TestSizes()) {
     Tensor x, y, z;
@@ -175,7 +175,7 @@ void BenchXYZNKernel() {
   }
 }

-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchAXYNKernel() {
   for (int d : TestSizes()) {
     const T a = static_cast<T>(3);
@@ -190,7 +190,17 @@ void BenchAXYNKernel() {
   }
 }

-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchXRNKernel() {
+  for (int d : TestSizes()) {
+    Tensor x;
+    RandomVec<T>(d, x.mutable_data<T>({d}, PlaceType()));
+    T res;
+    BenchAllImpls<KT, jit::XRNTuples<T>, PlaceType>(d, x.data<T>(), &res, d);
+  }
+}
+
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchXYNKernel() {
   for (int d : TestSizes()) {
     Tensor x, y;
@@ -203,7 +213,7 @@ void BenchXYNKernel() {
   }
 }

-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchLSTMKernel() {
   for (bool use_peephole : {true, false}) {
     for (int d : TestSizes()) {
@@ -240,7 +250,7 @@ void BenchLSTMKernel() {
   }
 }

-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchGRUKernel() {
   for (int d : TestSizes()) {
     const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
@@ -262,7 +272,7 @@ void BenchGRUKernel() {
   }
 }

-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchSeqPoolKernel() {
   std::vector<jit::SeqPoolType> pool_types = {
       jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
@@ -284,7 +294,7 @@ void BenchSeqPoolKernel() {
   }
 }

-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchMatMulKernel() {
   for (int m : {1, 2, 3, 4}) {
     for (int n : TestSizes()) {
@@ -305,57 +315,64 @@ void BenchMatMulKernel() {
   }
 }

+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchSoftmaxKernel() {
+  for (int bs : {1, 2, 10}) {
+    for (int n : TestSizes()) {
+      Tensor x, y;
+      x.Resize({bs, n});
+      y.Resize({bs, n});
+      RandomVec<T>(bs * n, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
+      const T* x_data = x.data<T>();
+      T* y_data = y.mutable_data<T>(PlaceType());
+      BenchAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType>(n, x_data, y_data, n,
+                                                          bs);
+    }
+  }
+}
+
 using T = float;
-using PlaceType = paddle::platform::CPUPlace;
+using CPUPlace = paddle::platform::CPUPlace;

 // xyzn
-BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, PlaceType>(); }
-BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, PlaceType>(); }
-BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, PlaceType>(); }
-BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, PlaceType>(); }
+BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, CPUPlace>(); }

 // axyn
-BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, PlaceType>(); }
-BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, PlaceType>(); }
+BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, CPUPlace>(); }
+
+// xrn
+BENCH_FP32_CPU(kHSum) { BenchXRNKernel<jit::kHSum, T, CPUPlace>(); }
+BENCH_FP32_CPU(kHMax) { BenchXRNKernel<jit::kHMax, T, CPUPlace>(); }

 // xyn
-BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, PlaceType>(); }
-BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, PlaceType>(); }
-BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, PlaceType>(); }
-BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, PlaceType>(); }
-BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, PlaceType>(); }
-BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, PlaceType>(); }
+BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }

 // lstm and peephole
-BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, PlaceType>(); }
-BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, PlaceType>(); }
+BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
+BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, CPUPlace>(); }

 // gru functions
-BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, PlaceType>(); }
-BENCH_FP32_CPU(kGRUHtPart1) {
-  BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
-}
-BENCH_FP32_CPU(kGRUHtPart2) {
-  BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
-}
+BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, CPUPlace>(); }
+BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel<jit::kGRUHtPart1, T, CPUPlace>(); }
+BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel<jit::kGRUHtPart2, T, CPUPlace>(); }

 // seq pool function
-BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>(); }
+BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, CPUPlace>(); }

 // matmul
-BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, PlaceType>(); }
+BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
+
+// softmax
+BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); }

 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]

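For orientation, here is a minimal sketch of what the new kHMax benchmark exercises end to end. It assumes the `jit::Get<KT, Tuples, Place>(attr)` dispatch helper and the `paddle/fluid/operators/jit/kernels.h` header that this benchmark builds on; treat both names as assumptions rather than something this commit adds.

// Hypothetical caller of the new horizontal-max kernel (not in this commit).
// jit::Get is assumed to pick the best available implementation
// (jitcode, mkl or refer) for the given attribute, here the vector length n.
#include "paddle/fluid/operators/jit/kernels.h"

namespace jit = paddle::operators::jit;

float RunHMax(const float* x, int n) {
  auto hmax =
      jit::Get<jit::kHMax, jit::XRNTuples<float>, paddle::platform::CPUPlace>(n);
  float res;
  hmax(x, &res, n);  // XRNTuples func_type: (const T* x, T* res, int n)
  return res;
}
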
@@ -49,6 +49,9 @@ const char* to_string(KernelType kt) {
     ONE_CASE(kNCHW16CMulNC);
     ONE_CASE(kSeqPool);
     ONE_CASE(kMatMul);
+    ONE_CASE(kHMax);
+    ONE_CASE(kHSum);
+    ONE_CASE(kSoftmax);
     default:
       PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
       return "NOT JITKernel";

@@ -20,6 +20,7 @@ namespace paddle {
 namespace operators {
 namespace jit {

+// TODO(TJ): reorder alphabetically
 typedef enum {
   kNone = 0,
   kVMul = 1,
@@ -44,6 +45,9 @@ typedef enum {
   kNCHW16CMulNC,
   kSeqPool,
   kMatMul,
+  kHSum,  // horizontal sum
+  kHMax,  // horizontal max
+  kSoftmax,
 } KernelType;

 typedef enum {
@@ -70,6 +74,10 @@ struct XYNTuples {
   typedef void (*func_type)(const T*, T*, int);
 };

+// x, a scalar result, and an int length
+template <typename T>
+struct XRNTuples : public XYNTuples<T> {};
+
 typedef struct {
   void* gates;  // gates: x_ch, x_ih, x_fh, x_oh
   const void* ct_1;
@@ -159,6 +167,13 @@ struct LayerNormTuples {
                            const float, int);
 };

+template <typename T>
+struct SoftmaxTuples {
+  typedef T data_type;
+  typedef int attr_type;
+  typedef void (*func_type)(const T*, T*, int, int);
+};
+
 // nChw16c = nChw16c .* NC
 template <typename T>
 struct NCHW16CMulNCTuples {

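The tuple structs above only carry typedefs, so a standalone sketch makes the new signatures concrete. This mirrors the definitions added in the hunk above (no Paddle headers needed) and checks that XRN reuses the XYN signature while Softmax takes an extra batch-size argument:

// Signature check for the new tuple types (illustrative, mirrors the diff).
#include <type_traits>

template <typename T>
struct XYNTuples { typedef void (*func_type)(const T*, T*, int); };

// x, a scalar result, and an int length: same signature, different semantics.
template <typename T>
struct XRNTuples : public XYNTuples<T> {};

template <typename T>
struct SoftmaxTuples {
  typedef void (*func_type)(const T*, T*, int, int);  // x, y, n, bs
};

static_assert(std::is_same<XRNTuples<float>::func_type,
                           void (*)(const float*, float*, int)>::value,
              "XRN reuses the XYN function signature");
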
@@ -29,3 +29,6 @@ USE_JITKERNEL_REFER(kNCHW16CMulNC)
 USE_JITKERNEL_REFER(kSeqPool)
 USE_JITKERNEL_REFER(kMatMul)
 USE_JITKERNEL_REFER(kVSquare)
+USE_JITKERNEL_REFER(kHSum)
+USE_JITKERNEL_REFER(kHMax)
+USE_JITKERNEL_REFER(kSoftmax)

@@ -52,4 +52,9 @@ REGISTER_REFER_KERNEL(kSeqPool, SeqPool);
 REGISTER_REFER_KERNEL(kMatMul, MatMul);

+REGISTER_REFER_KERNEL(kHMax, HMax);
+REGISTER_REFER_KERNEL(kHSum, HSum);
+REGISTER_REFER_KERNEL(kSoftmax, Softmax);
+
 #undef REGISTER_REFER_KERNEL

@@ -378,6 +378,40 @@ void MatMul(const T* A, const T* B, T* C, int M, int N, int K) {
   }
 }

+template <typename T>
+void HMax(const T* x, T* res, int n) {
+  res[0] = x[0];
+  for (int i = 1; i < n; ++i) {
+    res[0] = res[0] < x[i] ? x[i] : res[0];
+  }
+}
+
+template <typename T>
+void HSum(const T* x, T* res, int n) {
+  res[0] = x[0];
+  for (int i = 1; i < n; ++i) {
+    res[0] += x[i];
+  }
+}
+
+// y = e^(x - max(x))
+// y = y / sum(y)
+template <typename T>
+void Softmax(const T* x, T* y, int n, int bs = 1) {
+  for (int i = 0; i < bs; ++i) {
+    T scalar;
+    HMax(x, &scalar, n);
+    scalar = static_cast<T>(0) - scalar;
+    VAddBias(&scalar, x, y, n);  // x - max
+    VExp(y, y, n);
+    HSum(y, &scalar, n);
+    scalar = static_cast<T>(1) / scalar;
+    VScal(&scalar, y, y, n);
+    x += n;
+    y += n;
+  }
+}
+
 #define DECLARE_REFER_KERNEL(name, tuples)             \
   template <typename T>                                \
   class name##Kernel : public ReferKernel<tuples<T>> { \
@@ -421,6 +455,11 @@ DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples);
 DECLARE_REFER_KERNEL(MatMul, MatMulTuples);

+DECLARE_REFER_KERNEL(HMax, XRNTuples);
+DECLARE_REFER_KERNEL(HSum, XRNTuples);
+
+DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples);
+
 #undef DECLARE_REFER_KERNEL

 }  // namespace refer

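Since the reference Softmax above composes HMax, VAddBias, VExp, HSum and VScal, a self-contained sketch with those helpers inlined as plain loops shows the same max-shifted algorithm in isolation. This is an illustration only, not Paddle code; the function and variable names are made up here:

// Standalone version of the reference softmax, helpers inlined as loops.
#include <cmath>
#include <cstdio>

void SoftmaxRef(const float* x, float* y, int n, int bs) {
  for (int b = 0; b < bs; ++b) {
    float m = x[0];
    for (int i = 1; i < n; ++i) m = m < x[i] ? x[i] : m;  // HMax
    float s = 0.f;
    for (int i = 0; i < n; ++i) {
      y[i] = std::exp(x[i] - m);  // VAddBias (shift by -max) + VExp
      s += y[i];                  // HSum
    }
    for (int i = 0; i < n; ++i) y[i] /= s;  // VScal by 1/sum
    x += n;  // advance to the next row of the batch
    y += n;
  }
}

int main() {
  const float x[4] = {-2.f, 0.f, 1.f, 2.f};
  float y[4];
  SoftmaxRef(x, y, 4, 1);
  std::printf("sum = %f\n", y[0] + y[1] + y[2] + y[3]);  // expect 1.000000
  return 0;
}

Subtracting the row maximum before exponentiating keeps every argument of exp non-positive, which avoids overflow without changing the result.
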
(File diff suppressed because it is too large.)

@@ -70,6 +70,8 @@ extern void* mklml_dso_handle;
   __macro(cblas_ddot);    \
   __macro(cblas_sasum);   \
   __macro(cblas_dasum);   \
+  __macro(cblas_isamax);  \
+  __macro(cblas_idamax);  \
   __macro(cblas_sscal);   \
   __macro(cblas_dscal);   \
   __macro(vsAdd);         \

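The two newly loaded CBLAS symbols presumably back an MKL path for the horizontal reductions. One caveat worth noting: i?amax returns the index of the element with the largest absolute value, not the largest signed value, so it matches kHMax directly only for non-negative data. A hypothetical use, relying only on the standard CBLAS signature (0-based index):

// Hypothetical HMax built on cblas_isamax (not part of this commit).
// Valid only when all inputs are non-negative, since isamax compares |x[i]|.
#include <mkl_cblas.h>  // assumption: MKL's CBLAS header is available

float HMaxNonNegative(const float* x, int n) {
  const size_t idx = cblas_isamax(n, x, 1);  // unit stride
  return x[idx];
}
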