Add ceil mode pooling for trt (ocr attention)

test=develop
release/1.1
nhzlx 7 years ago
parent fa2ab3346c
commit 2b5edfbc37

@ -85,6 +85,7 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name']
paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))

@ -101,7 +101,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope>>();
var->GetMutable<std::vector<framework::Scope*>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {

@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
// be created.
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
Variable* g_feed_value = scope->Var(var_name);
auto& feed_inputs =
*(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
if (index >= feed_inputs.size()) {
feed_inputs.resize(index + 1);
}

@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope>>();
var->GetMutable<std::vector<framework::Scope *>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {

@ -149,9 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::SetDeviceId(dev_id);
#endif
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
// The profiler has a process-wide mutex, which results in a serious
// performance issue in concurrent scenarios. Use an `if` here to avoid it.
// Please do not remove the `if`; ask @Superjomn if there are any concerns.
if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
} else {
RunImpl(scope, place);
}
VLOG(3) << place << " " << DebugStringEx(&scope);
}

@ -59,6 +59,7 @@ class VarDesc {
public:
// Builds a description for the variable called `name`. The type field is
// initialised to LOD_TENSOR.
explicit VarDesc(const std::string &name) {
desc_.set_name(name);
// TODO(paddle-dev): Why default to lodtensor.
desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR);
}

@ -38,8 +38,12 @@ class Variable {
template <typename T>
T* GetMutable() {
if (!IsType<T>()) {
if (!holder_) {
holder_.reset(new PlaceholderImpl<T>(new T()));
} else {
PADDLE_ENFORCE(IsType<T>(),
"Variable must be type %s, the holding type is %s",
typeid(T).name(), holder_->Type().name());
}
return static_cast<T*>(holder_->Ptr());
}

@ -33,9 +33,10 @@ TEST(Variable, GetMutable) {
const Tensor& tt = v->Get<Tensor>();
EXPECT_EQ(1234, tt.content_);
std::string* s = v->GetMutable<std::string>();
*s = "hello";
const std::string& ss = v->Get<std::string>();
EXPECT_EQ("hello", ss);
try {
v->GetMutable<std::string>();
} catch (std::exception& e) {
return;
}
EXPECT_TRUE(false);
}

@ -340,6 +340,19 @@ bool AnalysisPredictor::LoadProgramDesc() {
}
return true;
}
// Destructor. On non-Windows builds, when FLAGS_profile is set, it calls
// platform::DisableProfiler with "./profile.log". Afterwards it asks the
// parent scope to delete the sub-scope, if one exists.
AnalysisPredictor::~AnalysisPredictor() {
#ifndef _WIN32
  if (FLAGS_profile) {
    platform::DisableProfiler(platform::EventSortingKey::kTotal,
                              "./profile.log");
  }
#endif
  // Nothing left to release when no sub-scope exists.
  if (!sub_scope_) {
    return;
  }
  scope_->DeleteScope(sub_scope_);
}
std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
auto *x = new AnalysisPredictor(config_);
x->Init(scope_, inference_program_);

@ -72,6 +72,7 @@ class AnalysisPredictor : public PaddlePredictor {
template <typename T>
void GetFetchOne(const framework::LoDTensor &fetchs,
PaddleTensor *output_data);
~AnalysisPredictor();
private:
contrib::AnalysisConfig config_;

@ -42,16 +42,22 @@ class Pool2dOpConverter : public OpConverter {
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
bool ceil_mode = boost::get<bool>(op_desc.GetAttr("ceil_mode"));
nvinfer1::Dims input_shape = input1->getDimensions();
int nbDims = input_shape.nbDims;
nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
if (global_pooling == true) {
nvinfer1::Dims input_shape = input1->getDimensions();
int nbDims = input_shape.nbDims;
nv_ksize.d[0] = input_shape.d[nbDims - 2];
nv_ksize.d[1] = input_shape.d[nbDims - 1];
nv_strides.h() = 1;
nv_strides.w() = 1;
nv_paddings.h() = 0;
nv_paddings.w() = 0;
}
const nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL);
@ -64,6 +70,36 @@ class Pool2dOpConverter : public OpConverter {
PADDLE_THROW("TensorRT unsupported pooling type!");
}
if (ceil_mode) {
nvinfer1::DimsHW pre_pad(0, 0);
nvinfer1::DimsHW post_pad(0, 0);
int input_height = input_shape.d[nbDims - 2];
int input_width = input_shape.d[nbDims - 1];
int floor_h_output_size =
(input_height - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
int ceil_h_output_size =
(input_height - ksize[0] + 2 * paddings[0] + strides[0] - 1) /
strides[0] +
1;
int floor_w_output_size =
(input_width - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
int ceil_w_output_size =
(input_width - ksize[1] + 2 * paddings[1] + strides[1] - 1) /
strides[1] +
1;
if (floor_h_output_size != ceil_h_output_size) {
post_pad.h() = strides[0] - 1;
}
if (floor_w_output_size != ceil_w_output_size) {
post_pad.w() = strides[1] - 1;
}
auto* layer = TRT_ENGINE_ADD_LAYER(
engine_, Padding, *const_cast<nvinfer1::ITensor*>(input1), pre_pad,
post_pad);
input1 = layer->getOutput(0);
}
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling,
*const_cast<nvinfer1::ITensor*>(input1),
nv_pool_type, nv_ksize);

@ -20,18 +20,20 @@ namespace paddle {
namespace inference {
namespace tensorrt {
void test_pool2d(bool global_pooling) {
void test_pool2d(bool global_pooling, bool ceil_mode) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
TRTConvertValidation validator(5, parameters, scope, 1 << 15);
// The ITensor's Dims should not contain the batch size.
// So, the ITensor's Dims of input and output should be C * H * W.
validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4));
validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 13, 14));
if (global_pooling)
validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1));
else if (ceil_mode)
validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 7));
else
validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2));
validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 6));
// Prepare Op description
framework::OpDesc desc;
@ -39,7 +41,7 @@ void test_pool2d(bool global_pooling) {
desc.SetInput("X", {"pool2d-X"});
desc.SetOutput("Out", {"pool2d-Out"});
std::vector<int> ksize({2, 2});
std::vector<int> ksize({3, 3});
std::vector<int> strides({2, 2});
std::vector<int> paddings({0, 0});
std::string pooling_t = "max";
@ -49,6 +51,7 @@ void test_pool2d(bool global_pooling) {
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("global_pooling", global_pooling);
desc.SetAttr("ceil_mode", ceil_mode);
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
@ -57,9 +60,10 @@ void test_pool2d(bool global_pooling) {
validator.Execute(3);
}
TEST(Pool2dOpConverter, normal) { test_pool2d(false); }
TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); }
TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); }
TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true); }
TEST(Pool2dOpConverter, test_ceil_mode) { test_pool2d(false, true); }
} // namespace tensorrt
} // namespace inference

@ -300,7 +300,7 @@ op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op)
op_library(fake_quantize_op DEPS memory)
op_library(fusion_lstm_op DEPS cpu_lstm_compute)
op_library(fusion_lstm_op DEPS jit_kernel)
if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col)
op_library(layer_norm_op DEPS cub)

File diff suppressed because it is too large Load Diff

@ -45,8 +45,6 @@ math_library(im2col)
if (NOT WIN32) # windows do not support avx functions yet.
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
# TODO(TJ): ugly workaround, clean me
cc_library(cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info)
endif (NOT WIN32)
cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
@ -76,3 +74,7 @@ if(WITH_GPU)
endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
cc_library(jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc
DEPS cpu_info cblas activation_functions)
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)

@ -1,43 +0,0 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
namespace paddle {
namespace operators {
namespace math {
#ifdef __AVX__
// AVX specialization for float. Works on one 8-lane vector per gate.
template <>
void lstm_compute_ctht<float>(float* gates, const float* ct_1, float* ct,
                              float* ht) {
  namespace act = detail::forward::avx;
  // gates: W_ch, W_ih, W_fh, W_oh -- four consecutive groups of 8 floats.
  // All four groups are loaded before anything is stored.
  const __m256 cand_raw = _mm256_loadu_ps(gates);
  const __m256 input_raw = _mm256_loadu_ps(gates + 8);
  const __m256 forget_raw = _mm256_loadu_ps(gates + 16);
  const __m256 output_raw = _mm256_loadu_ps(gates + 24);

  /* C_t = C_t-1 * fgated + cand_gated * igated*/
  const __m256 cand_term =
      _mm256_mul_ps(act::Tanh(cand_raw), act::Sigmoid(input_raw));
  const __m256 prev_cell = _mm256_loadu_ps(ct_1);
  const __m256 keep_term = _mm256_mul_ps(prev_cell, act::Sigmoid(forget_raw));
  const __m256 cell = _mm256_add_ps(cand_term, keep_term);
  _mm256_storeu_ps(ct, cell);

  /* H_t = act_cell(C_t) * ogated */
  const __m256 hidden =
      _mm256_mul_ps(act::Tanh(cell), act::Sigmoid(output_raw));
  _mm256_storeu_ps(ht, hidden);
}
#endif
} // namespace math
} // namespace operators
} // namespace paddle

@ -1,64 +0,0 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle {
namespace operators {
namespace math {
// TODO(TJ): ugly workaround, clean me
// Generic fallback: computes C_t and H_t for one block of 8 values.
// `gates` is activated in place and must hold 32 values.
template <typename T>
void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) {
  // gates: W_ch, W_ih, W_fh, W_oh
  // Sigmoid over the last three groups (24 values), tanh over the first 8.
  vec_sigmoid<T, platform::jit::avx>(24, gates + 8, gates + 8);
  vec_tanh<T, platform::jit::avx>(8, gates, gates);

  const T* cand = gates;
  const T* in_gate = gates + 8;
  const T* forget_gate = gates + 16;
  const T* out_gate = gates + 24;
  const T lower = SIGMOID_THRESHOLD_MIN;
  const T upper = SIGMOID_THRESHOLD_MAX;

  for (int k = 0; k < 8; ++k) {
    // C_t = C_t-1 * fgated + cand_gated * igated
    ct[k] = ct_1[k] * forget_gate[k] + cand[k] * in_gate[k];

    // H_t = act_cell(C_t) * ogated
    // act_cell is evaluated as 2 / (1 + exp(-2 * C_t)) - 1, with 2 * C_t
    // clamped to [lower, upper] before the exponential.
    T scaled = ct[k] * 2;
    if (scaled < lower) {
      scaled = lower;
    } else if (scaled > upper) {
      scaled = upper;
    }
    T expo = static_cast<T>(0) - scaled;
    vec_exp<T>(1, &expo, &expo);
    const T cell_act =
        static_cast<T>(2) / (static_cast<T>(1) + expo) - static_cast<T>(1);
    ht[k] = cell_act * out_gate[k];
  }
}
#ifdef __AVX__
namespace detail {
namespace forward {
namespace avx {
__m256 Sigmoid(const __m256 a);
__m256 Tanh(const __m256 a);
} // namespace avx
} // namespace forward
} // namespace detail
template <>
void lstm_compute_ctht<float>(float* gates, const float* ct_1, float* ct,
float* ht);
#endif
} // namespace math
} // namespace operators
} // namespace paddle

@ -125,10 +125,8 @@ inline void vec_scal<float, platform::jit::avx2>(const int n, const float a,
}
template <>
inline void vec_scal<float, platform::jit::avx512_common>(const int n,
const float a,
const float* x,
float* y) {
inline void vec_scal<float, platform::jit::avx512f>(const int n, const float a,
const float* x, float* y) {
// TODO(TJ): enable me
vec_scal<float, platform::jit::avx2>(n, a, x, y);
}
@ -181,10 +179,10 @@ inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
}
template <>
inline void vec_bias_sub<float, platform::jit::avx512_common>(const int n,
const float a,
const float* x,
float* y) {
inline void vec_bias_sub<float, platform::jit::avx512f>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_bias_sub<float, platform::jit::avx2>(n, a, x, y);
}
@ -242,7 +240,7 @@ inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
}
template <>
inline void vec_cross<float, platform::jit::avx512_common>(
inline void vec_cross<float, platform::jit::avx512f>(
const int n, const float* x, const float* y, const float* z, float* out) {
// TODO(TJ): enable me
vec_cross<float, platform::jit::avx>(n, x, y, z, out);
@ -296,10 +294,10 @@ inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
}
template <>
inline void vec_add_bias<float, platform::jit::avx512_common>(const int n,
const float a,
const float* x,
float* y) {
inline void vec_add_bias<float, platform::jit::avx512f>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_add_bias<float, platform::jit::avx2>(n, a, x, y);
}
@ -390,9 +388,9 @@ inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x,
}
template <>
inline void vec_sigmoid<float, platform::jit::avx512_common>(const int n,
const float* x,
float* y) {
inline void vec_sigmoid<float, platform::jit::avx512f>(const int n,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_sigmoid<float, platform::jit::avx2>(n, x, y);
}
@ -454,9 +452,8 @@ inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
}
template <>
inline void vec_relu<float, platform::jit::avx512_common>(const int n,
const float* x,
float* y) {
inline void vec_relu<float, platform::jit::avx512f>(const int n, const float* x,
float* y) {
// TODO(TJ): enable me
vec_relu<float, platform::jit::avx2>(n, x, y);
}

@ -110,7 +110,7 @@ TEST(CpuVecTest, sigmoid) {
TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
TestAndBench<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
TestAndBench<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
TestAndBench<float>(sz, vec_sigmoid<float, jit::avx512_common>,
TestAndBench<float>(sz, vec_sigmoid<float, jit::avx512f>,
ref_sigmoid<float>);
}
TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
@ -123,8 +123,7 @@ TEST(CpuVecTest, tanh) {
TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>);
TestAndBench<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
TestAndBench<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
TestAndBench<float>(sz, vec_tanh<float, jit::avx512_common>,
ref_tanh<float>);
TestAndBench<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>);
}
TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>);
}
@ -136,8 +135,7 @@ TEST(CpuVecTest, relu) {
TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>);
TestAndBench<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
TestAndBench<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
TestAndBench<float>(sz, vec_relu<float, jit::avx512_common>,
ref_relu<float>);
TestAndBench<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>);
}
TestAndBench<double>(30, vec_relu<double>, ref_relu<double>);
}
@ -170,7 +168,7 @@ TEST(CpuVecTest, inplace_sigmoid) {
TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
TestInplace<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
TestInplace<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
TestInplace<float>(sz, vec_sigmoid<float, jit::avx512_common>,
TestInplace<float>(sz, vec_sigmoid<float, jit::avx512f>,
ref_sigmoid<float>);
}
TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
@ -183,8 +181,7 @@ TEST(CpuVecTest, inplace_tanh) {
TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>);
TestInplace<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
TestInplace<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
TestInplace<float>(sz, vec_tanh<float, jit::avx512_common>,
ref_tanh<float>);
TestInplace<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>);
}
TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>);
}
@ -196,8 +193,7 @@ TEST(CpuVecTest, inplace_relu) {
TestInplace<float>(sz, vec_relu<float>, ref_relu<float>);
TestInplace<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
TestInplace<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
TestInplace<float>(sz, vec_relu<float, jit::avx512_common>,
ref_relu<float>);
TestInplace<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>);
}
TestInplace<double>(30, vec_relu<double>, ref_relu<double>);
}

@ -0,0 +1,41 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <iostream>
#include <string>
namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {
namespace jit = platform::jit;
// Returns the kernel pool. The pool object is a function-local
// `static thread_local`, so every thread gets (and lazily constructs) its own
// instance; kernels cached by one thread are not visible to another.
KernelPool& KernelPool::Instance() {
static thread_local KernelPool g_jit_kernels;
return g_jit_kernels;
}
// Looks up an already-cached kernel by its string key.
//
// Returns the stored kernel, or nullptr when nothing is cached under `key`.
// The pool is never modified by this call.
std::shared_ptr<const Kernel> KernelPool::Get(const std::string& key) const {
  // One hash lookup: reuse the iterator rather than find() followed by at().
  const auto it = kers_.find(key);
  if (it == kers_.end()) {
    return nullptr;
  }
  return it->second;
}
} // namespace jitkernel
} // namespace math
} // namespace operators
} // namespace paddle

@ -0,0 +1,142 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <memory> // for shared_ptr
#include <string>
#include <unordered_map>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/macros.h"
// Note: Only supported on CPU for now.
namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define AVX_FLOAT_BLOCK 8
#define AVX2_FLOAT_BLOCK 8
#define AVX512_FLOAT_BLOCK 16
typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block;
// Polymorphic base class of every kernel held by KernelPool. It has a virtual
// destructor so kernels can be owned through shared_ptr<const Kernel>.
class Kernel {
public:
Kernel() = default;
virtual ~Kernel() = default;
// NOTE(review): the meaning of these three fields is not visible in this
// header. Presumably num_ is the element count and end_/rest_ describe the
// vectorised range and the leftover tail -- confirm against the
// implementations before relying on it.
int num_{0};
int end_{0};
int rest_{0};
DISABLE_COPY_AND_ASSIGN(Kernel);
};
// Cache of kernel objects keyed by string. The constructor is private, so the
// only way to obtain a pool is through Instance().
class KernelPool {
public:
// Accessor for the pool object (defined out of line).
static KernelPool &Instance();
// Typed lookup. Only declared here; no generic definition is visible in this
// header. NOTE(review): presumably each kernel type supplies an explicit
// specialization -- confirm.
template <typename Ker, typename... ARGS>
std::shared_ptr<const Ker> Get(ARGS... args);
// Untyped lookup by key (defined out of line).
std::shared_ptr<const Kernel> Get(const std::string &key) const;
private:
KernelPool() = default;
// key -> cached kernel.
std::unordered_map<std::string, std::shared_ptr<const Kernel>> kers_;
DISABLE_COPY_AND_ASSIGN(KernelPool);
};
// Interface for a kernel taking two input arrays (x, y) and one output array
// (z). NOTE(review): presumably an element-wise multiply, z = x * y -- confirm
// against the implementation.
template <typename T>
class VMulKernel : public Kernel {
public:
virtual void Compute(const T *x, const T *y, T *z) const = 0;
};
// Interface for a kernel taking two input arrays (x, y) and one output array
// (z). NOTE(review): presumably an element-wise add, z = x + y -- confirm
// against the implementation.
template <typename T>
class VAddKernel : public Kernel {
public:
virtual void Compute(const T *x, const T *y, T *z) const = 0;
};
// Interface for a kernel parameterised by a scalar `a`.
// NOTE(review): presumably scales by `a` -- confirm against the
// implementation.
template <typename T>
class VScalKernel : public Kernel {
public:
// Out-of-place form: reads x, writes y.
virtual void Compute(const T a, const T *x, T *y) const = 0;
// In-place form: x is both input and output.
virtual void Compute(const T a, T *x) const = 0;
};
// Interface for a kernel taking a scalar `a`, an input array x and an output
// array y. NOTE(review): presumably y = x + a -- confirm against the
// implementation.
template <typename T>
class VAddBiasKernel : public Kernel {
public:
virtual void Compute(const T a, const T *x, T *y) const = 0;
};
// Common interface of the activation kernels (relu, identity, exp, sigmoid,
// tanh derive from it): one input array x, one output array y.
template <typename T>
class VActKernel : public Kernel {
public:
virtual void Compute(const T *x, T *y) const = 0;
};
// Activation-kernel interface for ReLU. Kept abstract; concrete kernels
// implement Compute.
template <typename T>
class VReluKernel : public VActKernel<T> {
 public:
  // Re-declared pure so the type stays abstract. `override` makes the compiler
  // reject any drift from VActKernel<T>::Compute's signature.
  virtual void Compute(const T *x, T *y) const override = 0;
};
// Activation-kernel interface for the identity activation. Kept abstract;
// concrete kernels implement Compute.
template <typename T>
class VIdentityKernel : public VActKernel<T> {
 public:
  // Re-declared pure so the type stays abstract. `override` makes the compiler
  // reject any drift from VActKernel<T>::Compute's signature.
  virtual void Compute(const T *x, T *y) const override = 0;
};
// Activation-kernel interface for exp. Kept abstract; concrete kernels
// implement Compute.
template <typename T>
class VExpKernel : public VActKernel<T> {
 public:
  // Re-declared pure so the type stays abstract. `override` makes the compiler
  // reject any drift from VActKernel<T>::Compute's signature.
  virtual void Compute(const T *x, T *y) const override = 0;
};
// Activation-kernel interface for sigmoid. Kept abstract; concrete kernels
// implement Compute.
template <typename T>
class VSigmoidKernel : public VActKernel<T> {
 public:
  // Re-declared pure so the type stays abstract. `override` makes the compiler
  // reject any drift from VActKernel<T>::Compute's signature.
  virtual void Compute(const T *x, T *y) const override = 0;
};
// Activation-kernel interface for tanh. Kept abstract; concrete kernels
// implement Compute.
template <typename T>
class VTanhKernel : public VActKernel<T> {
 public:
  // Re-declared pure so the type stays abstract. `override` makes the compiler
  // reject any drift from VActKernel<T>::Compute's signature.
  virtual void Compute(const T *x, T *y) const override = 0;
};
// Interface of the LSTM cell kernel: computes the cell state (ct) and hidden
// state (ht) from the gate buffer.
// NOTE(review): both methods are virtual AND carry default arguments. Default
// arguments are bound by the static type of the call expression, so overriders
// should repeat exactly the same defaults (nullptr) to avoid surprises.
template <typename T>
class LSTMKernel : public Kernel {
public:
// Step with a previous cell state `ct_1`. `gates` is non-const, so
// implementations may use it as scratch space.
virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht,
/* below only used in peephole*/
const T *wp_data = nullptr,
T *checked = nullptr) const = 0;
// compute c1 and h1 without c0 or h0
virtual void ComputeC1H1(T *gates, T *ct, T *ht,
/* below only used in peephole*/
const T *wp_data = nullptr) const = 0;
};
} // namespace jitkernel
} // namespace math
} // namespace operators
} // namespace paddle

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,111 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {
namespace jit = platform::jit;
// Expands to an if/else chain that picks one of the five jit_block categories
// (kLT8, kEQ8, kGT8LT16, kEQ16, kGT16) by comparing `d` against
// AVX_FLOAT_BLOCK and AVX512_FLOAT_BLOCK, then invokes
// macro_(ker, dtype, isa, <category>).
// Requires a variable named `d` in scope at the expansion site.
// NOTE(review): AVX_FLOAT_BLOCK / AVX512_FLOAT_BLOCK are not defined by
// anything this header includes; the including file must provide them.
#define SEARCH_BLOCK(macro_, ker, dtype, isa) \
if (d < AVX_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kLT8); \
} else if (d == AVX_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kEQ8); \
} else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kGT8LT16); \
} else if (d == AVX512_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kEQ16); \
} else { \
macro_(ker, dtype, isa, kGT16); \
}
// Chooses the instruction set at run time via jit::MayIUse, testing avx512f
// first, then avx2, then avx, and falling back to jit::isa_any. It then
// delegates to SEARCH_BLOCK for the size category.
#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \
if (jit::MayIUse(jit::avx512f)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \
} else if (jit::MayIUse(jit::avx2)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx2); \
} else if (jit::MayIUse(jit::avx)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx); \
} else { \
SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \
}
// Default declarator: the header of an explicit specialization of
// KernelPool::Get for ker_class<ker_dtype>, taking a single `int d`.
#define JITKERNEL_DECLARE(ker_class, ker_dtype) \
template <> \
std::shared_ptr<const ker_class<ker_dtype>> \
KernelPool::Get<ker_class<ker_dtype>, int>(int d)
// Default cache key: the stringified kernel key and dtype key, followed by the
// decimal value of `d`.
#define JITKERNEL_KEY(ker_key, dtype_key) \
#ker_key #dtype_key + std::to_string(d)
// Default factory: constructs ker##Impl<dtype, isa, k>(d) and assigns it to
// `p`. Both `p` and `d` must already exist at the expansion site.
#define JITKERNEL_NEW_IMPL(ker, dtype, isa, k) \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<ker##Impl<dtype, isa, k>>(d))
// Defines one KernelPool::Get specialization for a single dtype:
//   1. builds the cache key with macro_key;
//   2. on a cache miss, creates the kernel through SEARCH_ISA_BLOCK with
//      macro_impl, stores it in kers_ and returns it;
//   3. on a hit, returns the cached kernel cast back to its concrete type.
// NOTE(review): `marco_declare` is a misspelling of "macro_declare". It is
// only a macro parameter name, so it is harmless.
// NOTE(review): the hit path hashes the key twice (find, then at).
#define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key, \
marco_declare, macro_key, macro_impl) \
marco_declare(ker_class, ker_dtype) { \
std::string key = macro_key(ker_key, dtype_key); \
if (kers_.find(key) == kers_.end()) { \
std::shared_ptr<ker_class<ker_dtype>> p; \
SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype); \
kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)}); \
return p; \
} \
return std::dynamic_pointer_cast<const ker_class<ker_dtype>>( \
kers_.at(key)); \
}
// Registers a kernel for both float (key suffix f) and double (key suffix d)
// using the default declare/key/impl macros.
#define REGISTER_JITKERNEL(ker_key, ker_class) \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, JITKERNEL_DECLARE, \
JITKERNEL_KEY, JITKERNEL_NEW_IMPL); \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, JITKERNEL_DECLARE, \
JITKERNEL_KEY, JITKERNEL_NEW_IMPL)
// Same as REGISTER_JITKERNEL, but with caller-supplied declare/key/impl
// macros.
#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_declare, macro_key, \
macro_impl) \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, marco_declare, macro_key, \
macro_impl); \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, marco_declare, \
macro_key, macro_impl)
// Invokes macro_(isa, block) once for each of the four instruction sets
// (avx512f, avx2, avx, isa_any) with a fixed block category.
#define FOR_EACH_ISA(macro_, block) \
macro_(jit::avx512f, block); \
macro_(jit::avx2, block); \
macro_(jit::avx, block); \
macro_(jit::isa_any, block)
// Invokes macro_(isa, block) once for each of the five block categories with a
// fixed instruction set.
#define FOR_EACH_BLOCK(macro_, isa) \
macro_(isa, kLT8); \
macro_(isa, kEQ8); \
macro_(isa, kGT8LT16); \
macro_(isa, kEQ16); \
macro_(isa, kGT16)
// Invokes macro_ for every (instruction set, block category) pair: 4 x 5 = 20
// expansions.
#define FOR_EACH_ISA_BLOCK(macro_) \
FOR_EACH_BLOCK(macro_, jit::avx512f); \
FOR_EACH_BLOCK(macro_, jit::avx2); \
FOR_EACH_BLOCK(macro_, jit::avx); \
FOR_EACH_BLOCK(macro_, jit::isa_any)
} // namespace jitkernel
} // namespace math
} // namespace operators
} // namespace paddle

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save