Added vanilla LSTM and LSTM with peepholes oneDNN fp32 kernel (#30661)
* added external reorder to profiler
* resolved conflict
* added enable_static
* initial version of lstm, not working yet
* added lstm to operators.cmake
* added vanilla lstm mkldnn op
* added peephole weights integration
* minor changes
* added formatting
* added fusion_lstm_mkldnn to static_whitelist
* added formatting
* removed comment
* moved use_peepholes attribute inside is_cached block
* reverted wrong changes
* minor formatting change
* minor changes

Branch: revert-31068-fix_conv3d_windows
parent 1a13626f5f
commit d834f4e6e8
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,229 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/mkldnn_reuse.h"

namespace paddle {
namespace operators {

using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;

template <typename T, typename T_alg, typename T_out = T>
class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
 public:
  RNNMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
                   const platform::MKLDNNDeviceContext& dev_ctx,
                   const mkldnn::engine mkldnn_engine,
                   platform::Place cpu_place, const LoDTensor* input,
                   const Tensor* weight_h, const Tensor* h0,
                   const bool is_reverse, const int64_t N, const int64_t Ti,
                   const int64_t IC, const int64_t OC, const int64_t G,
                   const std::string& unique_name)
      : platform::MKLDNNHandlerT<T, T_alg>(
            dev_ctx, dev_ctx.GetEngine(), cpu_place,
            CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>(), Ti)),
        N(N),
        Ti(Ti),
        IC(IC),
        OC(OC),
        G(G) {
    // Create memory key without Ti because weights, bias and h0 memories
    // do not depend on Ti size, but the primitive and input/output memory do
    memory_key_ = platform::ExtendKeyWithThreadInfoIfNeeded(
        dev_ctx, CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>()));

    // Is this an int8 kernel?
    const bool is_INT8 = std::is_same<T, uint8_t>::value;

    if (is_INT8) {
      // Int8 attributes
      const float scale_data = ctx.Attr<float>("Scale_data");
      const float shift_data = ctx.Attr<float>("Shift_data");
      const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");

      const int weights_scale_mask =
          0 +
          (1 << 3)  // bit, indicating the unique scales for `g` dim in `ldigo`
          +
          (1 << 4);  // bit, indicating the unique scales for `o` dim in `ldigo`
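      // i.e. weights_scale_mask == 24: per the two comments above, scales
      // vary along the gate (g) and output-channel (o) dims of the `ldigo`
      // weights layout and are shared across the remaining dims.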

      attr_.set_rnn_data_qparams(scale_data, shift_data);
      attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
    }
  }

  bool is_NTC() {
    return (platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc()) ==
            dnnl::memory::format_tag::ntc);
  }
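
  // reorderRNNdata converts between PaddlePaddle's packed LoD representation
  // ([WORDS, C] rows plus lod offsets) and oneDNN's dense, zero-padded
  // [N, T, C] (ntc) or [T, N, C] (tnc) layouts. For reversed sequences the
  // valid elements are aligned to the end of the Ti window via `offset`.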
  void reorderRNNdata(void* input_data, void* output_data,
                      std::vector<size_t> lod, const bool is_reverse,
                      platform::RNNReorderType reorder_type) {
    switch (reorder_type) {
      // Reorder input memory [WORDS, C] + LoD -> [N, T, C]
      case platform::RNNReorderType::PP_NTC: {
        auto* input_data_iter = reinterpret_cast<T*>(input_data);
        auto* output_data_iter = reinterpret_cast<T*>(output_data);
        for (int n = 0; n < N; ++n) {
          const auto num_elements = (lod[n + 1] - lod[n]) * IC;
          const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
          memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
                 sizeof(T) * num_elements);
          input_data_iter += num_elements;
        }
      } break;
      // Reorder input memory [WORDS, C] + LoD -> [T, N, C]
      case platform::RNNReorderType::PP_TNC: {
        auto* input_data_iter = reinterpret_cast<T*>(input_data);
        auto* output_data_iter = reinterpret_cast<T*>(output_data);
        for (int n = 0; n < N; ++n) {
          const auto num_elements = (lod[n + 1] - lod[n]);
          const auto offset = is_reverse ? (Ti - num_elements) : 0;
          for (size_t t = 0; t < num_elements; ++t) {
            memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
                   input_data_iter, sizeof(T) * IC);
            input_data_iter += IC;
          }
        }
      } break;
      // Reorder output values to PP format [N, T, C] -> [WORDS, C]
      case platform::RNNReorderType::NTC_PP: {
        auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
        auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
        for (int n = 0; n < N; ++n) {
          const auto num_elements = (lod[n + 1] - lod[n]) * OC;
          const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
          memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
                 sizeof(T_out) * num_elements);
          output_data_iter += num_elements;
        }
      } break;
      // Reorder output values to PP format [T, N, C] -> [WORDS, C]
      case platform::RNNReorderType::TNC_PP: {
        auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
        auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
        for (int n = 0; n < N; ++n) {
          const auto num_elements = lod[n + 1] - lod[n];
          const auto offset = is_reverse ? (Ti - num_elements) : 0;
          for (size_t t = 0; t < num_elements; ++t) {
            memcpy(output_data_iter,
                   input_data_iter + (t + offset) * N * OC + n * OC,
                   sizeof(T_out) * OC);
            output_data_iter += OC;
          }
        }
      } break;
    }
  }

  std::shared_ptr<dnnl::memory> AcquireInputMemoryWithReorder(
      const LoDTensor* input, const bool is_reverse) {
    const auto name = this->key_ + "@input_mem";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));

    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_desc(),
                                                this->engine_);
      this->dev_ctx_.SetBlob(name, memory_p);
    }

    const auto& input_lod = input->lod()[0];
    auto* x_data = to_void_cast(input->data<T>());

    auto* x_onednn_data = memory_p->get_data_handle();
    memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);

    if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
        dnnl::memory::format_tag::ntc) {
      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
                     platform::RNNReorderType::PP_NTC);
    } else {
      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
                     platform::RNNReorderType::PP_TNC);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireOutputMemory() {
    const auto name = this->key_ + "@output_mem";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));

    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->dst_desc(),
                                                this->engine_);
      this->dev_ctx_.SetBlob(name, memory_p);
    }
    return memory_p;
  }

  // TODO(grygielski) H0 is for now persistable
  // TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass
  // does not support it yet)
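  // h0 arrives as fp32 in ldnc layout (or is zero-initialized when absent)
  // and is reordered once into the primitive's src_iter descriptor; attr_
  // carries the quantization parameters when the kernel is int8.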
  std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
    const std::string h0_key = memory_key_ + "@h0";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));

    if (!memory_p) {
      auto user_h0_memory = dnnl::memory();
      if (h0) {
        user_h0_memory =
            dnnl::memory({{1, 1, N, OC},
                          MKLDNNGetDataType<float>(),
                          MKLDNNMemoryFormat::ldnc},
                         this->engine_, to_void_cast(h0->data<float>()));
      } else {
        user_h0_memory = dnnl::memory({{1, 1, N, OC},
                                       MKLDNNGetDataType<float>(),
                                       MKLDNNMemoryFormat::ldnc},
                                      this->engine_);
        memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
      }
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
                                                this->engine_);

      dnnl::stream astream(this->engine_);
      dnnl::reorder(user_h0_memory, *memory_p, attr_)
          .execute(astream, user_h0_memory, *memory_p);

      this->dev_ctx_.SetBlob(h0_key, memory_p);
    }
    return memory_p;
  }
 protected:
  // RNN dimensions
  // N - Batch Size
  // Ti - Max sentence length
  // IC - Input Channels
  // OC - Output Channels
  // G - Number of gates
  const int64_t N, Ti, IC, OC, G;

  // Memory size of weights, bias and h0 does not depend
  // on Ti size, thus we need another key to cache them
  std::string memory_key_;
  dnnl::primitive_attr attr_;
};
}  // namespace operators
}  // namespace paddle
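
For intuition, here is a minimal NumPy sketch (illustrative only, not part of this commit) of the PP_NTC reorder implemented above: packed [WORDS, C] rows plus LoD offsets become a zero-padded, batch-major [N, Ti, C] tensor, with reversed sequences right-aligned within the Ti window. The function and variable names below are hypothetical.

    import numpy as np

    def reorder_pp_ntc(words, lod, Ti, is_reverse=False):
        # Toy model of platform::RNNReorderType::PP_NTC above.
        N, IC = len(lod) - 1, words.shape[1]
        out = np.zeros((N, Ti, IC), dtype=words.dtype)
        for n in range(N):
            seq = words[lod[n]:lod[n + 1]]            # rows of sentence n
            t0 = Ti - len(seq) if is_reverse else 0   # right-align when reversed
            out[n, t0:t0 + len(seq)] = seq
        return out

    # Two sentences of lengths 3 and 2 (lod = [0, 3, 5]), IC = 2, Ti = 3:
    x = np.arange(10, dtype=np.float32).reshape(5, 2)
    print(reorder_pp_ntc(x, lod=[0, 3, 5], Ti=3, is_reverse=True))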
@@ -0,0 +1,81 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

from paddle.fluid.tests.unittests.test_fusion_lstm_op import TestFusionLSTMOp


class TestFusionLSTMONEDNNOp(TestFusionLSTMOp):
    def set_conf(self):
        self.use_mkldnn = True

    def test_check_output(self):
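        # Toggle use_seq so both the sequence- and batch-oriented execution
        # paths of fusion_lstm are exercised; Cell is excluded from the
        # output comparison via no_check_set below.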
        for use_seq in {True, False}:
            self.attrs['use_seq'] = use_seq
            self.check_output(check_dygraph=False, no_check_set=["Cell"])


class TestFusionLSTMONEDNNOpReverse(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.is_reverse = True
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpInitReverse(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.has_initial_state = True
        self.is_reverse = True
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpMD1(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.M = 36
        self.D = 8
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpMD2(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.M = 8
        self.D = 8
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpMD3(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.M = 15
        self.D = 3
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpBS1(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.lod = [[3]]
        self.D = 16
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpPeepholesInit(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.use_peepholes = True
        self.has_initial_state = True
        self.use_mkldnn = True


if __name__ == '__main__':
    from paddle import enable_static
    enable_static()
    unittest.main()
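
To run a single case from this suite directly, something like the following should work (a sketch; the module path is assumed from Paddle's usual oneDNN test layout and may differ):

    import unittest
    from paddle import enable_static

    enable_static()  # fusion ops only exist in static-graph mode
    # The module path below is an assumption; adjust to where this file lives.
    suite = unittest.defaultTestLoader.loadTestsFromName(
        'paddle.fluid.tests.unittests.mkldnn.test_fusion_lstm_mkldnn_op'
        '.TestFusionLSTMONEDNNOpPeepholesInit')
    unittest.TextTestRunner().run(suite)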