Merge pull request #12878 from tensor-tang/feature/op/attention_lstm
Add attention lstm cpu forward
commit f0f06992c1
File diff suppressed because it is too large
@@ -0,0 +1,41 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;

class AttentionLSTMOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};

class AttentionLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};

}  // namespace operators
}  // namespace paddle
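The kernel implementation (.cc) is presumably the file whose diff was suppressed above as too large, so only these declarations are visible here. As a hedged sketch only, not the suppressed code: the input name "X" is taken from the Python test below, and the helper calls follow the common pattern of fluid CPU ops from this period. GetExpectedKernelType for such an op usually just pins the kernel to the data type of the main input:

// Sketch only: an assumed definition for the method declared above,
// following the usual fluid CPU-op pattern; not the suppressed .cc code.
framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  // Choose the kernel by the dtype of input "X" on the current device.
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
      ctx.device_context());
}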
@@ -0,0 +1,105 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <cmath>       // std::exp
#include <functional>  // std::function
#include <string>
#include "paddle/fluid/platform/cpu_info.h"

namespace paddle {
namespace operators {
namespace math {

#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0

template <typename T>
inline T sigmoid(T x) {
  return 1. / (1. + exp(-x));
}

template <typename T>
inline T tanh(T x) {
  return 2. * sigmoid(2. * x) - 1.;
}

template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_identity(const int n, const T* x, T* y) {
  // do nothing
  return;
}

template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_sigmoid(const int n, const T* x, T* y) {
  const T min = SIGMOID_THRESHOLD_MIN;
  const T max = SIGMOID_THRESHOLD_MAX;
  for (int i = 0; i < n; ++i) {
    T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
    y[i] = 1.0 / (1.0 + std::exp(-tmp));
  }
}

template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_tanh(const int n, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = tanh<T>(x[i]);
  }
}

template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_relu(const int n, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] > 0 ? x[i] : 0;
  }
}

template <>
inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
                                                 float* y) {
  // TODO(TJ): complete me
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] > 0 ? x[i] : 0;
  }
}

template <>
inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
                                                float* y) {
  // TODO(TJ): complete me
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] > 0 ? x[i] : 0;
  }
}

template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
class VecActivations {
 public:
  std::function<void(const int, const T*, T*)> operator()(
      const std::string& type) {
    if (type == "sigmoid") {
      return vec_sigmoid<T, isa>;
    } else if (type == "relu") {
      return vec_relu<T, isa>;
    } else if (type == "tanh") {
      return vec_tanh<T, isa>;
    } else if (type == "identity" || type == "") {
      return vec_identity<T, isa>;
    }
    PADDLE_THROW("Not support type %s.", type);
  }
};

}  // namespace math
}  // namespace operators
}  // namespace paddle
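A minimal standalone usage sketch (not part of the PR) for the VecActivations helper above: select an activation by name, then apply it to a contiguous buffer. The include path is an assumption; everything else is defined in the header shown above.

// Usage sketch for the vector activation helpers above.
// Assumption: the header above is reachable at the path in this #include.
#include <cstdio>
#include "paddle/fluid/operators/math/cpu_vec.h"

int main() {
  namespace math = paddle::operators::math;
  const int n = 4;
  float x[4] = {-2.0f, -0.5f, 0.5f, 2.0f};
  float y[4] = {0.0f, 0.0f, 0.0f, 0.0f};

  // Select vec_sigmoid<float, isa_any> by name at runtime.
  math::VecActivations<float> act;
  auto sigmoid = act("sigmoid");
  sigmoid(n, x, y);  // y[i] = 1 / (1 + exp(-clamped x[i]))

  for (int i = 0; i < n; ++i) {
    std::printf("sigmoid(%.1f) = %.6f\n", x[i], y[i]);
  }
  return 0;
}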
@@ -0,0 +1,208 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest
from test_fusion_lstm_op import fc, ACTIVATION
from test_softmax_op import stable_softmax


def attention_lstm(
        x,  # T x M
        lod,  # 1 x N
        h0,  # N x D
        c0,  # N x D
        fcws,  # (M+D) x 1, 1x1
        fcbs,  # 1 x 1, 1x1
        w,  # (M+D) x 4D
        b,  # 1 x 4D
        act_gate,
        act_cell,
        act_cand):

    T = sum(lod[0])
    N = len(lod[0])
    M = x.shape[1]
    D = b.shape[1] // 4
    assert T == x.shape[0]
    assert len(fcws) == len(fcbs)
    hidden = []
    cell = []

    start_offset = 0
    for bid in range(N):
        seq_len = lod[0][bid]
        xi = np.copy(x[start_offset:start_offset + seq_len, :]).reshape(seq_len,
                                                                        M)
        prev_cell = np.copy(c0[bid]).reshape([1, D])
        prev_hidden = np.copy(h0[bid]).reshape([1, D])
        for step in range(seq_len):
            expanded_cell = np.repeat(prev_cell, seq_len, axis=0)
            tmp = np.concatenate((xi, expanded_cell), axis=1)
            assert tmp.shape[0] == seq_len
            assert tmp.shape[1] == M + D
            for fcid in range(len(fcbs)):
                tmp = fc(tmp, fcws[fcid], fcbs[fcid])
                tmp = ACTIVATION['relu'](tmp)
            tmp = np.reshape(tmp, (1, seq_len))
            tmp = stable_softmax(tmp).reshape(seq_len, 1)
            lstmx = xi * tmp  # seq * M
            lstmx = np.sum(lstmx.reshape(seq_len, M), axis=0).reshape([1, M])
            lstmin = np.concatenate((prev_hidden, lstmx), axis=1)
            lstmout = fc(lstmin, w, b).reshape([1, 4 * D])

            g_f, g_i, g_o, cand = np.split(lstmout, 4, axis=1)
            g_f = act_gate(g_f).reshape([1, D])
            g_i = act_gate(g_i).reshape([1, D])
            g_o = act_gate(g_o).reshape([1, D])
            cand = act_cand(cand).reshape([1, D])

            cell_t = (prev_cell * g_f) + (g_i * cand)
            hidden_t = g_o * act_cell(cell_t)

            hidden.append(hidden_t.flatten())
            cell.append(cell_t.flatten())

            prev_cell = cell_t.reshape([1, D])
            prev_hidden = hidden_t.reshape([1, D])

        start_offset += seq_len

    hidden = np.array(hidden).astype('float32').reshape([T, D])
    cell = np.array(cell).astype('float32').reshape([T, D])
    return hidden, cell


class TestAttentionLSTMOp(OpTest):
    def set_conf(self):
        pass

    def setUp(self):
        self.op_type = 'attention_lstm'
        self.lod = [[3]]
        self.M = 30
        self.D = 15
        self.has_initial_hidden = True
        self.act_gate = 'sigmoid'
        self.act_cell = 'tanh'
        self.act_cand = 'tanh'
        self.set_conf()

        T = sum(self.lod[0])
        bs = len(self.lod[0])

        x = np.random.normal(size=(T, self.M)).astype('float32')
        c0 = np.random.normal(size=(bs, self.D)).astype('float32')
        if self.has_initial_hidden:
            h0 = np.random.normal(size=(bs, self.D)).astype('float32')
        else:
            h0 = np.zeros((bs, self.D)).astype('float32')

        fcw1 = np.random.normal(size=(self.M + self.D, 1)).astype('float32')
        fcb1 = np.random.normal(size=(1, 1)).astype('float32')
        fcw2 = np.random.normal(size=(1, 1)).astype('float32')
        fcb2 = np.random.normal(size=(1, 1)).astype('float32')

        # lstm weight and bias
        w = np.random.normal(size=(self.M + self.D,
                                   self.D * 4)).astype('float32')
        b = np.random.normal(size=(1, self.D * 4)).astype('float32')

        h, c = attention_lstm(x, self.lod, h0, c0, [fcw1, fcw2], [fcb1, fcb2],
                              w, b, ACTIVATION[self.act_gate],
                              ACTIVATION[self.act_cell],
                              ACTIVATION[self.act_cand])

        self.inputs = {
            'X': (x, self.lod),
            'C0': c0,
            'AttentionWeight': fcw1,
            'AttentionBias': fcb1,
            'AttentionScalar': fcw2,
            'AttentionScalarBias': fcb2,
            'LSTMWeight': w,
            'LSTMBias': b
        }

        if self.has_initial_hidden:
            self.inputs['H0'] = h0

        self.outputs = {
            'Hidden': (h, self.lod),
            'Cell': (c, self.lod),
        }
        self.attrs = {
            'gate_activation': self.act_gate,
            'cell_activation': self.act_cell,
            'candidate_activation': self.act_cand
        }

    def test_check_output(self):
        self.check_output()


class TestAttentionOpNonInit(TestAttentionLSTMOp):
    def set_conf(self):
        self.has_initial_hidden = False


class TestAttentionOpAct(TestAttentionLSTMOp):
    def set_conf(self):
        self.M = 3
        self.D = 2
        self.act_gate = 'relu'
        self.act_cell = 'tanh'
        self.act_cand = 'sigmoid'


class TestAttentionOpMD1(TestAttentionLSTMOp):
    def set_conf(self):
        self.M = 36
        self.D = 8


class TestAttentionOpMD2(TestAttentionLSTMOp):
    def set_conf(self):
        self.M = 8
        self.D = 8


class TestAttentionOpMD3(TestAttentionLSTMOp):
    def set_conf(self):
        self.M = 15
        self.D = 30


class TestAttentionOpBS1(TestAttentionLSTMOp):
    def set_conf(self):
        self.lod = [[5]]
        self.M = 16
        self.D = 32


class TestAttentionOpBS2(TestAttentionLSTMOp):
    def set_conf(self):
        self.lod = [[3, 6]]


class TestAttentionOpBS5(TestAttentionLSTMOp):
    def set_conf(self):
        self.lod = [[3, 2, 4, 7, 5]]


if __name__ == '__main__':
    unittest.main()