# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# recurrent_units.py
# Version 2.0
#
# This module defines recurrent units that can be used inside a recurrent
# layer group. To use these units, import this module in your config file:
#   import trainer.recurrent_units
#

from paddle.trainer.config_parser import *

# Long short-term memory unit; it can be used in a recurrent machine.
# *inputs* must be a list of Projections, for example:
#   inputs = [FullMatrixProjection("input_layer_name")],
# *para_prefix* defines the parameter names; if two LstmRecurrentUnit
# instances have the same *para_prefix*, they share the same parameters.
# *out_memory* can be defined outside if it is used outside.
def LstmRecurrentUnit(name, size,
                      active_type, state_active_type, gate_active_type,
                      inputs, para_prefix = None,
                      error_clipping_threshold = 0,
                      out_memory = None):

    if para_prefix is None:
        para_prefix = name
    if out_memory is None:
        out_memory = Memory(name = name, size = size)

    state_memory = Memory(name = name + "_" + "state", size = size)

    Layer(
        name = name + "_" + "input_recurrent",
        type = "mixed",
        size = size * 4,  #(input_s, input_gate, forget_gate, output_gate)
        error_clipping_threshold = error_clipping_threshold,
        bias = Bias(initial_std = 0,
                    parameter_name = para_prefix + "_input_recurrent.b"),
        inputs = inputs + [
            FullMatrixProjection(out_memory,
                                 parameter_name = para_prefix + "_input_recurrent.w"),
        ],
    )
    LstmStepLayer(
        name = name,
        size = size,
        bias = Bias(parameter_name = para_prefix + "_check.b"),
        inputs = [name + "_" + "input_recurrent", state_memory],
        active_type = active_type,
        active_gate_type = gate_active_type,
        active_state_type = state_active_type,
    )
    GetOutputLayer(
        name = name + "_" + "state",
        size = size,
        inputs = Input(name, input_layer_argument = "state"),
    )
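
# Example (sketch, not from the original file): inside a recurrent layer
# group, two LstmRecurrentUnit calls that pass the same *para_prefix*
# (here the hypothetical "shared_lstm") reuse the same "_input_recurrent.w",
# "_input_recurrent.b" and "_check.b" parameters:
#   LstmRecurrentUnit(name = "lstm_a", size = 128,
#                     active_type = "tanh", state_active_type = "tanh",
#                     gate_active_type = "sigmoid",
#                     inputs = [FullMatrixProjection("input_a")],
#                     para_prefix = "shared_lstm")
#   LstmRecurrentUnit(name = "lstm_b", size = 128,
#                     active_type = "tanh", state_active_type = "tanh",
#                     gate_active_type = "sigmoid",
#                     inputs = [FullMatrixProjection("input_b")],
#                     para_prefix = "shared_lstm")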

def LstmRecurrentUnitNaive(name, size,
                           active_type, state_active_type, gate_active_type,
                           inputs, para_prefix = None,
                           error_clipping_threshold = 0,
                           out_memory = None):

    if para_prefix is None:
        para_prefix = name
    if out_memory is None:
        out_memory = Memory(name = name, size = size)

    state_memory = Memory(name = name + "_" + "state", size = size)
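
    # This naive variant wires up the same LSTM step from primitive layers.
    # Sketch of the computation below (my reading of the projections; c is
    # the cell state held in state_memory, h is the unit output held in
    # out_memory, and [.]_k denotes the k-th size-wide slice of
    # "_input_recurrent"):
    #   pre = inputs * W + h_prev * U + b              (size * 4 outputs)
    #   a   = active_type([pre]_0)                     ("input_s")
    #   i   = gate_active_type([pre]_1 + w_ic .* c_prev)
    #   f   = gate_active_type([pre]_2 + w_fc .* c_prev)
    #   c   = a .* i + c_prev .* f                     ("state")
    #   o   = gate_active_type([pre]_3 + w_oc .* c)
    #   h   = state_active_type(c) .* o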

    Layer(
        name = name + "_" + "input_recurrent",
        type = "mixed",
        size = size * 4,  #(input_s, input_gate, forget_gate, output_gate)
        error_clipping_threshold = error_clipping_threshold,
        bias = Bias(initial_std = 0,
                    parameter_name = para_prefix + "_input_recurrent.b"),
        inputs = inputs + [
            FullMatrixProjection(out_memory,
                                 parameter_name = para_prefix + "_input_recurrent.w"),
        ],
    )
    ExpressionLayer(
        name = name + "_" + "input_s",
        size = size,
        active_type = active_type,
        inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent", offset=0)],
    )
    ExpressionLayer(
        name = name + "_" + "input_gate",
        active_type = gate_active_type,
        inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent", offset=size),
                  DotMulProjection(state_memory,
                                   parameter_name = para_prefix + "_input_check.w")],
    )
    ExpressionLayer(
        name = name + "_" + "forget_gate",
        active_type = gate_active_type,
        inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent", offset=size*2),
                  DotMulProjection(state_memory,
                                   parameter_name = para_prefix + "_forget_check.w")],
    )
    ExpressionLayer(
        name = name + "_" + "state",
        inputs = [DotMulOperator([name + "_" + "input_s",
                                  name + "_" + "input_gate"]),
                  DotMulOperator([state_memory,
                                  name + "_" + "forget_gate"]),
                  ],
    )
    ExpressionLayer(
        name = name + "_" + "output_gate",
        active_type = gate_active_type,
        inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent", offset=size*3),
                  DotMulProjection(name + "_" + "state",
                                   parameter_name = para_prefix + "_output_check.w")],
    )
    ExpressionLayer(
        name = name + "_" + "state_atv",
        active_type = state_active_type,
        inputs = IdentityProjection(name + "_" + "state"),
    )
    ExpressionLayer(
        name = name,
        inputs = DotMulOperator([name + "_" + "state_atv",
                                 name + "_" + "output_gate"]),
    )

# Like LstmRecurrentUnit, but wrapped in a recurrent layer group;
# it is equivalent to LstmLayer.
def LstmRecurrentLayerGroup(name, size,
                            active_type, state_active_type, gate_active_type,
                            inputs, para_prefix = None,
                            error_clipping_threshold = 0,
                            seq_reversed = False):

    input_layer_name = name + "_" + "transform_input"
    Layer(
        name = input_layer_name,
        type = "mixed",
        size = size * 4,
        active_type = "",
        bias = False,
        inputs = inputs,
    )

    RecurrentLayerGroupBegin(name + "_layer_group",
                             in_links = [input_layer_name],
                             out_links = [name],
                             seq_reversed = seq_reversed)

    LstmRecurrentUnit(
        name = name,
        size = size,
        active_type = active_type,
        state_active_type = state_active_type,
        gate_active_type = gate_active_type,
        inputs = [IdentityProjection(input_layer_name)],
        para_prefix = para_prefix,
        error_clipping_threshold = error_clipping_threshold,
    )

    RecurrentLayerGroupEnd(name + "_layer_group")
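
# Example (sketch, not from the original file; the layer name "input_emb" and
# the sizes are hypothetical): a config that imports this module can build a
# reversed LSTM over a sequence layer like this:
#   LstmRecurrentLayerGroup(
#       name = "lstm_backward",
#       size = 256,
#       active_type = "tanh",
#       state_active_type = "tanh",
#       gate_active_type = "sigmoid",
#       inputs = [FullMatrixProjection("input_emb")],
#       seq_reversed = True,
#   )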


# Gated recurrent unit; it can be used in a recurrent machine.
# *inputs* should be a list of Projections, for example:
#   inputs = [FullMatrixProjection("input_layer_name")],
# *para_prefix* defines the parameter names; if two GatedRecurrentUnit
# instances have the same *para_prefix*, they share the same parameters.
# *out_memory* can be defined outside if it is used outside.
def GatedRecurrentUnit(name, size,
                       active_type, gate_active_type,
                       inputs, para_prefix = None,
                       error_clipping_threshold = 0,
                       out_memory = None):
    if type_of(inputs) == str:  # only used by GatedRecurrentLayerGroup
        input_layer_name = inputs
    else:
        input_layer_name = name + "_" + "transform_input"
        Layer(
            name = input_layer_name,
            type = "mixed",
            size = size * 3,
            active_type = "",
            bias = False,
            inputs = inputs,
        )

    if para_prefix is None:
        para_prefix = name
    if out_memory is None:
        out_memory = Memory(name = name, size = size)

    GruStepLayer(
        name = name,
        size = size,
        bias = Bias(parameter_name = para_prefix + "_gate.b"),
        inputs = [input_layer_name,
                  Input(out_memory, parameter_name = para_prefix + "_gate.w")],
        active_type = active_type,
        active_gate_type = gate_active_type,
    )

def GatedRecurrentUnitNaive(name, size,
                            active_type, gate_active_type,
                            inputs, para_prefix = None,
                            error_clipping_threshold = 0,
                            out_memory = None):

    if type_of(inputs) == str:  # only used by GatedRecurrentLayerGroup
        input_layer_name = inputs
    else:
        input_layer_name = name + "_" + "transform_input"
        Layer(
            name = input_layer_name,
            type = "mixed",
            size = size * 3,
            active_type = "",
            bias = False,
            inputs = inputs,
        )

    if para_prefix is None:
        para_prefix = name
    if out_memory is None:
        out_memory = Memory(name = name, size = size)
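
    # This naive variant wires up the same GRU step from primitive layers.
    # Sketch of the computation below (my reading of the projections; h_prev
    # is the previous output held in out_memory, and [x]_k denotes the k-th
    # size-wide slice of the transformed input):
    #   z = gate_active_type([x]_0 + h_prev * U_z + b_z)    ("update_gate")
    #   r = gate_active_type([x]_1 + h_prev * U_r + b_r)    ("reset_gate")
    #   c = active_type([x]_2 + (r .* h_prev) * U_c + b_c)  ("output_candidate")
    #   h = h_prev + z .* (c - h_prev)                      (interpolation below)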

    Layer(
        name = name + "_" + "update_gate",
        type = "mixed",
        size = size,
        active_type = gate_active_type,
        error_clipping_threshold = error_clipping_threshold,
        bias = Bias(initial_std = 0, parameter_name = para_prefix + "_update_gate.b"),
        inputs = [IdentityOffsetProjection(input_layer_name, offset=0),
                  FullMatrixProjection(out_memory,
                                       parameter_name = para_prefix + "_update_gate.w")],
    )
    Layer(
        name = name + "_" + "reset_gate",
        type = "mixed",
        size = size,
        active_type = gate_active_type,
        error_clipping_threshold = error_clipping_threshold,
        bias = Bias(initial_std = 0, parameter_name = para_prefix + "_reset_gate.b"),
        inputs = [IdentityOffsetProjection(input_layer_name, offset=size),
                  FullMatrixProjection(out_memory,
                                       parameter_name = para_prefix + "_reset_gate.w")],
    )
    ExpressionLayer(
        name = name + "_" + "reset_output",
        inputs = DotMulOperator([out_memory, name + "_" + "reset_gate"]),
    )
    Layer(
        name = name + "_" + "output_candidate",
        type = "mixed",
        size = size,
        active_type = active_type,
        error_clipping_threshold = error_clipping_threshold,
        bias = Bias(initial_std = 0, parameter_name = para_prefix + "_output_candidate.b"),
        inputs = [IdentityOffsetProjection(input_layer_name, offset=size*2),
                  FullMatrixProjection(name + "_" + "reset_output",
                                       parameter_name = para_prefix + "_output_candidate.w")],
    )
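    # Element-wise interpolation between the previous output and the
    # candidate, written as three additive terms:
    #   out = h_prev - update_gate .* h_prev + update_gate .* candidate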
    ExpressionLayer(  # element-wise interpolation
        name = name,
        inputs = [IdentityProjection(out_memory),
                  DotMulOperator([out_memory,
                                  name + "_" + "update_gate"], scale=-1.0),
                  DotMulOperator([name + "_" + "output_candidate",
                                  name + "_" + "update_gate"]),
                  ],
    )

# Like GatedRecurrentUnit, but wrapped in a recurrent layer group;
# it is equivalent to GatedRecurrentLayer.
def GatedRecurrentLayerGroup(name, size,
                             active_type, gate_active_type,
                             inputs, para_prefix = None,
                             error_clipping_threshold = 0,
                             seq_reversed = False):

    input_layer_name = name + "_" + "transform_input"
    Layer(
        name = input_layer_name,
        type = "mixed",
        size = size * 3,
        active_type = "",
        bias = False,
        inputs = inputs,
    )

    RecurrentLayerGroupBegin(name + "_layer_group",
                             in_links = [input_layer_name],
                             out_links = [name],
                             seq_reversed = seq_reversed)

    GatedRecurrentUnit(
        name = name,
        size = size,
        active_type = active_type,
        gate_active_type = gate_active_type,
        inputs = input_layer_name,  # transform is done outside the group
        para_prefix = para_prefix,
        error_clipping_threshold = error_clipping_threshold,
    )

    RecurrentLayerGroupEnd(name + "_layer_group")
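
# Example (sketch, not from the original file; the layer name "input_emb" and
# the sizes are hypothetical): building a forward GRU over a sequence layer:
#   GatedRecurrentLayerGroup(
#       name = "gru_forward",
#       size = 256,
#       active_type = "tanh",
#       gate_active_type = "sigmoid",
#       inputs = [FullMatrixProjection("input_emb")],
#   )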