# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# recurrent_units.py
# Version 2.0
#
# Some recurrent units that can be used in a recurrent layer group.
# To use these units, import this module in your config file:
#     import trainer.recurrent_units
#
from paddle.trainer.config_parser import *

# long short-term memory unit, can be used in a recurrent layer group
# *inputs* must be a list of Projections, for example:
#     inputs = [FullMatrixProjection("input_layer_name")],
# *para_prefix* defines parameter names; if two LstmRecurrentUnit
#     instances share the same *para_prefix*, they share the same parameters
# *out_memory* can be defined outside if it is used outside
def LstmRecurrentUnit(name, size,
                      active_type, state_active_type, gate_active_type,
                      inputs, para_prefix = None,
                      error_clipping_threshold = 0,
                      out_memory = None):

    if para_prefix is None:
        para_prefix = name
    if out_memory is None:
        out_memory = Memory(name = name, size = size)

    state_memory = Memory(name = name + "_" + "state", size = size)

    Layer(
        name = name + "_" + "input_recurrent",
        type = "mixed",
        size = size * 4,  # (input_s, input_gate, forget_gate, output_gate)
        error_clipping_threshold = error_clipping_threshold,
        bias = Bias(initial_std = 0,
                    parameter_name = para_prefix + "_input_recurrent.b"),
        inputs = inputs + [
            FullMatrixProjection(out_memory,
                                 parameter_name = para_prefix + "_input_recurrent.w"),
        ],
    )
    LstmStepLayer(
        name = name,
        size = size,
        bias = Bias(parameter_name = para_prefix + "_check.b"),
        inputs = [name + "_" + "input_recurrent", state_memory],
        active_type = active_type,
        active_gate_type = gate_active_type,
        active_state_type = state_active_type,
    )
    GetOutputLayer(
        name = name + "_" + "state",
        size = size,
        inputs = Input(name, input_layer_argument = "state"),
    )
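
# A minimal usage sketch (hypothetical layer names and sizes, not part of this
# module): LstmRecurrentUnit is meant to be called inside a recurrent layer
# group, with *inputs* projecting an outer in_link into the size*4 gate block,
# roughly as follows:
#
#   RecurrentLayerGroupBegin("lstm_layer_group",
#                            in_links = ["word_embedding"],
#                            out_links = ["lstm_out"])
#   LstmRecurrentUnit(
#       name = "lstm_out",
#       size = 256,
#       active_type = "tanh",
#       state_active_type = "tanh",
#       gate_active_type = "sigmoid",
#       inputs = [FullMatrixProjection("word_embedding")])
#   RecurrentLayerGroupEnd("lstm_layer_group")
#
# LstmRecurrentLayerGroup below wraps this same pattern for convenience.
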
def LstmRecurrentUnitNaive(name, size,
                           active_type, state_active_type, gate_active_type,
                           inputs, para_prefix = None,
                           error_clipping_threshold = 0,
                           out_memory = None):

    if para_prefix is None:
        para_prefix = name
    if out_memory is None:
        out_memory = Memory(name = name, size = size)

    state_memory = Memory(name = name + "_" + "state", size = size)

    Layer(
        name = name + "_" + "input_recurrent",
        type = "mixed",
        size = size * 4,  # (input_s, input_gate, forget_gate, output_gate)
        error_clipping_threshold = error_clipping_threshold,
        bias = Bias(initial_std = 0,
                    parameter_name = para_prefix + "_input_recurrent.b"),
        inputs = inputs + [
            FullMatrixProjection(out_memory,
                                 parameter_name = para_prefix + "_input_recurrent.w"),
        ],
    )
    ExpressionLayer(
        name = name + "_" + "input_s",
        size = size,
        active_type = active_type,
        inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent",
                                           offset = 0)],
    )
    ExpressionLayer(
        name = name + "_" + "input_gate",
        active_type = gate_active_type,
        inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent",
                                           offset = size),
                  DotMulProjection(state_memory,
                                   parameter_name = para_prefix + "_input_check.w")],
    )
    ExpressionLayer(
        name = name + "_" + "forget_gate",
        active_type = gate_active_type,
        inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent",
                                           offset = size * 2),
                  DotMulProjection(state_memory,
                                   parameter_name = para_prefix + "_forget_check.w")],
    )
    ExpressionLayer(
        name = name + "_" + "state",
        inputs = [DotMulOperator([name + "_" + "input_s",
                                  name + "_" + "input_gate"]),
                  DotMulOperator([state_memory,
                                  name + "_" + "forget_gate"]),
        ],
    )
    ExpressionLayer(
        name = name + "_" + "output_gate",
        active_type = gate_active_type,
        inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent",
                                           offset = size * 3),
                  DotMulProjection(name + "_" + "state",
                                   parameter_name = para_prefix + "_output_check.w")],
    )
    ExpressionLayer(
        name = name + "_" + "state_atv",
        active_type = state_active_type,
        inputs = IdentityProjection(name + "_" + "state"),
    )
    ExpressionLayer(
        name = name,
        inputs = DotMulOperator([name + "_" + "state_atv",
                                 name + "_" + "output_gate"]),
    )
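
# For reference, the step computation that the naive decomposition above spells
# out layer by layer (a sketch; "." is element-wise product, W and b live in
# the shared "_input_recurrent" projection, and the w_c* vectors are the
# diagonal peephole "check" weights):
#
#   [a, i', f', o'] = W * [x_t, h_{t-1}] + b      # "_input_recurrent", size*4
#   i_t = gate_act(i' + w_ci . c_{t-1})           # "_input_gate"
#   f_t = gate_act(f' + w_cf . c_{t-1})           # "_forget_gate"
#   c_t = act(a) . i_t + c_{t-1} . f_t            # "_input_s", "_state"
#   o_t = gate_act(o' + w_co . c_t)               # "_output_gate"
#   h_t = state_act(c_t) . o_t                    # "_state_atv", final output
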
# like LstmRecurrentUnit, but it's a layer group.
# it is equivalent to LstmLayer
def LstmRecurrentLayerGroup(name, size,
                            active_type, state_active_type, gate_active_type,
                            inputs, para_prefix = None,
                            error_clipping_threshold = 0,
                            seq_reversed = False):

    input_layer_name = name + "_" + "transform_input"
    Layer(
        name = input_layer_name,
        type = "mixed",
        size = size * 4,
        active_type = "",
        bias = False,
        inputs = inputs,
    )

    RecurrentLayerGroupBegin(name + "_layer_group",
                             in_links = [input_layer_name],
                             out_links = [name],
                             seq_reversed = seq_reversed)

    LstmRecurrentUnit(
        name = name,
        size = size,
        active_type = active_type,
        state_active_type = state_active_type,
        gate_active_type = gate_active_type,
        inputs = [IdentityProjection(input_layer_name)],
        para_prefix = para_prefix,
        error_clipping_threshold = error_clipping_threshold,
    )

    RecurrentLayerGroupEnd(name + "_layer_group")
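
# A usage sketch for the wrapper (hypothetical names; "emb" stands for any
# upstream sequence layer defined elsewhere in the config):
#
#   LstmRecurrentLayerGroup(
#       name = "lstm1",
#       size = 256,
#       active_type = "tanh",
#       state_active_type = "tanh",
#       gate_active_type = "sigmoid",
#       inputs = [FullMatrixProjection("emb")],
#       seq_reversed = False)
#
# The resulting output layer is named "lstm1" and can be fed to later layers.
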
# gated recurrent unit, can be used in a recurrent layer group
# *inputs* should be a list of Projections, for example:
#     inputs = [FullMatrixProjection("input_layer_name")],
# *para_prefix* defines parameter names; if two GatedRecurrentUnit
#     instances share the same *para_prefix*, they share the same parameters
# *out_memory* can be defined outside if it is used outside
def GatedRecurrentUnit(name, size,
                       active_type, gate_active_type,
                       inputs, para_prefix = None,
                       error_clipping_threshold = 0,
                       out_memory = None):

    if isinstance(inputs, str):  # only used by GatedRecurrentLayerGroup
        input_layer_name = inputs
    else:
        input_layer_name = name + "_" + "transform_input"
        Layer(
            name = input_layer_name,
            type = "mixed",
            size = size * 3,
            active_type = "",
            bias = False,
            inputs = inputs,
        )

    if para_prefix is None:
        para_prefix = name
    if out_memory is None:
        out_memory = Memory(name = name, size = size)

    GruStepLayer(
        name = name,
        size = size,
        bias = Bias(parameter_name = para_prefix + "_gate.b"),
        inputs = [input_layer_name,
                  Input(out_memory, parameter_name = para_prefix + "_gate.w")],
        active_type = active_type,
        active_gate_type = gate_active_type,
    )
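
# Note on the two calling conventions above (a sketch with hypothetical names):
# when *inputs* is a list of Projections, the unit builds its own size*3
# "transform_input" mixed layer; when *inputs* is a layer name string, the
# caller is expected to have produced that size*3 transform already, e.g.
#
#   inputs = [FullMatrixProjection("emb")]     # unit transforms internally
#   inputs = "gru1_transform_input"            # pre-transformed, size * 3
#
# The second form is what GatedRecurrentLayerGroup below relies on.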
def GatedRecurrentUnitNaive(name, size,
                            active_type, gate_active_type,
                            inputs, para_prefix = None,
                            error_clipping_threshold = 0,
                            out_memory = None):

    if isinstance(inputs, str):  # only used by GatedRecurrentLayerGroup
        input_layer_name = inputs
    else:
        input_layer_name = name + "_" + "transform_input"
        Layer(
            name = input_layer_name,
            type = "mixed",
            size = size * 3,
            active_type = "",
            bias = False,
            inputs = inputs,
        )

    if para_prefix is None:
        para_prefix = name
    if out_memory is None:
        out_memory = Memory(name = name, size = size)

    Layer(
        name = name + "_" + "update_gate",
        type = "mixed",
        size = size,
        active_type = gate_active_type,
        error_clipping_threshold = error_clipping_threshold,
        bias = Bias(initial_std = 0,
                    parameter_name = para_prefix + "_update_gate.b"),
        inputs = [IdentityOffsetProjection(input_layer_name, offset = 0),
                  FullMatrixProjection(out_memory,
                                       parameter_name = para_prefix + "_update_gate.w")],
    )
    Layer(
        name = name + "_" + "reset_gate",
        type = "mixed",
        size = size,
        active_type = gate_active_type,
        error_clipping_threshold = error_clipping_threshold,
        bias = Bias(initial_std = 0,
                    parameter_name = para_prefix + "_reset_gate.b"),
        inputs = [IdentityOffsetProjection(input_layer_name, offset = size),
                  FullMatrixProjection(out_memory,
                                       parameter_name = para_prefix + "_reset_gate.w")],
    )
    ExpressionLayer(
        name = name + "_" + "reset_output",
        inputs = DotMulOperator([out_memory, name + "_" + "reset_gate"]),
    )
    Layer(
        name = name + "_" + "output_candidate",
        type = "mixed",
        size = size,
        active_type = active_type,
        error_clipping_threshold = error_clipping_threshold,
        bias = Bias(initial_std = 0,
                    parameter_name = para_prefix + "_output_candidate.b"),
        inputs = [IdentityOffsetProjection(input_layer_name, offset = size * 2),
                  FullMatrixProjection(name + "_" + "reset_output",
                                       parameter_name = para_prefix + "_output_candidate.w")],
    )
    ExpressionLayer(  # element-wise interpolation
        name = name,
        inputs = [IdentityProjection(out_memory),
                  DotMulOperator([out_memory,
                                  name + "_" + "update_gate"], scale = -1.0),
                  DotMulOperator([name + "_" + "output_candidate",
                                  name + "_" + "update_gate"]),
        ],
    )
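
# For reference, the step computation the naive decomposition above builds
# (a sketch; "." is element-wise product, and the three size-sized slices of
# the size*3 transformed input feed the update gate, reset gate and candidate):
#
#   z_t = gate_act(x_z + U_z * h_{t-1} + b_z)       # "_update_gate"
#   r_t = gate_act(x_r + U_r * h_{t-1} + b_r)       # "_reset_gate"
#   c_t = act(x_c + U_c * (r_t . h_{t-1}) + b_c)    # "_reset_output", "_output_candidate"
#   h_t = h_{t-1} + z_t . (c_t - h_{t-1})           # final interpolation layer
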
# like GatedRecurrentUnit, but it's a layer group.
# it is equivalent to GatedRecurrentLayer.
def GatedRecurrentLayerGroup(name, size,
                             active_type, gate_active_type,
                             inputs, para_prefix = None,
                             error_clipping_threshold = 0,
                             seq_reversed = False):

    input_layer_name = name + "_" + "transform_input"
    Layer(
        name = input_layer_name,
        type = "mixed",
        size = size * 3,
        active_type = "",
        bias = False,
        inputs = inputs,
    )

    RecurrentLayerGroupBegin(name + "_layer_group",
                             in_links = [input_layer_name],
                             out_links = [name],
                             seq_reversed = seq_reversed)

    GatedRecurrentUnit(
        name = name,
        size = size,
        active_type = active_type,
        gate_active_type = gate_active_type,
        inputs = input_layer_name,  # transform outside
        para_prefix = para_prefix,
        error_clipping_threshold = error_clipping_threshold,
    )

    RecurrentLayerGroupEnd(name + "_layer_group")
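
# A usage sketch for the wrapper (hypothetical names; "emb" stands for any
# upstream sequence layer defined elsewhere in the config):
#
#   GatedRecurrentLayerGroup(
#       name = "gru1",
#       size = 256,
#       active_type = "tanh",
#       gate_active_type = "sigmoid",
#       inputs = [FullMatrixProjection("emb")],
#       seq_reversed = True)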