@@ -114,15 +114,15 @@ g_layer_type_map = {}

 # Initialize global variables. We use this function so that we can
 # call parse_config() multiple times
 def init_config_environment(
-        g_default_momentum = 0.,
-        g_default_decay_rate = 0.,
+        g_default_momentum = None,
+        g_default_decay_rate = None,
         g_default_initial_mean = 0.,
         g_default_initial_std = 0.01,
-        g_default_num_batches_regularization = 1,
+        g_default_num_batches_regularization = None,
         g_default_initial_strategy = 0,
         g_default_initial_smart = False,
-        g_default_gradient_clipping_threshold = 0.,
-        g_default_device = -1,
+        g_default_gradient_clipping_threshold = None,
+        g_default_device = None,
         g_default_update_hooks = None,
         g_default_compact_func = None,
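
The hunk above swaps eager defaults (0., 1, -1) for None sentinels so that
"not specified" can be told apart from an explicit zero. The rest of the diff
resolves these sentinels through the file's default() helper; a minimal
sketch of the pattern (the helper body is assumed here, not shown in this
diff):

    def default(x, default_value):
        # Fall back to the global default only when the caller passed nothing.
        return default_value if x is None else x

    momentum = default(momentum, g_default_momentum)  # may still be None
    if momentum is not None:
        para.momentum = float(momentum)  # write the proto field only when set
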
@@ -1099,12 +1099,12 @@ def Evaluator(
         inputs,
         chunk_scheme = None,
         num_chunk_types = None,
-        classification_threshold = 0.5,
-        positive_label = -1,
-        dict_file = "",
-        result_file = "",
-        num_results = 1,
-        delimited = True,
+        classification_threshold = None,
+        positive_label = None,
+        dict_file = None,
+        result_file = None,
+        num_results = None,
+        delimited = None,
         ):
     evaluator = g_config.model_config.evaluators.add()
     evaluator.type = type
@@ -1120,12 +1120,19 @@ def Evaluator(
         evaluator.num_chunk_types = num_chunk_types
     g_current_submodel.evaluator_names.append(evaluator.name)

-    evaluator.classification_threshold = classification_threshold
-    evaluator.positive_label = positive_label
-    evaluator.dict_file = dict_file
-    evaluator.result_file = result_file
-    evaluator.num_results = num_results
-    evaluator.delimited = delimited
+    if classification_threshold is not None:
+        evaluator.classification_threshold = classification_threshold
+    if positive_label is not None:
+        evaluator.positive_label = positive_label
+    if dict_file is not None:
+        evaluator.dict_file = dict_file
+    if result_file is not None:
+        evaluator.result_file = result_file
+    if num_results is not None:
+        evaluator.num_results = num_results
+    if delimited is not None:
+        evaluator.delimited = delimited

 class LayerBase(object):
     def __init__(
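
The guards matter because the evaluator config is a proto2 message: assigning
a field marks it as present even if the assigned value equals the field's
declared default, and any downstream check via HasField() would then treat it
as user-specified. A small illustration (EvaluatorConfig stands in for the
generated protobuf class):

    evaluator = EvaluatorConfig()
    assert not evaluator.HasField('classification_threshold')
    evaluator.classification_threshold = 0.5  # equals the default, yet now "present"
    assert evaluator.HasField('classification_threshold')
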
@@ -1137,7 +1144,7 @@ class LayerBase(object):
             device=None,
             active_type="",
             drop_rate=0.,
-            coeff=1.):
+            coeff=None):
         config_assert('@' not in name,
                       "layer name: %s contain special character @" % name)
         global g_current_submodel
@@ -1155,10 +1162,12 @@ class LayerBase(object):
             self.inputs = [self.inputs]
+
         self.config = g_config.model_config.layers.add()
         assert isinstance(self.config, LayerConfig)
         self.config.name = name
         self.config.type = type
         self.config.active_type = active_type
-        self.config.coeff = coeff
+        if coeff is not None:
+            self.config.coeff = float(coeff)
         if size != 0:
             self.config.size = size
         if drop_rate != 0:
@@ -1166,7 +1175,7 @@ class LayerBase(object):

         if device is not None:
             self.config.device = device
-        else:
+        elif g_default_device is not None:
             self.config.device = g_default_device

         for input_index in xrange(len(self.inputs)):
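
With g_default_device itself now defaulting to None, the unconditional else
would assign None to the device field, which the protobuf runtime rejects
with a TypeError; the elif applies the global default only when one was
actually configured.
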
@@ -1236,10 +1245,12 @@ class LayerBase(object):
         if bias.parameter_name is None:
             bias.parameter_name = gen_bias_parameter_name(self.config.name)
         if bias.parameter_name not in g_parameter_map:
+            assert isinstance(self.config, LayerConfig)
+
             Parameter(
                 bias.parameter_name,
                 size,
-                self.config.device,
+                self.config.device if self.config.HasField('device') else None,
                 dims,
                 bias.learning_rate,
                 bias.momentum,
@@ -1265,7 +1276,7 @@ class LayerBase(object):
             input_index,
             size,
             dims=None,
-            sparse = False,
+            sparse = None,
             format = "csr"):
         if dims is None:
             # TODO(yuyang18): print warning and callstack here!
@@ -1293,7 +1304,7 @@ class LayerBase(object):
            Parameter(
                input_config.parameter_name,
                size,
-               self.config.device,
+               self.config.device if self.config.HasField("device") else None,
                dims,
                input_config.learning_rate,
                input_config.momentum,
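
This call site, like the bias one above, forwards the layer's device only
when it was explicitly set. Reading an unset proto2 field silently returns
its declared default, so the old code stamped that placeholder device onto
every parameter; with the guard, Parameter() receives None and leaves
para.device unset. In sketch form (LayerConfig is the generated proto2
class; the -1 default is an assumption):

    layer = LayerConfig()
    layer.device              # reads the declared default, e.g. -1
    layer.HasField('device')  # False: the field was never assigned
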
@@ -1353,6 +1364,8 @@ class FCLayer(LayerBase):

             if sparse:
                 psize = self.inputs[input_index].nnz
+            else:
+                sparse = None

             self.create_input_parameter(input_index, psize, dims, sparse, format)
         self.create_bias_parameter(bias, self.config.size)
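
The added else branch normalizes an explicit sparse=False to None before it
reaches create_input_parameter(), so the parameter's is_sparse field is only
written for genuinely sparse inputs rather than recording a literal False.
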
@@ -2836,27 +2849,44 @@ def Parameter(
     para = g_config.model_config.parameters.add()
     para.name = name
     para.size = size
-    para.device = device
-    para.dims.extend(dims);
-    para.learning_rate = default(learning_rate, 1.)
-    para.momentum = default(momentum, g_default_momentum)
+    if device is not None:
+        para.device = int(device)
+    para.dims.extend(dims)
+
+    if learning_rate is not None:
+        para.learning_rate = float(learning_rate)
+
+    momentum = default(momentum, g_default_momentum)
+    if momentum is not None:
+        para.momentum = float(momentum)
+
     config_assert(not momentum or not decay_rate_l1,
                   "momentum and decay_rate_l1 cannot both be non-zero")
-    para.decay_rate = default(decay_rate, g_default_decay_rate)
+
+    decay_rate = default(decay_rate, g_default_decay_rate)
+    if decay_rate is not None:
+        para.decay_rate = decay_rate
+
     if decay_rate_l1 is not None:
         para.decay_rate_l1 = decay_rate_l1
     para.initial_std = default(initial_std, g_default_initial_std)
     para.initial_mean = default(initial_mean, g_default_initial_mean)
-    para.num_batches_regularization = default(
+
+    num_batches_regularization = default(
         num_batches_regularization, g_default_num_batches_regularization)
+    if num_batches_regularization is not None:
+        para.num_batches_regularization = int(num_batches_regularization)
+
     if sparse_remote_update is not None:
         para.sparse_remote_update = sparse_remote_update
         if sparse_remote_update:
             g_config.opt_config.use_sparse_remote_updater = True
     if sparse_update is not None:
         para.sparse_update = sparse_update
-    para.gradient_clipping_threshold = default(
-        gradient_clipping_threshold, g_default_gradient_clipping_threshold);
+    gradient_clipping_threshold = default(
+        gradient_clipping_threshold, g_default_gradient_clipping_threshold)
+    if gradient_clipping_threshold is not None:
+        para.gradient_clipping_threshold = gradient_clipping_threshold
     para.initial_strategy = default(initial_strategy, g_default_initial_strategy)
     para.initial_smart = default(initial_smart, g_default_initial_smart)
     if para.initial_smart:
@@ -2869,15 +2899,19 @@ def Parameter(
             para.initial_std = 1. / math.sqrt(para.size)
     if g_default_compact_func is not None:
         sparse, format, need_compact = g_default_compact_func(para.name)
-    para.is_sparse = default(sparse, False)
-    para.format = default(format, "")
-    para.need_compact = default(need_compact, False)
+
+    if sparse is not None:
+        para.is_sparse = sparse
+    if format is not None:
+        para.format = format
+    if need_compact is not None:
+        para.need_compact = need_compact
+
     if is_static is not None:
         para.is_static = is_static
     config_assert(not para.sparse_remote_update or not para.is_static,
                   "sparse_remote_update and is_static cannot both be true")

-    para.is_shared = default(is_shared, False)
+    if is_shared is not None:
+        para.is_shared = is_shared

     update_hooks = default(update_hooks, g_default_update_hooks)
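
After this hunk every optional field in Parameter() follows the same idiom:
resolve the per-call argument against its global default, then assign only
when a value was actually specified, so the serialized ParameterConfig
carries exactly the fields the user chose. The shape of the idiom, with
placeholder names:

    value = default(explicit_value, global_default)  # either side may be None
    if value is not None:
        para.some_field = value  # otherwise the proto field stays unset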