You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
100 lines
3.4 KiB
100 lines
3.4 KiB
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from paddle.trainer_config_helpers import *
|
|
|
|
try:
|
|
import cPickle as pickle
|
|
except ImportError:
|
|
import pickle
|
|
|
|
# Config argument 'is_predict' (bool, defaults to False); it selects which
# outputs the network declares at the bottom of this file.
is_predict = get_config_arg('is_predict', bool, False)

# Path of the pickled meta file that describes the feature slots.
META_FILE = 'data/meta.bin'

# NOTE: unpickling can execute arbitrary code, so META_FILE must come from a
# trusted source.
with open(META_FILE, 'rb') as meta_stream:
    meta = pickle.load(meta_stream)

# Global optimisation settings for the trainer.
settings(
    batch_size=1600,
    learning_rate=1e-3,
    learning_method=RMSPropOptimizer())
|
|
|
|
|
|
def construct_feature(name):
    """
    Build the fused feature layer for one side of the model.

    The entries stored under ``meta[name]['__meta__']['raw_meta']`` are
    visited in order. Each entry feeds a data layer named after the entry's
    ``name`` (or ``'<name>_id'`` when the entry has no ``name``) and is
    mapped to a 256-wide layer according to its ``type``:

    * id: data => embedding => fc
    * embedding:
        'seq' == 'sequence': data => embedding => text_conv_pool
                             (context_len=5)
        otherwise: data => embedding => fc
    * one_hot_dense: data => fc => fc

    Entries of any other type are skipped. All per-entry layers are then
    combined by a single fc layer named ``'<name>_fusion'``.

    :param name: 'movie' or 'user'
    :type name: basestring
    :return: combined feature output
    :rtype: LayerOutput
    """
    raw_meta = meta[name]['__meta__']['raw_meta']
    fusion = []

    for slot_meta in raw_meta:
        kind = slot_meta['type']
        slot_name = slot_meta.get('name', '%s_id' % name)

        if kind == 'id':
            # Id slots: the data layer width is the maximum id value.
            slot_dim = slot_meta['max']
            slot_input = data_layer(slot_name, size=slot_dim)
            embedded = embedding_layer(input=slot_input, size=256)
            branch = fc_layer(input=embedded, size=256)
        elif kind == 'embedding':
            is_seq = slot_meta['seq'] == 'sequence'
            slot_dim = len(slot_meta['dict'])
            slot_input = data_layer(slot_name, slot_dim)
            embedded = embedding_layer(input=slot_input, size=256)
            if is_seq:
                # Sequence slots are reduced to a fixed-size vector.
                branch = text_conv_pool(
                    input=embedded, context_len=5, hidden_size=256)
            else:
                branch = fc_layer(input=embedded, size=256)
        elif kind == 'one_hot_dense':
            slot_dim = len(slot_meta['dict'])
            slot_input = data_layer(slot_name, slot_dim)
            hidden = fc_layer(input=slot_input, size=256)
            branch = fc_layer(input=hidden, size=256)
        else:
            # Unknown feature types contribute nothing to the fusion.
            continue

        fusion.append(branch)

    return fc_layer(name="%s_fusion" % name, input=fusion, size=256)
|
|
|
|
|
|
# Build one fused feature layer per side and score the pair by cosine
# similarity.
movie_feature = construct_feature("movie")
user_feature = construct_feature("user")
similarity = cos_sim(a=movie_feature, b=user_feature)

if is_predict:
    # Prediction only needs the similarity score itself.
    outputs(similarity)
else:
    # Training: regress the similarity against the 'rating' data layer.
    outputs(
        regression_cost(
            input=similarity,
            label=data_layer('rating', size=1)))

    # Train/test samples are produced by dataprovider.process, which
    # receives the loaded meta as its argument.
    define_py_data_sources2(
        'data/train.list',
        'data/test.list',
        module='dataprovider',
        obj='process',
        args={'meta': meta})
|