You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							256 lines
						
					
					
						
							8.4 KiB
						
					
					
				
			
		
		
	
	
							256 lines
						
					
					
						
							8.4 KiB
						
					
					
				| #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 | |
| #
 | |
| # Licensed under the Apache License, Version 2.0 (the "License");
 | |
| # you may not use this file except in compliance with the License.
 | |
| # You may obtain a copy of the License at
 | |
| #
 | |
| #     http://www.apache.org/licenses/LICENSE-2.0
 | |
| #
 | |
| # Unless required by applicable law or agreed to in writing, software
 | |
| # distributed under the License is distributed on an "AS IS" BASIS,
 | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| # See the License for the specific language governing permissions and
 | |
| # limitations under the License.
 | |
| 
 | |
| from __future__ import print_function
 | |
| 
 | |
| import numpy as np
 | |
| import argparse
 | |
| import time
 | |
| import math
 | |
| import random
 | |
| 
 | |
| import paddle
 | |
| import paddle.fluid as fluid
 | |
| import paddle.fluid.profiler as profiler
 | |
| from paddle.fluid import core
 | |
| import unittest
 | |
| from multiprocessing import Process
 | |
| import os
 | |
| import signal
 | |
| from functools import reduce
 | |
| from test_dist_base import TestDistRunnerBase, runtime_main
 | |
| 
 | |
| DTYPE = "int64"
 | |
| DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000'
 | |
| DATA_MD5 = '24e49366eb0611c552667989de2f57d5'
 | |
| 
 | |
| # For Net
 | |
| base_lr = 0.2
 | |
| emb_lr = base_lr * 3
 | |
| dict_dim = 1500
 | |
| emb_dim = 128
 | |
| hid_dim = 128
 | |
| margin = 0.1
 | |
| sample_rate = 1
 | |
| 
 | |
| # Fix seed for test
 | |
| fluid.default_startup_program().random_seed = 1
 | |
| fluid.default_main_program().random_seed = 1
 | |
| 
 | |
| 
 | |
| def get_acc(cos_q_nt, cos_q_pt, batch_size):
 | |
|     cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
 | |
|     cond = fluid.layers.cast(cond, dtype='float64')
 | |
|     cond_3 = fluid.layers.reduce_sum(cond)
 | |
|     acc = fluid.layers.elementwise_div(
 | |
|         cond_3,
 | |
|         fluid.layers.fill_constant(
 | |
|             shape=[1], value=batch_size * 1.0, dtype='float64'),
 | |
|         name="simnet_acc")
 | |
|     return acc
 | |
| 
 | |
| 
 | |
| def get_loss(cos_q_pt, cos_q_nt):
 | |
|     loss_op1 = fluid.layers.elementwise_sub(
 | |
|         fluid.layers.fill_constant_batch_size_like(
 | |
|             input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'),
 | |
|         cos_q_pt)
 | |
|     loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
 | |
|     loss_op3 = fluid.layers.elementwise_max(
 | |
|         fluid.layers.fill_constant_batch_size_like(
 | |
|             input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
 | |
|         loss_op2)
 | |
|     avg_cost = fluid.layers.mean(loss_op3)
 | |
|     return avg_cost
 | |
| 
 | |
| 
 | |
| def get_optimizer(op="sgd"):
 | |
|     if op.upper() == "sgd".upper():
 | |
|         optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
 | |
|     elif op.upper() == "adam".upper():
 | |
|         optimizer = fluid.optimizer.Adam(learning_rate=base_lr)
 | |
|     else:
 | |
|         optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
 | |
|     return optimizer
 | |
| 
 | |
| 
 | |
| def train_network(batch_size,
 | |
|                   is_distributed=False,
 | |
|                   is_sparse=False,
 | |
|                   is_self_contained_lr=False):
 | |
|     # query
 | |
|     q = fluid.layers.data(
 | |
|         name="query_ids", shape=[1], dtype="int64", lod_level=1)
 | |
|     ## embedding
 | |
|     q_emb = fluid.layers.embedding(
 | |
|         input=q,
 | |
|         is_distributed=is_distributed,
 | |
|         size=[dict_dim, emb_dim],
 | |
|         param_attr=fluid.ParamAttr(
 | |
|             initializer=fluid.initializer.Constant(value=0.01),
 | |
|             name="__emb__",
 | |
|             learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
 | |
|                 initializer=fluid.initializer.Constant(value=0.01),
 | |
|                 name="__emb__"),
 | |
|         is_sparse=is_sparse)
 | |
|     ## vsum
 | |
|     q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
 | |
|     q_ss = fluid.layers.softsign(q_sum)
 | |
|     ## fc layer after conv
 | |
|     q_fc = fluid.layers.fc(
 | |
|         input=q_ss,
 | |
|         size=hid_dim,
 | |
|         param_attr=fluid.ParamAttr(
 | |
|             initializer=fluid.initializer.Constant(value=0.01),
 | |
|             name="__q_fc__",
 | |
|             learning_rate=base_lr))
 | |
|     # label data
 | |
|     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 | |
|     # pt
 | |
|     pt = fluid.layers.data(
 | |
|         name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
 | |
|     ## embedding
 | |
|     pt_emb = fluid.layers.embedding(
 | |
|         input=pt,
 | |
|         is_distributed=is_distributed,
 | |
|         size=[dict_dim, emb_dim],
 | |
|         param_attr=fluid.ParamAttr(
 | |
|             initializer=fluid.initializer.Constant(value=0.01),
 | |
|             name="__emb__",
 | |
|             learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
 | |
|                 initializer=fluid.initializer.Constant(value=0.01),
 | |
|                 name="__emb__"),
 | |
|         is_sparse=is_sparse)
 | |
|     ## vsum
 | |
|     pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
 | |
|     pt_ss = fluid.layers.softsign(pt_sum)
 | |
|     ## fc layer
 | |
|     pt_fc = fluid.layers.fc(
 | |
|         input=pt_ss,
 | |
|         size=hid_dim,
 | |
|         param_attr=fluid.ParamAttr(
 | |
|             initializer=fluid.initializer.Constant(value=0.01),
 | |
|             name="__fc__",
 | |
|             learning_rate=base_lr),
 | |
|         bias_attr=fluid.ParamAttr(name="__fc_b__"))
 | |
|     # nt
 | |
|     nt = fluid.layers.data(
 | |
|         name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
 | |
|     ## embedding
 | |
|     nt_emb = fluid.layers.embedding(
 | |
|         input=nt,
 | |
|         is_distributed=is_distributed,
 | |
|         size=[dict_dim, emb_dim],
 | |
|         param_attr=fluid.ParamAttr(
 | |
|             initializer=fluid.initializer.Constant(value=0.01),
 | |
|             name="__emb__",
 | |
|             learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
 | |
|                 initializer=fluid.initializer.Constant(value=0.01),
 | |
|                 name="__emb__"),
 | |
|         is_sparse=is_sparse)
 | |
|     ## vsum
 | |
|     nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
 | |
|     nt_ss = fluid.layers.softsign(nt_sum)
 | |
|     ## fc layer
 | |
|     nt_fc = fluid.layers.fc(
 | |
|         input=nt_ss,
 | |
|         size=hid_dim,
 | |
|         param_attr=fluid.ParamAttr(
 | |
|             initializer=fluid.initializer.Constant(value=0.01),
 | |
|             name="__fc__",
 | |
|             learning_rate=base_lr),
 | |
|         bias_attr=fluid.ParamAttr(name="__fc_b__"))
 | |
|     cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
 | |
|     cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
 | |
|     # loss
 | |
|     avg_cost = get_loss(cos_q_pt, cos_q_nt)
 | |
|     # acc
 | |
|     acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
 | |
|     return [avg_cost, acc, cos_q_pt]
 | |
| 
 | |
| 
 | |
| def combination(x, y):
 | |
|     res = [[[xi, yi] for yi in y] for xi in x]
 | |
|     return res[0]
 | |
| 
 | |
| 
 | |
| def get_one_data(file_list):
 | |
|     for file in file_list:
 | |
|         contents = []
 | |
|         with open(file, "r") as fin:
 | |
|             for i in fin:
 | |
|                 contents.append(i.strip())
 | |
|             for index, q in enumerate(contents):
 | |
|                 try:
 | |
|                     one_data = [[int(j) for j in i.split(" ")]
 | |
|                                 for i in q.split(";")[:-1]]
 | |
|                     if one_data[1][0] + one_data[1][1] != len(one_data) - 3:
 | |
|                         q = fin.readline()
 | |
|                         continue
 | |
|                     tmp = combination(one_data[3:3 + one_data[1][0]],
 | |
|                                       one_data[3 + one_data[1][0]:])
 | |
|                 except Exception as e:
 | |
|                     continue
 | |
| 
 | |
|                 for each in tmp:
 | |
|                     yield [one_data[2], 0, each[0], each[1]]
 | |
| 
 | |
| 
 | |
| def get_batch_reader(file_list, batch_size):
 | |
|     def batch_reader():
 | |
|         res = []
 | |
|         for i in get_one_data(file_list):
 | |
|             if random.random() <= sample_rate:
 | |
|                 res.append(i)
 | |
|             if len(res) >= batch_size:
 | |
|                 yield res
 | |
|                 res = []
 | |
| 
 | |
|     return batch_reader
 | |
| 
 | |
| 
 | |
| def get_train_reader(batch_size):
 | |
|     # The training data set.
 | |
|     train_file = os.path.join(paddle.dataset.common.DATA_HOME, "simnet",
 | |
|                               "train")
 | |
|     train_reader = get_batch_reader([train_file], batch_size)
 | |
|     train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"]
 | |
|     return train_reader, train_feed
 | |
| 
 | |
| 
 | |
| class TestDistSimnetBow2x2(TestDistRunnerBase):
 | |
|     def get_model(self, batch_size=2):
 | |
|         # Train program
 | |
|         avg_cost, acc, predict = \
 | |
|             train_network(batch_size,
 | |
|                           bool(int(os.environ["IS_DISTRIBUTED"])),
 | |
|                           bool(int(os.environ["IS_SPARSE"])),
 | |
|                           bool(int(os.environ["IS_SELF_CONTAINED_LR"])))
 | |
| 
 | |
|         inference_program = fluid.default_main_program().clone()
 | |
| 
 | |
|         # Optimization
 | |
|         opt = os.getenv('OPTIMIZER', 'sgd')
 | |
|         opt = get_optimizer(opt)
 | |
|         opt.minimize(avg_cost)
 | |
| 
 | |
|         # Reader
 | |
|         train_reader, _ = get_train_reader(batch_size)
 | |
|         return inference_program, avg_cost, train_reader, train_reader, acc, predict
 | |
| 
 | |
| 
 | |
| if __name__ == "__main__":
 | |
|     paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
 | |
|     runtime_main(TestDistSimnetBow2x2)
 |