You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
97 lines
3.4 KiB
97 lines
3.4 KiB
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import tarfile
|
|
import paddle.fluid as fluid
|
|
import paddle
|
|
from paddle.fluid import core
|
|
|
|
URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz'
|
|
MD5 = '2a405a31508969b3ab823f42c0f522ca'
|
|
|
|
|
|
def bow_net(data,
|
|
label,
|
|
dict_dim=89528,
|
|
emb_dim=128,
|
|
hid_dim=128,
|
|
hid_dim2=96,
|
|
class_dim=2):
|
|
"""
|
|
BOW net
|
|
This model is from https://github.com/PaddlePaddle/models:
|
|
models/fluid/PaddleNLP/text_classification/nets.py
|
|
"""
|
|
# embedding
|
|
emb = fluid.layers.embedding(
|
|
input=data, size=[dict_dim, emb_dim], is_sparse=True)
|
|
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
|
|
bowh = fluid.layers.tanh(bow)
|
|
# fc layer after conv
|
|
fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh")
|
|
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
|
|
# probability of each class
|
|
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
|
|
# cross entropy loss
|
|
cost = fluid.layers.cross_entropy(input=prediction, label=label)
|
|
# mean loss
|
|
avg_cost = fluid.layers.mean(x=cost)
|
|
acc = fluid.layers.accuracy(input=prediction, label=label)
|
|
return avg_cost, acc, prediction
|
|
|
|
|
|
def train():
|
|
# Download data
|
|
with tarfile.open(paddle.dataset.common.download(URL, "imdb", MD5)) as tarf:
|
|
tarf.extractall(path='./')
|
|
tarf.close()
|
|
|
|
# Initialize dataset description
|
|
dataset = fluid.DatasetFactory().create_dataset()
|
|
dataset.set_batch_size(128) # See API doc for how to change other fields
|
|
|
|
# define network
|
|
# input text data
|
|
data = fluid.layers.data(
|
|
name="words", shape=[1], dtype="int64", lod_level=1)
|
|
# label data
|
|
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
|
|
dataset.set_use_var([data, label])
|
|
avg_cost, acc, prediction = bow_net(data, label)
|
|
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
|
|
opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)
|
|
|
|
# Run startup program
|
|
startup_program = fluid.default_startup_program()
|
|
place = fluid.CPUPlace()
|
|
executor = fluid.Executor(place)
|
|
executor.run(startup_program)
|
|
|
|
main_program = fluid.default_main_program()
|
|
epochs = 10
|
|
filelist = ["train_data/part-%d" % i for i in range(12)]
|
|
dataset.set_filelist(filelist)
|
|
for i in range(epochs):
|
|
dataset.set_thread(4)
|
|
executor.train_from_dataset(
|
|
main_program, # This can be changed during iteration
|
|
dataset, # This can be changed during iteration
|
|
debug=False)
|
|
fluid.io.save_inference_model('imdb/epoch%d.model' % i,
|
|
[data.name, label.name], [acc], executor)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
train()
|