Merge branch 'develop' of https://github.com/baidu/Paddle into cmrnorm

hedaoyuan 9 years ago
commit bf32411191

@@ -29,10 +29,6 @@ addons:
- python-pip
- python2.7-dev
- m4
- libprotobuf-dev
- doxygen
- protobuf-compiler
- python-protobuf
- python-numpy
- python-wheel
- libgoogle-glog-dev
@@ -43,6 +39,8 @@ addons:
- graphviz
- swig
- clang-format-3.8
- automake
- libtool
before_install:
- |
if [ ${JOB} == "BUILD_AND_TEST" ]; then

@@ -1 +0,0 @@
./doc/howto/contribute_to_paddle_en.md

@@ -0,0 +1 @@
./doc/howto/dev/contribute_to_paddle_en.md

@@ -72,7 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination )
${source}
${destination}
COMMENT "Generating sphinx documentation: ${builder}"
COMMAND ln -s ${destination}/index_*.html ${destination}/index.html
COMMAND ln -sf ${destination}/index_*.html ${destination}/index.html
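# note: -f re-creates the symlink when it already exists, so repeated
# doc builds no longer fail on a stale index.html link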
)
set_property(

@@ -24,7 +24,6 @@ endif()
if(WITH_DOC)
find_package(Sphinx REQUIRED)
find_package(Doxygen REQUIRED)
find_python_module(recommonmark REQUIRED)
endif()

@@ -1,3 +1,4 @@
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");

@@ -87,9 +87,9 @@ def conv_bn(input,
print(imgSize, output_x, stride, filter_size, padding)
if trans:
nameApx = "_conv"
else:
nameApx = "_convt"
else:
nameApx = "_conv"
if bn:
conv = img_conv_layer(

@@ -1,3 +1,4 @@
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");

@@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import *
#
# {'img_size': 32,
# 'settings': <paddle.trainer.PyDataProviderWrapper.Cls instance at 0x7fea27cb6050>,
# 'settings': a global object,
# 'color': True,
# 'mean_img_size': 32,
# 'meta': './data/cifar-out/batches/batches.meta',
@@ -50,10 +50,10 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
settings.logger.info('Image size: %s', settings.img_size)
settings.logger.info('Meta path: %s', settings.meta_path)
settings.input_types = [
dense_vector(settings.img_raw_size), # image feature
integer_value(settings.num_classes)
] # labels
settings.input_types = {
'image': dense_vector(settings.img_raw_size),
'label': integer_value(settings.num_classes)
}
settings.logger.info('DataProvider Initialization finished')
@@ -83,4 +83,7 @@ def processData(settings, file_list):
img, settings.img_mean, settings.img_size,
settings.is_train, settings.color)
label = data['labels'][i]
yield img_feat.astype('float32'), int(label)
yield {
'image': img_feat.astype('float32'),
'label': int(label)
}
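With the dict form above, each yielded field is addressed by name instead of position, so the provider and the network config no longer depend on declaration order. A minimal sketch of the pattern, assuming the same PyDataProvider2 API (the file name and sizes here are hypothetical):

from paddle.trainer.PyDataProvider2 import provider, dense_vector, integer_value

@provider(input_types={'image': dense_vector(32 * 32 * 3),
                       'label': integer_value(10)})
def process(settings, file_name):
    # A stand-in sample; a real provider would parse file_name instead.
    yield {'image': [0.0] * (32 * 32 * 3), 'label': 7}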

@@ -0,0 +1,5 @@
dataprovider.pyc
empty.list
train.log
output
train.list

@@ -17,8 +17,10 @@ import random
# define data types of input: 2 real numbers
@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
@provider(
input_types={'x': dense_vector(1),
'y': dense_vector(1)}, use_seq=False)
def process(settings, input_file):
for i in xrange(2000):
x = random.random()
yield [x], [2 * x + 0.3]
yield {'x': [x], 'y': [2 * x + 0.3]}
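On the network side, the 'x' and 'y' keys are presumably matched against data layers of the same names; a hedged sketch of the corresponding trainer config:

from paddle.trainer_config_helpers import *

# Layer names match the dict keys yielded by process() above.
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)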

@@ -15,11 +15,8 @@
from paddle.trainer_config_helpers import *
# 1. read data. Suppose you saved above python code as dataprovider.py
data_file = 'empty.list'
with open(data_file, 'w') as f:
f.writelines(' ')
define_py_data_sources2(
train_list=data_file,
train_list=['no_matter.txt'],
test_list=None,
module='dataprovider',
obj='process',
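# process() in dataprovider.py ignores its file argument, so the
# single train_list entry above is only a placeholder.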

@@ -1,10 +1,12 @@
from paddle.trainer.PyDataProvider2 import *
import numpy
# Define a py data provider
@provider(
input_types={'pixel': dense_vector(28 * 28),
'label': integer_value(10)})
'label': integer_value(10)},
cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename): # settings is not used currently.
imgf = filename + "-images-idx3-ubyte"
labelf = filename + "-labels-idx1-ubyte"
@@ -20,12 +22,13 @@ def process(settings, filename): # settings is not used currently.
else:
n = 10000
for i in range(n):
label = ord(l.read(1))
pixels = []
for j in range(28 * 28):
pixels.append(float(ord(f.read(1))) / 255.0)
yield {"pixel": pixels, 'label': label}
images = numpy.fromfile(
f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
images = images / 255.0 * 2.0 - 1.0
labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
for i in xrange(n):
yield {"pixel": images[i, :], 'label': labels[i]}
f.close()
l.close()
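For reference, the vectorized read above works because each IDX file starts with a big-endian integer header followed by raw bytes. A self-contained sketch under that standard MNIST layout (read_mnist is a hypothetical helper, not part of this diff):

import struct
import numpy

def read_mnist(prefix):
    # Image file header: int32 magic, count, rows, cols (big-endian).
    with open(prefix + "-images-idx3-ubyte", "rb") as f:
        _, n, rows, cols = struct.unpack(">iiii", f.read(16))
        images = numpy.fromfile(f, 'ubyte', count=n * rows * cols)
        images = images.reshape((n, rows * cols)).astype('float32')
        images = images / 255.0 * 2.0 - 1.0  # rescale to [-1, 1] as above
    # Label file header: int32 magic, count (big-endian).
    with open(prefix + "-labels-idx1-ubyte", "rb") as f:
        _, n = struct.unpack(">ii", f.read(8))
        labels = numpy.fromfile(f, 'ubyte', count=n).astype('int')
    return images, labels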

@@ -8,6 +8,8 @@ data/test.list
data/test.txt
data/train.list
data/train.txt
data/pred.list
data/pred.txt
dataprovider_copy_1.py
train.log
output

@@ -0,0 +1,147 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os, sys
import numpy as np
from optparse import OptionParser
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import sparse_binary_vector
from paddle.trainer.config_parser import parse_config
"""
Usage: run the following command to show the help message.
python api_predict.py -h
"""
class QuickStartPrediction():
def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
"""
train_conf: trainer configure.
dict_file: word dictionary file name.
model_dir: directory of model.
"""
self.train_conf = train_conf
self.dict_file = dict_file
self.word_dict = {}
self.dict_dim = self.load_dict()
self.model_dir = model_dir
if model_dir is None:
self.model_dir = os.path.dirname(train_conf)
self.label = None
if label_file is not None:
self.load_label(label_file)
conf = parse_config(train_conf, "is_predict=1")
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
self.network.loadParameters(self.model_dir)
input_types = [sparse_binary_vector(self.dict_dim)]
self.converter = DataProviderConverter(input_types)
def load_dict(self):
"""
Load dictionary from self.dict_file.
"""
for line_count, line in enumerate(open(self.dict_file, 'r')):
self.word_dict[line.strip().split('\t')[0]] = line_count
return len(self.word_dict)
def load_label(self, label_file):
"""
Load label.
"""
self.label = {}
for v in open(label_file, 'r'):
self.label[int(v.split('\t')[1])] = v.split('\t')[0]
def get_index(self, data):
"""
Transform words into integer indices according to the dictionary.
"""
words = data.strip().split()
word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
return word_slot
def batch_predict(self, data_batch):
input = self.converter(data_batch)
output = self.network.forwardTest(input)
ids = output[0]["id"].tolist()
print("predicted labels are:")
print(ids)
def option_parser():
usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
parser = OptionParser(usage="usage: %s [options]" % usage)
parser.add_option(
"-n",
"--tconf",
action="store",
dest="train_conf",
help="network config")
parser.add_option(
"-d",
"--dict",
action="store",
dest="dict_file",
help="dictionary file")
parser.add_option(
"-b",
"--label",
action="store",
dest="label",
default=None,
help="dictionary file")
parser.add_option(
"-c",
"--batch_size",
type="int",
action="store",
dest="batch_size",
default=1,
help="the batch size for prediction")
parser.add_option(
"-w",
"--model",
action="store",
dest="model_path",
default=None,
help="model path")
return parser.parse_args()
def main():
options, args = option_parser()
train_conf = options.train_conf
batch_size = options.batch_size
dict_file = options.dict_file
model_path = options.model_path
label = options.label
swig_paddle.initPaddle("--use_gpu=0")
predict = QuickStartPrediction(train_conf, dict_file, model_path, label)
batch = []
labels = []
for line in sys.stdin:
[label, text] = line.split("\t")
labels.append(int(label))
batch.append([predict.get_index(text)])
print("labels is:")
print labels
predict.batch_predict(batch)
if __name__ == '__main__':
main()

@@ -0,0 +1,30 @@
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# Note: the default model is pass-00001; make sure the model path
# exists or change the model path below.
# Only tested with trainer_config.lr.py.
model=output/pass-00001/
config=trainer_config.lr.py
label=data/labels.list
dict=data/dict.txt
batch_size=20
head -n$batch_size data/test.txt | python api_predict.py \
--tconf=$config \
--model=$model \
--label=$label \
--dict=$dict \
--batch_size=$batch_size

@@ -31,16 +31,16 @@ def initializer(settings, dictionary, **kwargs):
# settings.input_types specifies the data types that the data provider
# generates.
settings.input_types = [
settings.input_types = {
# The first input is a sparse_binary_vector,
# which means each dimension of the vector is either 0 or 1. It is the
# bag-of-words (BOW) representation of the texts.
sparse_binary_vector(len(dictionary)),
'word': sparse_binary_vector(len(dictionary)),
# The second input is an integer. It represents the category id of the
# sample. 2 means there are two labels in the dataset.
# (1 for positive and 0 for negative)
integer_value(2)
]
'label': integer_value(2)
}
# Declaring a data provider. It has an initializer, 'initializer'.
@@ -67,12 +67,12 @@ def process(settings, file_name):
# Return the features for the current comment. The first is a list
# of ids representing a 0-1 binary sparse vector of the text,
# the second is the integer id of the label.
yield word_vector, int(label)
yield {'word': word_vector, 'label': int(label)}
def predict_initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = [sparse_binary_vector(len(dictionary))]
settings.input_types = {'word': sparse_binary_vector(len(dictionary))}
# Declaring a data provider for prediction. The difference with process
@@ -83,4 +83,4 @@ def process_predict(settings, file_name):
for line in f:
comment = line.strip().split()
word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment]
yield word_vector
yield {'word': word_vector}
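As the comments above explain, a sparse_binary_vector sample is just the list of active dimension ids, so the dict-style provider yields values shaped like this (word ids invented for illustration):

# training sample:   {'word': [3, 17, 42], 'label': 1}
# prediction sample: {'word': [3, 17, 42]}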

@@ -19,13 +19,13 @@ UNK_IDX = 0
def initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = [
settings.input_types = {
# Define the type of the first input as a sequence of integers.
# The integer values range from 0 to len(dictionary)-1.
integer_value_sequence(len(dictionary)),
'word': integer_value_sequence(len(dictionary)),
# Define the second input for label id
integer_value(2)
]
'label': integer_value(2)
}
@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
@@ -35,15 +35,12 @@ def process(settings, file_name):
label, comment = line.strip().split('\t')
words = comment.split()
word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
yield word_slot, int(label)
yield {'word': word_slot, 'label': int(label)}
def predict_initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = [
integer_value(
len(dictionary), seq_type=SequenceType.SEQUENCE)
]
settings.input_types = {'word': integer_value_sequence(len(dictionary))}
@provider(init_hook=predict_initializer, should_shuffle=False)
@@ -52,4 +49,4 @@ def process_predict(settings, file_name):
for line in f:
comment = line.strip().split()
word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment]
yield word_slot
yield {'word': word_slot}
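The simplification above relies on integer_value_sequence(n) being shorthand for integer_value(n, seq_type=SequenceType.SEQUENCE), as the diff itself shows; the two declarations are interchangeable:

# long form, as removed above
settings.input_types = {
    'word': integer_value(len(dictionary), seq_type=SequenceType.SEQUENCE)
}
# short form, as added above
settings.input_types = {'word': integer_value_sequence(len(dictionary))}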

@@ -17,13 +17,14 @@ from paddle.trainer.PyDataProvider2 import *
def meta_to_header(meta, name):
metas = meta[name]['__meta__']['raw_meta']
for each_meta in metas:
slot_name = each_meta.get('name', '%s_id' % name)
if each_meta['type'] == 'id':
yield integer_value(each_meta['max'])
yield slot_name, integer_value(each_meta['max'])
elif each_meta['type'] == 'embedding':
is_seq = each_meta['seq'] == 'sequence'
yield integer_value(
yield slot_name, integer_value(
len(each_meta['dict']),
seq_type=SequenceType.SEQUENCE
if is_seq else SequenceType.NO_SEQUENCE)
elif each_meta['type'] == 'one_hot_dense':
yield dense_vector(len(each_meta['dict']))
yield slot_name, dense_vector(len(each_meta['dict']))
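Since meta_to_header now yields (name, type) pairs, callers can split or combine the two halves as needed; a small usage sketch:

headers = list(meta_to_header(meta, 'movie'))
names = [name for name, _ in headers]  # slot names, e.g. 'movie_id'
types = [typ for _, typ in headers]    # slot types only
as_map = dict(headers)                 # name -> type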

@@ -16,6 +16,14 @@ from paddle.trainer.PyDataProvider2 import *
import common_utils # parse
def __list_to_map__(lst):
ret_val = dict()
for each in lst:
k, v = each
ret_val[k] = v
return ret_val
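__list_to_map__ simply materializes a list of (key, value) pairs into a dict, the same as the built-in constructor:

# equivalent to dict(lst)
assert __list_to_map__([('a', 1), ('b', 2)]) == {'a': 1, 'b': 2}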
def hook(settings, meta, **kwargs):
"""
Init hook is invoked before process data. It will set obj.slots and store
@@ -34,12 +42,16 @@ def hook(settings, meta, **kwargs):
# second part is user features.
# final part is rating score.
# header is a list of (slot_name, slot_type) pairs
headers = list(common_utils.meta_to_header(meta, 'movie'))
headers.extend(list(common_utils.meta_to_header(meta, 'user')))
headers.append(dense_vector(1)) # Score
movie_headers = list(common_utils.meta_to_header(meta, 'movie'))
settings.movie_names = [h[0] for h in movie_headers]
headers = movie_headers
user_headers = list(common_utils.meta_to_header(meta, 'user'))
settings.user_names = [h[0] for h in user_headers]
headers.extend(user_headers)
headers.append(("rating", dense_vector(1))) # Score
# slot types.
settings.input_types = headers
settings.input_types = __list_to_map__(headers)
settings.meta = meta
@@ -57,20 +69,20 @@ def process(settings, filename):
movie_meta = settings.meta['movie'][movie_id]
user_meta = settings.meta['user'][user_id]
outputs = [movie_id - 1]
outputs = [('movie_id', movie_id - 1)]
# Then add movie features
for each_meta in movie_meta:
outputs.append(each_meta)
for i, each_meta in enumerate(movie_meta):
outputs.append((settings.movie_names[i + 1], each_meta))
# Then add user id.
outputs.append(user_id - 1)
outputs.append(('user_id', user_id - 1))
# Then add user features.
for each_meta in user_meta:
outputs.append(each_meta)
for i, each_meta in enumerate(user_meta):
outputs.append((settings.user_names[i + 1], each_meta))
# Finally, add score
outputs.append([score])
outputs.append(('rating', [score]))
# Return data to paddle
yield outputs
yield __list_to_map__(outputs)
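Each yielded sample is now a name-keyed map. A hedged sketch of its shape; apart from movie_id, user_id and rating, the slot names come from the meta file, so 'title' and 'gender' below are hypothetical placeholders:

# {'movie_id': 1192, 'title': [...],   # movie slots
#  'user_id': 5, 'gender': [...],      # user slots
#  'rating': [4.0]}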

@@ -34,8 +34,8 @@ if __name__ == '__main__':
network.loadParameters(model_path)
with open('./data/meta.bin', 'rb') as f:
meta = pickle.load(f)
headers = list(meta_to_header(meta, 'movie'))
headers.extend(list(meta_to_header(meta, 'user')))
headers = [h[1] for h in meta_to_header(meta, 'movie')]
headers.extend([h[1] for h in meta_to_header(meta, 'user')])
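# DataProviderConverter expects a plain list of input types, so the
# (name, type) pairs from meta_to_header are stripped down to the types.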
cvt = DataProviderConverter(headers)
while True:
movie_id = int(raw_input("Input movie_id: "))

@@ -14,6 +14,15 @@
# limitations under the License.
set -e
UNAME_STR=`uname`
if [[ ${UNAME_STR} == 'Linux' ]]; then
SHUF_PROG='shuf'
else
SHUF_PROG='gshuf'
fi
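# On macOS there is no system 'shuf'; 'gshuf' is GNU shuf as installed
# by Homebrew's coreutils package.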
cd "$(dirname "$0")"
delimiter='::'
dir=ml-1m
@@ -25,7 +34,7 @@ python meta_generator.py $dir meta.bin --config=meta_config.json
echo 'split train/test file'
python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1
echo 'shuffle train file'
shuf $dir/ratings.dat.train > ratings.dat.train
${SHUF_PROG} $dir/ratings.dat.train > ratings.dat.train
cp $dir/ratings.dat.test .
echo "./data/ratings.dat.train" > train.list
echo "./data/ratings.dat.test" > test.list

@@ -8,3 +8,7 @@ data/test.wsj.seq_pair
data/test.wsj.words
data/tgt.dict
output
data/emb
data/targetDict.txt
data/verbDict.txt
data/wordDict.txt

Some files were not shown because too many files have changed in this diff.
