diff --git a/.travis.yml b/.travis.yml index 28d1f51be7..5a7f45a748 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,22 +4,14 @@ cache: - $HOME/third_party - $HOME/.ccache - $HOME/.cache/pip - - $HOME/Library/Caches/Homebrew sudo: required dist: trusty os: - linux - - osx env: - JOB=DOCS - JOB=BUILD_AND_TEST - JOB=PRE_COMMIT -matrix: - exclude: - - os: osx - env: JOB=DOCS # Only generate documentation in linux. - - os: osx - env: JOB=PRE_COMMIT # Only check pre-commit hook in linux addons: apt: @@ -53,7 +45,6 @@ before_install: fi fi fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake index d319442ef1..1c29cb22a3 100644 --- a/cmake/FindSphinx.cmake +++ b/cmake/FindSphinx.cmake @@ -72,7 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination ) ${source} ${destination} COMMENT "Generating sphinx documentation: ${builder}" - COMMAND ln -sf ${destination}/index_*.html ${destination}/index.html + COMMAND cd ${destination} && ln -s ./index_*.html index.html ) set_property( diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake index ae3530c3a0..ad9a10cb86 100644 --- a/cmake/coverallsGcovJsons.cmake +++ b/cmake/coverallsGcovJsons.cmake @@ -110,14 +110,13 @@ endmacro() # Get the coverage data. file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda") -message("GCDA files:") +message("Process GCDA files:") +message("===============================") # Get a list of all the object directories needed by gcov # (The directories the .gcda files and .o files are found in) # and run gcov on those. foreach(GCDA ${GCDA_FILES}) - message("Process: ${GCDA}") - message("------------------------------------------------------------------------------") get_filename_component(GCDA_DIR ${GCDA} PATH) # @@ -135,7 +134,7 @@ foreach(GCDA ${GCDA_FILES}) # If -p is not specified then the file is named only "the_file.c.gcov" # execute_process( - COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} + COMMAND "${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null" WORKING_DIRECTORY ${GCDA_DIR} ) endforeach() @@ -383,7 +382,6 @@ foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING}) set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") # Generate the final JSON for this file. - message("Generate JSON for non-gcov file: ${NOT_COVERED_SRC}...") string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") endforeach() diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 84f459033f..26da7e8e38 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -14,46 +14,50 @@ INCLUDE(ExternalProject) -SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf) -SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf) -SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE) +FIND_PACKAGE(Protobuf) -INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) +IF(NOT PROTOBUF_FOUND) + SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf) + SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf) + SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." 
FORCE) + + IF(WIN32) + SET(PROTOBUF_LITE_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_PROTOC_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE) + SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE) + ELSE(WIN32) + SET(PROTOBUF_LITE_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_PROTOC_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE) + SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE) + ENDIF(WIN32) -IF(WIN32) - SET(PROTOBUF_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE) - SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE) -ELSE(WIN32) - SET(PROTOBUF_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE) - SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." 
FORCE) -ENDIF(WIN32) + ExternalProject_Add( + protobuf + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PROTOBUF_SOURCES_DIR} + UPDATE_COMMAND "" + DEPENDS zlib + GIT_REPOSITORY "https://github.com/google/protobuf.git" + GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" + CONFIGURE_COMMAND + ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake + -Dprotobuf_BUILD_TESTS=OFF + -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=lib + ) -ExternalProject_Add( - protobuf - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${PROTOBUF_SOURCES_DIR} - UPDATE_COMMAND "" - DEPENDS zlib - GIT_REPOSITORY "https://github.com/google/protobuf.git" - GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" - CONFIGURE_COMMAND - ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake - -Dprotobuf_BUILD_TESTS=OFF - -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release - -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=lib -) + LIST(APPEND external_project_dependencies protobuf) +ENDIF(NOT PROTOBUF_FOUND) -LIST(APPEND external_project_dependencies protobuf) +INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 6372a9a768..0accf1a8dd 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -221,7 +221,3 @@ ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) - -MESSAGE("[Paddle] Python Executable: ${PYTHON_EXECUTABLE}") -MESSAGE("[Paddle] Python Include: ${PYTHON_INCLUDE_DIRS}") -MESSAGE("[Paddle] Python Libraries: ${PYTHON_LIBRARIES}") diff --git a/demo/image_classification/api_v2_resnet.py b/demo/image_classification/api_v2_resnet.py new file mode 100644 index 0000000000..19d2054078 --- /dev/null +++ b/demo/image_classification/api_v2_resnet.py @@ -0,0 +1,74 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.v2 as paddle + +__all__ = ['resnet_cifar10'] + + +def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + active_type=paddle.activation.Relu(), + ch_in=None): + tmp = paddle.layer.img_conv( + input=input, + filter_size=filter_size, + num_channels=ch_in, + num_filters=ch_out, + stride=stride, + padding=padding, + act=paddle.activation.Linear(), + bias_attr=False) + return paddle.layer.batch_norm(input=tmp, act=active_type) + + +def shortcut(ipt, n_in, n_out, stride): + if n_in != n_out: + return conv_bn_layer(ipt, n_out, 1, stride, 0, + paddle.activation.Linear()) + else: + return ipt + + +def basicblock(ipt, ch_out, stride): + ch_in = ch_out * 2 + tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1) + tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear()) + short = shortcut(ipt, ch_in, ch_out, stride) + return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu()) + + +def layer_warp(block_func, ipt, features, count, stride): + tmp = block_func(ipt, features, stride) + for i in range(1, count): + tmp = block_func(tmp, features, 1) + return tmp + + +def resnet_cifar10(ipt, depth=32): + # depth should be one of 20, 32, 44, 56, 110, 1202 + assert (depth - 2) % 6 == 0 + n = (depth - 2) / 6 + nStages = {16, 64, 128} + conv1 = conv_bn_layer( + ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, n, 1) + res2 = layer_warp(basicblock, res1, 32, n, 2) + res3 = layer_warp(basicblock, res2, 64, n, 2) + pool = paddle.layer.img_pool( + input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg()) + return pool diff --git a/demo/image_classification/api_v2_train.py b/demo/image_classification/api_v2_train.py new file mode 100644 index 0000000000..585f61c6fa --- /dev/null +++ b/demo/image_classification/api_v2_train.py @@ -0,0 +1,91 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import sys +import paddle.v2 as paddle +from api_v2_vgg import vgg_bn_drop +from api_v2_resnet import resnet_cifar10 + + +def main(): + datadim = 3 * 32 * 32 + classdim = 10 + + # PaddlePaddle init + paddle.init(use_gpu=True, trainer_count=1) + + image = paddle.layer.data( + name="image", type=paddle.data_type.dense_vector(datadim)) + + # Add neural network config + # option 1. resnet + net = resnet_cifar10(image, depth=32) + # option 2. 
vgg + # net = vgg_bn_drop(image) + + out = paddle.layer.fc(input=net, + size=classdim, + act=paddle.activation.Softmax()) + + lbl = paddle.layer.data( + name="label", type=paddle.data_type.integer_value(classdim)) + cost = paddle.layer.classification_cost(input=out, label=lbl) + + # Create parameters + parameters = paddle.parameters.create(cost) + + # Create optimizer + momentum_optimizer = paddle.optimizer.Momentum( + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128), + learning_rate=0.1 / 128.0, + learning_rate_decay_a=0.1, + learning_rate_decay_b=50000 * 100, + learning_rate_schedule='discexp', + batch_size=128) + + # End batch and end pass event handler + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "\nPass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + else: + sys.stdout.write('.') + sys.stdout.flush() + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.reader.batched( + paddle.dataset.cifar.test10(), batch_size=128), + reader_dict={'image': 0, + 'label': 1}) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) + + # Create trainer + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=momentum_optimizer) + trainer.train( + reader=paddle.reader.batched( + paddle.reader.shuffle( + paddle.dataset.cifar.train10(), buf_size=50000), + batch_size=128), + num_passes=5, + event_handler=event_handler, + reader_dict={'image': 0, + 'label': 1}) + + +if __name__ == '__main__': + main() diff --git a/demo/image_classification/api_v2_vgg.py b/demo/image_classification/api_v2_vgg.py new file mode 100644 index 0000000000..1e0e6b93ad --- /dev/null +++ b/demo/image_classification/api_v2_vgg.py @@ -0,0 +1,47 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.v2 as paddle + +__all__ = ['vgg_bn_drop'] + + +def vgg_bn_drop(input): + def conv_block(ipt, num_filter, groups, dropouts, num_channels=None): + return paddle.networks.img_conv_group( + input=ipt, + num_channels=num_channels, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act=paddle.activation.Relu(), + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type=paddle.pooling.Max()) + + conv1 = conv_block(input, 64, 2, [0.3, 0], 3) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = paddle.layer.dropout(input=conv5, dropout_rate=0.5) + fc1 = paddle.layer.fc(input=drop, size=512, act=paddle.activation.Linear()) + bn = paddle.layer.batch_norm( + input=fc1, + act=paddle.activation.Relu(), + layer_attr=paddle.attr.Extra(drop_rate=0.5)) + fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear()) + return fc2 diff --git a/demo/introduction/api_train_v2.py b/demo/introduction/api_train_v2.py new file mode 100644 index 0000000000..75dd65f9fc --- /dev/null +++ b/demo/introduction/api_train_v2.py @@ -0,0 +1,58 @@ +import paddle.v2 as paddle +import paddle.v2.dataset.uci_housing as uci_housing + + +def main(): + # init + paddle.init(use_gpu=False, trainer_count=1) + + # network config + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, + param_attr=paddle.attr.Param(name='w'), + size=1, + act=paddle.activation.Linear(), + bias_attr=paddle.attr.Param(name='b')) + y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) + cost = paddle.layer.regression_cost(input=y_predict, label=y) + + # create parameters + parameters = paddle.parameters.create(cost) + + # create optimizer + optimizer = paddle.optimizer.Momentum(momentum=0) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + + # event_handler to print training and testing info + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.reader.batched( + uci_housing.test(), batch_size=2), + reader_dict={'x': 0, + 'y': 1}) + if event.pass_id % 10 == 0: + print "Test %d, %s" % (event.pass_id, result.metrics) + + # training + trainer.train( + reader=paddle.reader.batched( + paddle.reader.shuffle( + uci_housing.train(), buf_size=500), + batch_size=2), + reader_dict={'x': 0, + 'y': 1}, + event_handler=event_handler, + num_passes=30) + + +if __name__ == '__main__': + main() diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py index e92e0cad14..a72ebfa980 100644 --- a/demo/mnist/api_train_v2.py +++ b/demo/mnist/api_train_v2.py @@ -2,6 +2,59 @@ import paddle.v2 as paddle import cPickle +def softmax_regression(img): + predict = paddle.layer.fc(input=img, + size=10, + act=paddle.activation.Softmax()) + return predict + + +def multilayer_perceptron(img): + # The first fully-connected layer + hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu()) + # The second fully-connected layer and the according activation function + hidden2 = paddle.layer.fc(input=hidden1, + size=64, + act=paddle.activation.Relu()) + # 
The thrid fully-connected layer, note that the hidden size should be 10, + # which is the number of unique digits + predict = paddle.layer.fc(input=hidden2, + size=10, + act=paddle.activation.Softmax()) + return predict + + +def convolutional_neural_network(img): + # first conv layer + conv_pool_1 = paddle.networks.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + num_channel=1, + pool_size=2, + pool_stride=2, + act=paddle.activation.Tanh()) + # second conv layer + conv_pool_2 = paddle.networks.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + num_channel=20, + pool_size=2, + pool_stride=2, + act=paddle.activation.Tanh()) + # The first fully-connected layer + fc1 = paddle.layer.fc(input=conv_pool_2, + size=128, + act=paddle.activation.Tanh()) + # The softmax layer, note that the hidden size should be 10, + # which is the number of unique digits + predict = paddle.layer.fc(input=fc1, + size=10, + act=paddle.activation.Softmax()) + return predict + + def main(): paddle.init(use_gpu=False, trainer_count=1) @@ -10,12 +63,14 @@ def main(): name='pixel', type=paddle.data_type.dense_vector(784)) label = paddle.layer.data( name='label', type=paddle.data_type.integer_value(10)) - hidden1 = paddle.layer.fc(input=images, size=200) - hidden2 = paddle.layer.fc(input=hidden1, size=200) - inference = paddle.layer.fc(input=hidden2, - size=10, - act=paddle.activation.Softmax()) - cost = paddle.layer.classification_cost(input=inference, label=label) + + # Here we can build the prediction network in different ways. Please + # choose one by uncomment corresponding line. + predict = softmax_regression(images) + #predict = multilayer_perceptron(images) + #predict = convolutional_neural_network(images) + + cost = paddle.layer.classification_cost(input=predict, label=label) try: with open('params.pkl', 'r') as f: @@ -23,11 +78,16 @@ def main(): except IOError: parameters = paddle.parameters.create(cost) - adam_optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.optimizer.Momentum( + learning_rate=0.1 / 128.0, + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128)) trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, - update_equation=adam_optimizer) + update_equation=optimizer) + + lists = [] def event_handler(event): if isinstance(event, paddle.event.EndIteration): @@ -43,20 +103,31 @@ def main(): cPickle.dump( parameters, f, protocol=cPickle.HIGHEST_PROTOCOL) - else: - pass + elif isinstance(event, paddle.event.EndPass): + result = trainer.test(reader=paddle.reader.batched( + paddle.dataset.mnist.test(), batch_size=128)) + print "Test with Pass %d, Cost %f, %s\n" % ( + event.pass_id, result.cost, result.metrics) + lists.append((event.pass_id, result.cost, + result.metrics['classification_error_evaluator'])) trainer.train( reader=paddle.reader.batched( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=8192), - batch_size=32), - event_handler=event_handler) + batch_size=128), + event_handler=event_handler, + num_passes=100) + + # find the best pass + best = sorted(lists, key=lambda list: float(list[1]))[0] + print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1]) + print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100) # output is a softmax layer. It returns probabilities. 
# Shape should be (100, 10) probs = paddle.infer( - output=inference, + output=predict, parameters=parameters, reader=paddle.reader.batched( paddle.reader.firstn( diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py new file mode 100644 index 0000000000..15db922b97 --- /dev/null +++ b/demo/semantic_role_labeling/api_train_v2.py @@ -0,0 +1,190 @@ +import sys +import math +import numpy as np +import paddle.v2 as paddle +import paddle.v2.dataset.conll05 as conll05 + + +def db_lstm(): + word_dict, verb_dict, label_dict = conll05.get_dict() + word_dict_len = len(word_dict) + label_dict_len = len(label_dict) + pred_len = len(verb_dict) + + mark_dict_len = 2 + word_dim = 32 + mark_dim = 5 + hidden_dim = 512 + depth = 8 + + #8 features + def d_type(size): + return paddle.data_type.integer_value_sequence(size) + + word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) + predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) + + ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) + ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) + ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) + ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) + ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) + mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) + + target = paddle.layer.data(name='target', type=d_type(label_dict_len)) + + default_std = 1 / math.sqrt(hidden_dim) / 3.0 + + emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.) + std_0 = paddle.attr.Param(initial_std=0.) + std_default = paddle.attr.Param(initial_std=default_std) + + predicate_embedding = paddle.layer.embedding( + size=word_dim, + input=predicate, + param_attr=paddle.attr.Param( + name='vemb', initial_std=default_std)) + mark_embedding = paddle.layer.embedding( + size=mark_dim, input=mark, param_attr=std_0) + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + emb_layers = [ + paddle.layer.embedding( + size=word_dim, input=x, param_attr=emb_para) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + hidden_0 = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=emb, param_attr=std_default) for emb in emb_layers + ]) + + mix_hidden_lr = 1e-3 + lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) + hidden_para_attr = paddle.attr.Param( + initial_std=default_std, learning_rate=mix_hidden_lr) + + lstm_0 = paddle.layer.lstmemory( + input=hidden_0, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + bias_attr=std_0, + param_attr=lstm_para_attr) + + #stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + for i in range(1, depth): + mix_hidden = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ]) + + lstm = paddle.layer.lstmemory( + input=mix_hidden, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + reverse=((i % 2) == 1), + bias_attr=std_0, + param_attr=lstm_para_attr) + + input_tmp = [mix_hidden, lstm] + + 
feature_out = paddle.layer.mixed( + size=label_dict_len, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ], ) + + crf_cost = paddle.layer.crf(size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param( + name='crfw', + initial_std=default_std, + learning_rate=mix_hidden_lr)) + + crf_dec = paddle.layer.crf_decoding( + name='crf_dec_l', + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param(name='crfw')) + + return crf_cost, crf_dec + + +def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header. + return np.fromfile(f, dtype=np.float32).reshape(h, w) + + +def main(): + paddle.init(use_gpu=False, trainer_count=1) + + # define network topology + crf_cost, crf_dec = db_lstm() + + # create parameters + parameters = paddle.parameters.create([crf_cost, crf_dec]) + + # create optimizer + optimizer = paddle.optimizer.Momentum( + momentum=0, + learning_rate=2e-2, + regularization=paddle.optimizer.L2Regularization(rate=8e-4), + model_average=paddle.optimizer.ModelAverage( + average_window=0.5, max_average_window=10000), ) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + + trainer = paddle.trainer.SGD(cost=crf_cost, + parameters=parameters, + update_equation=optimizer) + parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) + + trn_reader = paddle.reader.batched( + paddle.reader.shuffle( + conll05.test(), buf_size=8192), batch_size=10) + + reader_dict = { + 'word_data': 0, + 'ctx_n2_data': 1, + 'ctx_n1_data': 2, + 'ctx_0_data': 3, + 'ctx_p1_data': 4, + 'ctx_p2_data': 5, + 'verb_data': 6, + 'mark_data': 7, + 'target': 8 + } + + trainer.train( + reader=trn_reader, + event_handler=event_handler, + num_passes=10000, + reader_dict=reader_dict) + + +if __name__ == '__main__': + main() diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py new file mode 100644 index 0000000000..3a266e74ea --- /dev/null +++ b/demo/sentiment/train_v2.py @@ -0,0 +1,166 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import paddle.trainer_config_helpers.attrs as attrs +from paddle.trainer_config_helpers.poolings import MaxPooling +import paddle.v2 as paddle + + +def convolution_net(input_dim, + class_dim=2, + emb_dim=128, + hid_dim=128, + is_predict=False): + data = paddle.layer.data("word", + paddle.data_type.integer_value_sequence(input_dim)) + emb = paddle.layer.embedding(input=data, size=emb_dim) + conv_3 = paddle.networks.sequence_conv_pool( + input=emb, context_len=3, hidden_size=hid_dim) + conv_4 = paddle.networks.sequence_conv_pool( + input=emb, context_len=4, hidden_size=hid_dim) + output = paddle.layer.fc(input=[conv_3, conv_4], + size=class_dim, + act=paddle.activation.Softmax()) + lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) + cost = paddle.layer.classification_cost(input=output, label=lbl) + return cost + + +def stacked_lstm_net(input_dim, + class_dim=2, + emb_dim=128, + hid_dim=512, + stacked_num=3, + is_predict=False): + """ + A Wrapper for sentiment classification task. + This network uses bi-directional recurrent network, + consisting three LSTM layers. This configure is referred to + the paper as following url, but use fewer layrs. + http://www.aclweb.org/anthology/P15-1109 + + input_dim: here is word dictionary dimension. + class_dim: number of categories. + emb_dim: dimension of word embedding. + hid_dim: dimension of hidden layer. + stacked_num: number of stacked lstm-hidden layer. + is_predict: is predicting or not. + Some layers is not needed in network when predicting. + """ + assert stacked_num % 2 == 1 + + layer_attr = attrs.ExtraLayerAttribute(drop_rate=0.5) + fc_para_attr = attrs.ParameterAttribute(learning_rate=1e-3) + lstm_para_attr = attrs.ParameterAttribute(initial_std=0., learning_rate=1.) + para_attr = [fc_para_attr, lstm_para_attr] + bias_attr = attrs.ParameterAttribute(initial_std=0., l2_rate=0.) + relu = paddle.activation.Relu() + linear = paddle.activation.Linear() + + data = paddle.layer.data("word", + paddle.data_type.integer_value_sequence(input_dim)) + emb = paddle.layer.embedding(input=data, size=emb_dim) + + fc1 = paddle.layer.fc(input=emb, + size=hid_dim, + act=linear, + bias_attr=bias_attr) + lstm1 = paddle.layer.lstmemory( + input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr) + + inputs = [fc1, lstm1] + for i in range(2, stacked_num + 1): + fc = paddle.layer.fc(input=inputs, + size=hid_dim, + act=linear, + param_attr=para_attr, + bias_attr=bias_attr) + lstm = paddle.layer.lstmemory( + input=fc, + reverse=(i % 2) == 0, + act=relu, + bias_attr=bias_attr, + layer_attr=layer_attr) + inputs = [fc, lstm] + + fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=MaxPooling()) + lstm_last = paddle.layer.pooling(input=inputs[1], pooling_type=MaxPooling()) + output = paddle.layer.fc(input=[fc_last, lstm_last], + size=class_dim, + act=paddle.activation.Softmax(), + bias_attr=bias_attr, + param_attr=para_attr) + + lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) + cost = paddle.layer.classification_cost(input=output, label=lbl) + return cost + + +if __name__ == '__main__': + # init + paddle.init(use_gpu=True, trainer_count=4) + + # network config + print 'load dictionary...' + word_dict = paddle.dataset.imdb.word_dict() + dict_dim = len(word_dict) + class_dim = 2 + + # Please choose the way to build the network + # by uncommenting the corresponding line. 
+ cost = convolution_net(dict_dim, class_dim=class_dim) + # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) + + # create parameters + parameters = paddle.parameters.create(cost) + + # create optimizer + adam_optimizer = paddle.optimizer.Adam( + learning_rate=2e-3, + regularization=paddle.optimizer.L2Regularization(rate=8e-4), + model_average=paddle.optimizer.ModelAverage(average_window=0.5)) + + # End batch and end pass event handler + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "\nPass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + else: + sys.stdout.write('.') + sys.stdout.flush() + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.reader.batched( + lambda: paddle.dataset.imdb.test(word_dict), + batch_size=128), + reader_dict={'word': 0, + 'label': 1}) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) + + # create trainer + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=adam_optimizer) + + trainer.train( + reader=paddle.reader.batched( + paddle.reader.shuffle( + lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000), + batch_size=100), + event_handler=event_handler, + reader_dict={'word': 0, + 'label': 1}, + num_passes=10) diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py new file mode 100644 index 0000000000..74ae1cf9ec --- /dev/null +++ b/demo/seqToseq/api_train_v2.py @@ -0,0 +1,106 @@ +import os + +import paddle.v2 as paddle + +from seqToseq_net_v2 import seqToseq_net_v2 + +# Data Definiation. +# TODO:This code should be merged to dataset package. +data_dir = "./data/pre-wmt14" +src_lang_dict = os.path.join(data_dir, 'src.dict') +trg_lang_dict = os.path.join(data_dir, 'trg.dict') + +source_dict_dim = len(open(src_lang_dict, "r").readlines()) +target_dict_dim = len(open(trg_lang_dict, "r").readlines()) + + +def read_to_dict(dict_path): + with open(dict_path, "r") as fin: + out_dict = { + line.strip(): line_count + for line_count, line in enumerate(fin) + } + return out_dict + + +src_dict = read_to_dict(src_lang_dict) +trg_dict = read_to_dict(trg_lang_dict) + +train_list = os.path.join(data_dir, 'train.list') +test_list = os.path.join(data_dir, 'test.list') + +UNK_IDX = 2 +START = "" +END = "" + + +def _get_ids(s, dictionary): + words = s.strip().split() + return [dictionary[START]] + \ + [dictionary.get(w, UNK_IDX) for w in words] + \ + [dictionary[END]] + + +def train_reader(file_name): + def reader(): + with open(file_name, 'r') as f: + for line_count, line in enumerate(f): + line_split = line.strip().split('\t') + if len(line_split) != 2: + continue + src_seq = line_split[0] # one source sequence + src_ids = _get_ids(src_seq, src_dict) + + trg_seq = line_split[1] # one target sequence + trg_words = trg_seq.split() + trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words] + + # remove sequence whose length > 80 in training mode + if len(src_ids) > 80 or len(trg_ids) > 80: + continue + trg_ids_next = trg_ids + [trg_dict[END]] + trg_ids = [trg_dict[START]] + trg_ids + + yield src_ids, trg_ids, trg_ids_next + + return reader + + +def main(): + paddle.init(use_gpu=False, trainer_count=1) + + # define network topology + cost = seqToseq_net_v2(source_dict_dim, target_dict_dim) + parameters = paddle.parameters.create(cost) + optimizer = paddle.optimizer.Adam(learning_rate=1e-4) + + def event_handler(event): + if isinstance(event, 
paddle.event.EndIteration): + if event.batch_id % 10 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + + reader_dict = { + 'source_language_word': 0, + 'target_language_word': 1, + 'target_language_next_word': 2 + } + + trn_reader = paddle.reader.batched( + paddle.reader.shuffle( + train_reader("data/pre-wmt14/train/train"), buf_size=8192), + batch_size=5) + + trainer.train( + reader=trn_reader, + event_handler=event_handler, + num_passes=10000, + reader_dict=reader_dict) + + +if __name__ == '__main__': + main() diff --git a/demo/seqToseq/seqToseq_net_v2.py b/demo/seqToseq/seqToseq_net_v2.py new file mode 100644 index 0000000000..1ac95686b4 --- /dev/null +++ b/demo/seqToseq/seqToseq_net_v2.py @@ -0,0 +1,90 @@ +import paddle.v2.activation as activation +import paddle.v2.attr as attr +import paddle.v2.data_type as data_type +import paddle.v2.layer as layer +import paddle.v2.networks as networks + + +def seqToseq_net_v2(source_dict_dim, target_dict_dim): + ### Network Architecture + word_vector_dim = 512 # dimension of word vector + decoder_size = 512 # dimension of hidden unit in GRU Decoder network + encoder_size = 512 # dimension of hidden unit in GRU Encoder network + + #### Encoder + src_word_id = layer.data( + name='source_language_word', + type=data_type.integer_value_sequence(source_dict_dim)) + src_embedding = layer.embedding( + input=src_word_id, + size=word_vector_dim, + param_attr=attr.ParamAttr(name='_source_language_embedding')) + src_forward = networks.simple_gru(input=src_embedding, size=encoder_size) + src_backward = networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) + encoded_vector = layer.concat(input=[src_forward, src_backward]) + + #### Decoder + with layer.mixed(size=decoder_size) as encoded_proj: + encoded_proj += layer.full_matrix_projection(input=encoded_vector) + + backward_first = layer.first_seq(input=src_backward) + + with layer.mixed(size=decoder_size, act=activation.Tanh()) as decoder_boot: + decoder_boot += layer.full_matrix_projection(input=backward_first) + + def gru_decoder_with_attention(enc_vec, enc_proj, current_word): + + decoder_mem = layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) + + context = networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) + + with layer.mixed(size=decoder_size * 3) as decoder_inputs: + decoder_inputs += layer.full_matrix_projection(input=context) + decoder_inputs += layer.full_matrix_projection(input=current_word) + + gru_step = layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) + + with layer.mixed( + size=target_dict_dim, bias_attr=True, + act=activation.Softmax()) as out: + out += layer.full_matrix_projection(input=gru_step) + return out + + decoder_group_name = "decoder_group" + group_input1 = layer.StaticInputV2(input=encoded_vector, is_seq=True) + group_input2 = layer.StaticInputV2(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] + + trg_embedding = layer.embedding( + input=layer.data( + name='target_language_word', + type=data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) + + # For decoder equipped with attention mechanism, in training, + # 
target embeding (the groudtruth) is the data input, + # while encoded source sequence is accessed to as an unbounded memory. + # Here, the StaticInput defines a read-only memory + # for the recurrent_group. + decoder = layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) + + lbl = layer.data( + name='target_language_next_word', + type=data_type.integer_value_sequence(target_dict_dim)) + cost = layer.classification_cost(input=decoder, label=lbl) + + return cost diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst index 3718cd73a2..874dd9cb22 100644 --- a/doc/api/index_cn.rst +++ b/doc/api/index_cn.rst @@ -1,37 +1,2 @@ -API中文手册 -============ - -DataProvider API ----------------- - -.. toctree:: - :maxdepth: 1 - - data_provider/dataprovider_cn.rst - data_provider/pydataprovider2_cn.rst - -.. _api_trainer_config: - -Model Config API ----------------- - -.. toctree:: - :maxdepth: 1 - - trainer_config_helpers/optimizers.rst - trainer_config_helpers/data_sources.rst - trainer_config_helpers/layers.rst - trainer_config_helpers/activations.rst - trainer_config_helpers/poolings.rst - trainer_config_helpers/networks.rst - trainer_config_helpers/evaluators.rst - trainer_config_helpers/attrs.rst - - -Applications API ----------------- - -.. toctree:: - :maxdepth: 1 - - predict/swig_py_paddle_cn.rst +API +=== \ No newline at end of file diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst index 10c297a71d..b7f470e1f8 100644 --- a/doc/api/index_en.rst +++ b/doc/api/index_en.rst @@ -1,37 +1,10 @@ API === -DataProvider API ----------------- - -.. toctree:: - :maxdepth: 1 - - data_provider/dataprovider_en.rst - data_provider/pydataprovider2_en.rst - -.. _api_trainer_config: - Model Config API ---------------- .. toctree:: :maxdepth: 1 - trainer_config_helpers/optimizers.rst - trainer_config_helpers/data_sources.rst - trainer_config_helpers/layers.rst - trainer_config_helpers/activations.rst - trainer_config_helpers/poolings.rst - trainer_config_helpers/networks.rst - trainer_config_helpers/evaluators.rst - trainer_config_helpers/attrs.rst - - -Applications API ----------------- - -.. 
toctree:: - :maxdepth: 1 - - predict/swig_py_paddle_en.rst + v2/model_configs.rst \ No newline at end of file diff --git a/doc/api/data_provider/dataprovider_cn.rst b/doc/api/v1/data_provider/dataprovider_cn.rst similarity index 100% rename from doc/api/data_provider/dataprovider_cn.rst rename to doc/api/v1/data_provider/dataprovider_cn.rst diff --git a/doc/api/data_provider/dataprovider_en.rst b/doc/api/v1/data_provider/dataprovider_en.rst similarity index 100% rename from doc/api/data_provider/dataprovider_en.rst rename to doc/api/v1/data_provider/dataprovider_en.rst diff --git a/doc/api/data_provider/pydataprovider2_cn.rst b/doc/api/v1/data_provider/pydataprovider2_cn.rst similarity index 100% rename from doc/api/data_provider/pydataprovider2_cn.rst rename to doc/api/v1/data_provider/pydataprovider2_cn.rst diff --git a/doc/api/data_provider/pydataprovider2_en.rst b/doc/api/v1/data_provider/pydataprovider2_en.rst similarity index 100% rename from doc/api/data_provider/pydataprovider2_en.rst rename to doc/api/v1/data_provider/pydataprovider2_en.rst diff --git a/doc/api/data_provider/src/mnist_config.py b/doc/api/v1/data_provider/src/mnist_config.py similarity index 100% rename from doc/api/data_provider/src/mnist_config.py rename to doc/api/v1/data_provider/src/mnist_config.py diff --git a/doc/api/data_provider/src/mnist_provider.dict.py b/doc/api/v1/data_provider/src/mnist_provider.dict.py similarity index 100% rename from doc/api/data_provider/src/mnist_provider.dict.py rename to doc/api/v1/data_provider/src/mnist_provider.dict.py diff --git a/doc/api/data_provider/src/mnist_train.txt b/doc/api/v1/data_provider/src/mnist_train.txt similarity index 100% rename from doc/api/data_provider/src/mnist_train.txt rename to doc/api/v1/data_provider/src/mnist_train.txt diff --git a/doc/api/data_provider/src/sentimental_config.py b/doc/api/v1/data_provider/src/sentimental_config.py similarity index 100% rename from doc/api/data_provider/src/sentimental_config.py rename to doc/api/v1/data_provider/src/sentimental_config.py diff --git a/doc/api/data_provider/src/sentimental_provider.py b/doc/api/v1/data_provider/src/sentimental_provider.py similarity index 100% rename from doc/api/data_provider/src/sentimental_provider.py rename to doc/api/v1/data_provider/src/sentimental_provider.py diff --git a/doc/api/data_provider/src/sentimental_train.txt b/doc/api/v1/data_provider/src/sentimental_train.txt similarity index 100% rename from doc/api/data_provider/src/sentimental_train.txt rename to doc/api/v1/data_provider/src/sentimental_train.txt diff --git a/doc/api/data_provider/src/train.list b/doc/api/v1/data_provider/src/train.list similarity index 100% rename from doc/api/data_provider/src/train.list rename to doc/api/v1/data_provider/src/train.list diff --git a/doc/api/v1/index_cn.rst b/doc/api/v1/index_cn.rst new file mode 100644 index 0000000000..3718cd73a2 --- /dev/null +++ b/doc/api/v1/index_cn.rst @@ -0,0 +1,37 @@ +API中文手册 +============ + +DataProvider API +---------------- + +.. toctree:: + :maxdepth: 1 + + data_provider/dataprovider_cn.rst + data_provider/pydataprovider2_cn.rst + +.. _api_trainer_config: + +Model Config API +---------------- + +.. 
toctree:: + :maxdepth: 1 + + trainer_config_helpers/optimizers.rst + trainer_config_helpers/data_sources.rst + trainer_config_helpers/layers.rst + trainer_config_helpers/activations.rst + trainer_config_helpers/poolings.rst + trainer_config_helpers/networks.rst + trainer_config_helpers/evaluators.rst + trainer_config_helpers/attrs.rst + + +Applications API +---------------- + +.. toctree:: + :maxdepth: 1 + + predict/swig_py_paddle_cn.rst diff --git a/doc/api/v1/index_en.rst b/doc/api/v1/index_en.rst new file mode 100644 index 0000000000..10c297a71d --- /dev/null +++ b/doc/api/v1/index_en.rst @@ -0,0 +1,37 @@ +API +=== + +DataProvider API +---------------- + +.. toctree:: + :maxdepth: 1 + + data_provider/dataprovider_en.rst + data_provider/pydataprovider2_en.rst + +.. _api_trainer_config: + +Model Config API +---------------- + +.. toctree:: + :maxdepth: 1 + + trainer_config_helpers/optimizers.rst + trainer_config_helpers/data_sources.rst + trainer_config_helpers/layers.rst + trainer_config_helpers/activations.rst + trainer_config_helpers/poolings.rst + trainer_config_helpers/networks.rst + trainer_config_helpers/evaluators.rst + trainer_config_helpers/attrs.rst + + +Applications API +---------------- + +.. toctree:: + :maxdepth: 1 + + predict/swig_py_paddle_en.rst diff --git a/doc/api/predict/src/predict_sample.py b/doc/api/v1/predict/src/predict_sample.py similarity index 100% rename from doc/api/predict/src/predict_sample.py rename to doc/api/v1/predict/src/predict_sample.py diff --git a/doc/api/predict/swig_py_paddle_cn.rst b/doc/api/v1/predict/swig_py_paddle_cn.rst similarity index 100% rename from doc/api/predict/swig_py_paddle_cn.rst rename to doc/api/v1/predict/swig_py_paddle_cn.rst diff --git a/doc/api/predict/swig_py_paddle_en.rst b/doc/api/v1/predict/swig_py_paddle_en.rst similarity index 100% rename from doc/api/predict/swig_py_paddle_en.rst rename to doc/api/v1/predict/swig_py_paddle_en.rst diff --git a/doc/api/trainer_config_helpers/activations.rst b/doc/api/v1/trainer_config_helpers/activations.rst similarity index 100% rename from doc/api/trainer_config_helpers/activations.rst rename to doc/api/v1/trainer_config_helpers/activations.rst diff --git a/doc/api/trainer_config_helpers/attrs.rst b/doc/api/v1/trainer_config_helpers/attrs.rst similarity index 100% rename from doc/api/trainer_config_helpers/attrs.rst rename to doc/api/v1/trainer_config_helpers/attrs.rst diff --git a/doc/api/trainer_config_helpers/data_sources.rst b/doc/api/v1/trainer_config_helpers/data_sources.rst similarity index 100% rename from doc/api/trainer_config_helpers/data_sources.rst rename to doc/api/v1/trainer_config_helpers/data_sources.rst diff --git a/doc/api/trainer_config_helpers/evaluators.rst b/doc/api/v1/trainer_config_helpers/evaluators.rst similarity index 100% rename from doc/api/trainer_config_helpers/evaluators.rst rename to doc/api/v1/trainer_config_helpers/evaluators.rst diff --git a/doc/api/trainer_config_helpers/layers.rst b/doc/api/v1/trainer_config_helpers/layers.rst similarity index 100% rename from doc/api/trainer_config_helpers/layers.rst rename to doc/api/v1/trainer_config_helpers/layers.rst diff --git a/doc/api/trainer_config_helpers/networks.rst b/doc/api/v1/trainer_config_helpers/networks.rst similarity index 100% rename from doc/api/trainer_config_helpers/networks.rst rename to doc/api/v1/trainer_config_helpers/networks.rst diff --git a/doc/api/trainer_config_helpers/optimizers.rst b/doc/api/v1/trainer_config_helpers/optimizers.rst similarity index 100% rename from 
doc/api/trainer_config_helpers/optimizers.rst rename to doc/api/v1/trainer_config_helpers/optimizers.rst diff --git a/doc/api/trainer_config_helpers/poolings.rst b/doc/api/v1/trainer_config_helpers/poolings.rst similarity index 100% rename from doc/api/trainer_config_helpers/poolings.rst rename to doc/api/v1/trainer_config_helpers/poolings.rst diff --git a/doc/api/v2/model_configs.rst b/doc/api/v2/model_configs.rst new file mode 100644 index 0000000000..a9f33b33ef --- /dev/null +++ b/doc/api/v2/model_configs.rst @@ -0,0 +1,6 @@ +====== +Layers +====== + +.. automodule:: paddle.v2.layer + :members: diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md index 17d52b9e20..03119fdd74 100644 --- a/doc/design/reader/README.md +++ b/doc/design/reader/README.md @@ -4,9 +4,10 @@ At training and testing time, PaddlePaddle programs need to read data. To ease t - A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items. - A *reader creator* is a function that returns a reader function. -- A *reader* decorator is a function, which accepts one or more readers, and returns a reader. +- A *reader decorator* is a function, which accepts one or more readers, and returns a reader. +- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. -and provide frequently used reader creators and reader decorators. +and provide a function that converts a reader into a batch reader, as well as frequently used reader creators and reader decorators. ## Data Reader Interface @@ -37,9 +38,54 @@ def reader_creator_random_imageand_label(widht, height, label): return reader ``` +## Batch Reader Interface + +A *batch reader* can be any function with no parameters that creates an iterable (anything that can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple. + +Here are valid outputs: +```python +# a mini batch of three data items. Each data item consists of three columns of data, each of which is 1. +[(1, 1, 1), +(2, 2, 2), +(3, 3, 3)] + +# a mini batch of three data items, each data item is a list (single column). +[([1,1,1],), +([2,2,2],), +([3,3,3],)] +``` + +Please note that each item inside the list must be a tuple; below is an invalid output: +```python + # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). + # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1], + # or three columns of data, each of which is 1. +[[1,1,1], +[2,2,2], +[3,3,3]] +``` + +It's easy to convert a reader into a batch reader: +```python +mnist_train = paddle.dataset.mnist.train() +mnist_train_batch_reader = paddle.batch(mnist_train, 128) +``` + +It is also easy to create a custom batch reader: +```python +def custom_batch_reader(): + while True: + batch = [] + for i in xrange(128): + batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended. + yield batch + +mnist_random_image_batch_reader = custom_batch_reader +``` + ## Usage -data reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`: +A batch reader, a mapping from item(s) read to data layer, the batch size, and the number of total passes will be passed into `paddle.train`: ```python # two data layer is created: @@ -47,8 +93,8 @@ image_layer = paddle.layer.data("image", ...) label_layer = paddle.layer.data("label", ...) # ... 
- -paddle.train(paddle.dataset.mnist, {"image":0, "label":1}, 128, 10, ...) +batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128) +paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) ``` ## Data Reader Decorator @@ -64,7 +110,7 @@ Since reading data may take time and training can not proceed without data. It i Use `paddle.reader.buffered` to prefetch data: ```python -buffered_reader = paddle.reader.buffered(paddle.dataset.mnist, 100) +buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100) ``` `buffered_reader` will try to buffer (prefetch) `100` data entries. @@ -91,10 +137,10 @@ def reader_creator_bool(t): true_reader = reader_creator_bool(True) false_reader = reader_creator_bool(False) -reader = paddle.reader.compose(paddle.dataset.mnist, data_reader_creator_random_image(20, 20), true_reader, false_reader) -# Skipped 1 because paddle.dataset.mnist produces two items per data entry. +reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader) +# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry. # And we don't care second item at this time. -paddle.train(reader, {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) +paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) ``` ### Shuffle @@ -103,16 +149,20 @@ Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader Example: ```python -reader = paddle.reader.shuffle(paddle.dataset.mnist, 512) +reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512) ``` ## Q & A -### Why return only a single entry, but not a mini batch? +### Why does a reader return only a single entry, but not a mini batch? + +Always returning a single entry makes it much easier to reuse existing data readers (e.g., if an existing reader returns not a single entry but 3 entries, the training code becomes more complex because it needs to handle cases like batch size 2). + +We provide the function `paddle.batch` to turn a (single entry) reader into a batch reader. -If a mini batch is returned, data reader need to take care of batch size. But batch size is a concept for training, it makes more sense for user to specify batch size as a parameter for `train`. +### Why do we need a batch reader? Isn't it sufficient for train to take reader and batch_size as arguments? -Practically, always return a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2). +In most cases, having train take reader and batch_size as arguments would be sufficient. However, sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. ### Why use a dictionary but not a list to provide mapping? @@ -137,7 +187,7 @@ def image_reader_creator(image_path, label_path, n): # images_reader_creator creates a reader reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024) -paddle.train(reader, {"image":0, "label":1}, ...) +paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...) ``` ### How is `paddle.train` implemented @@ -145,17 +195,8 @@ 
An example implementation of paddle.train could be: ```python -def make_minibatch(reader, minibatch_size): - def ret(): - r = reader() - buf = [r.next() for x in xrange(minibatch_size)] - while len(buf) > 0: - yield buf - buf = [r.next() for x in xrange(minibatch_size)] - return ret - -def train(reader, mapping, batch_size, total_pass): +def train(batch_reader, mapping, batch_size, total_pass): for pass_idx in range(total_pass): - for mini_batch in make_minibatch(reader): # this loop will never end in online learning. + for mini_batch in batch_reader(): # this loop will never end in online learning. do_forward_backward(mini_batch, mapping) ``` diff --git a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py index f1a770ccb5..935c12bb67 100755 --- a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py +++ b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py @@ -132,7 +132,8 @@ def startPaddle(idMap={}, train_args_dict=None): logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId) if not os.path.exists(JOB_PATH_OUTPUT): os.makedirs(JOB_PATH_OUTPUT) - os.mkdir(logDir) + if not os.path.exists(logDir): + os.mkdir(logDir) copyCommand = 'cp -rf ' + JOB_PATH + \ "/" + str(trainerId) + "/data/*" + " ./data/" os.system(copyCommand) diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in index 418d718fbd..6dc48704bc 100644 --- a/doc/templates/conf.py.cn.in +++ b/doc/templates/conf.py.cn.in @@ -15,13 +15,19 @@ import sys import os, subprocess import shlex from recommonmark import parser, transform +try: + import py_paddle + import paddle + import paddle.v2 +except ImportError: + print("Must install paddle python package before generating documentation") + sys.exit(1) MarkdownParser = parser.CommonMarkParser AutoStructify = transform.AutoStructify # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, '@PROJ_ROOT@/python') templates_path = ["@PROJ_ROOT@/doc_theme/templates"] # -- General configuration ------------------------------------------------ diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in index e96c25cb75..b477f0120c 100644 --- a/doc/templates/conf.py.en.in +++ b/doc/templates/conf.py.en.in @@ -15,14 +15,20 @@ import sys import os, subprocess import shlex from recommonmark import parser, transform +try: + import py_paddle + import paddle + import paddle.v2 +except ImportError: + print("Must install paddle python package before generating documentation") + sys.exit(1) + MarkdownParser = parser.CommonMarkParser AutoStructify = transform.AutoStructify # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
-sys.path.insert(0, '@PROJ_ROOT@/python') - templates_path = ["@PROJ_ROOT@/doc_theme/templates"] # -- General configuration ------------------------------------------------ diff --git a/doc/tutorials/quick_start/index_en.md b/doc/tutorials/quick_start/index_en.md index 70dec2eb2a..ca110431cf 100644 --- a/doc/tutorials/quick_start/index_en.md +++ b/doc/tutorials/quick_start/index_en.md @@ -156,14 +156,14 @@ define_py_data_sources2(train_list='data/train.list', obj="process", args={"dictionary": word_dict}) ``` -You can refer to the following link for more detailed examples and data formats: PyDataProvider2. +You can refer to the following link for more detailed examples and data formats: PyDataProvider2. ## Network Architecture We will describe four kinds of network architectures in this section.
![](./src/PipelineNetwork_en.jpg)
First, you will build a logistic regression model. Later, you will also get chance to build other more powerful network architectures. -For more detailed documentation, you could refer to: layer documentation. All configuration files are in `demo/quick_start` directory. +For more detailed documentation, you could refer to: layer documentation. All configuration files are in `demo/quick_start` directory. ### Logistic Regression The architecture is illustrated in the following picture: @@ -366,7 +366,7 @@ You can use single layer LSTM model with Dropout for our text classification pro
## Optimization Algorithm -Optimization algorithms include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network. +Optimization algorithms include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network. ```python settings(batch_size=128, @@ -407,7 +407,7 @@ paddle train \ --init_model_path=./output/pass-0000x ``` -We will give an example of performing prediction using Recurrent model on a dataset with no labels. You can refer to Python Prediction API tutorial,or other demo for the prediction process using Python. You can also use the following script for inference or evaluation. +We will give an example of performing prediction using Recurrent model on a dataset with no labels. You can refer to Python Prediction API tutorial,or other demo for the prediction process using Python. You can also use the following script for inference or evaluation. inference script (predict.sh): diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp index a3f4bfffc9..d49b189e25 100644 --- a/paddle/api/Arguments.cpp +++ b/paddle/api/Arguments.cpp @@ -144,9 +144,7 @@ void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) { a.cpuSequenceDims = m->cast(vec->getSharedPtr()); } -float Arguments::sumCosts() const { - return paddle::Argument::sumCosts(m->outputs); -} +float Arguments::sum() const { return paddle::Argument::sum(m->outputs); } int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) { auto& a = m->getArg(idx); diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp index 538ca2999f..dcb5fe086f 100644 --- a/paddle/api/GradientMachine.cpp +++ b/paddle/api/GradientMachine.cpp @@ -142,6 +142,20 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) { } } +size_t GradientMachine::getNonStaticParameterSize() const { + return m->machine->getNonStaticParameters().size(); +} + +Parameter* GradientMachine::getNonStaticParameter(size_t i) throw(RangeError) { + auto params = m->machine->getNonStaticParameters(); + if (i < params.size()) { + return Parameter::createFromSharedPtr( + &m->machine->getNonStaticParameters()[i]); + } else { + throw RangeError(); + } +} + void GradientMachine::randParameters() { m->machine->randParameters(); } Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index d99e9a4ad4..c4f5dca26c 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -453,7 +453,7 @@ public: IVector* vec) throw(RangeError); void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError); - float sumCosts() const; + float sum() const; private: static Arguments* createByPaddleArgumentVector(void* ptr); @@ -771,6 +771,9 @@ public: size_t getParameterSize() const; Parameter* getParameter(size_t i) throw(RangeError); + size_t getNonStaticParameterSize() const; + Parameter* getNonStaticParameter(size_t i) throw(RangeError); + void randParameters(); Arguments* getLayerOutput(const std::string& layerName) const diff --git a/paddle/api/test/testArguments.py b/paddle/api/test/testArguments.py index a04a805d7a..9fe44de94e 100644 --- a/paddle/api/test/testArguments.py +++ 
b/paddle/api/test/testArguments.py @@ -22,7 +22,7 @@ class TestArguments(unittest.TestCase): args = swig_paddle.Arguments.createArguments(1) args.setSlotValue(0, m) - self.assertAlmostEqual(27.0, args.sumCosts()) + self.assertAlmostEqual(27.0, args.sum()) mat = args.getSlotValue(0) assert isinstance(mat, swig_paddle.Matrix) diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index ae016e74ea..7617af10ba 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -24,7 +24,7 @@ real getCostSum(LayerPtr& testLayer, MatrixPtr weights) { if (weights) { outArgs[0].value->dotMul(*outArgs[0].value, *weights); } - return Argument::sumCosts(outArgs); + return Argument::sum(outArgs); } real getDiffAndPrint(real newCost1, @@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer, std::vector args; args.push_back(out); - EXPECT_EQ(0, Argument::sumCosts(args)) << "testBatchState failed"; + EXPECT_EQ(0, Argument::sum(args)) << "testBatchState failed"; for (size_t seqId = 0; seqId < numSequences; ++seqId) { start[seqId] += seqLens[seqId]; } @@ -672,7 +672,7 @@ void testLayerGradKernel(TestConfig testConf, outArgs[0].value->dotMul(*testLayer->getOutput().value, *weights); } - real cost = Argument::sumCosts(outArgs); + real cost = Argument::sum(outArgs); LOG(INFO) << " cost " << cost; EXPECT_FALSE(std::isnan(cost)); diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index 178c068b93..9ef44be0cb 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -163,7 +163,7 @@ struct Argument { : sequenceStartPositions->getData(false); } - static inline real sumCosts(const std::vector& arguments) { + static inline real sum(const std::vector& arguments) { real cost = 0; for (auto& arg : arguments) { if (arg.value) { diff --git a/paddle/pserver/test/CMakeLists.txt b/paddle/pserver/test/CMakeLists.txt index 64654f67d0..6e8f9c37f6 100644 --- a/paddle/pserver/test/CMakeLists.txt +++ b/paddle/pserver/test/CMakeLists.txt @@ -10,9 +10,11 @@ add_test(NAME socket_test add_unittest_without_exec(test_ProtoServer test_ProtoServer.cpp) -add_test(NAME test_ProtoServer - COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port - ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer) +IF(NOT ON_TRAVIS) + add_test(NAME test_ProtoServer + COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port + ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer) +ENDIF(NOT ON_TRAVIS) # TODO(yuyang18): Run test_ProtoServer when with rdma # add_test(NAME test_ProtoServerRDMA diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py index a708def1d2..1c9455fab5 100644 --- a/paddle/py_paddle/util.py +++ b/paddle/py_paddle/util.py @@ -195,6 +195,12 @@ def __monkeypatch_gradient_machine__(): swig_paddle.GradientMachine.getParameters = getParameters + def getNonStaticParameters(self): + return (self.getNonStaticParameter(i) + for i in xrange(self.getNonStaticParameterSize())) + + swig_paddle.GradientMachine.getNonStaticParameters = getNonStaticParameters + def getLayerOutputs(self, layerNames): """ getLayerOutputs. get outputs of layers and return a numpy matrix dict. 
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh deleted file mode 100755 index 80f031a74e..0000000000 --- a/paddle/scripts/travis/before_install.osx.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -brew update -brew tap homebrew/science -brew install openblas swig md5sha1sum diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh index 5e6350b574..7deb3e62e8 100755 --- a/paddle/scripts/travis/build_and_test.sh +++ b/paddle/scripts/travis/build_and_test.sh @@ -2,18 +2,11 @@ source ./common.sh NPROC=1 -if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then - export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages - export PYTHONHOME=/opt/python/2.7.12 - export PATH=/opt/python/2.7.12/bin:${PATH} - cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} - NRPOC=`nproc` - make -j $NPROC - make coveralls - sudo make install -elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then - export PYTHONPATH=/usr/local/lib/python2.7/site-packages - cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} - NPROC=`sysctl -n hw.ncpu` - make -j $NPROC -fi +export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages +export PYTHONHOME=/opt/python/2.7.12 +export PATH=/opt/python/2.7.12/bin:${PATH} +cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} +NRPOC=`nproc` +make -j $NPROC +make coveralls +sudo make install diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh index 6b43cad20b..53e998ef6c 100755 --- a/paddle/scripts/travis/docs.sh +++ b/paddle/scripts/travis/docs.sh @@ -2,8 +2,12 @@ # Add set -e, cd to directory. source ./common.sh - # Compile Documentation only. +cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF ${EXTRA_CMAKE_OPTS} +mkdir output +make DESTDIR=./output install -j `nproc` +pip install ./output/usr/local/opt/paddle/share/wheels/* +rm -rf * cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS} make paddle_docs paddle_docs_cn @@ -25,26 +29,41 @@ TARGET_BRANCH="gh-pages" # Only deploy master branch to build latest documentation. SOURCE_BRANCH="master" -# If is not a Github pull request, and in master branch. -if [ "$TRAVIS_PULL_REQUEST" != "false" -o "$TRAVIS_BRANCH" != "$SOURCE_BRANCH" ]; then - exit 0 -fi - # Clone the repo to output directory git clone $REPO output cd output -# checkout github page branch -git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH +function deploy_docs() { + SOURCE_BRANCH=$1 + DIR=$2 + # If is not a Github pull request + if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then + exit 0 + fi + # If it is not watched branch. + if [ "$TRAVIS_BRANCH" != "$SOURCE_BRANCH" ]; then + return + fi -# remove old docs. mv new docs. -rm -rf doc doc_cn -mv ../doc/cn/html doc_cn -mv ../doc/en/html doc + # checkout github page branch + git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH + + mkdir -p ${DIR} + # remove old docs. mv new docs. + set +e + rm -rf ${DIR}/doc ${DIR}/doc_cn + set -e + mv ../doc/cn/html ${DIR}/doc_cn + mv ../doc/en/html ${DIR}/doc + git add . +} + +deploy_docs "master" "." +deploy_docs "develop" "./develop/" # Check is there anything changed. 
set +e -git diff --exit-code >/dev/null +git diff --cached --exit-code >/dev/null if [ $? -eq 0 ]; then echo "No changes to the output on this push; exiting." exit 0 @@ -57,7 +76,6 @@ if [ -n $SSL_KEY ]; then # Only push updated docs for github.com/PaddlePaddle/P git config user.name "Travis CI" git config user.email "paddle-dev@baidu.com" git commit -m "Deploy to GitHub Pages: ${SHA}" - # Set ssh private key openssl aes-256-cbc -K $SSL_KEY -iv $SSL_IV -in ../../paddle/scripts/travis/deploy_key.enc -out deploy_key -d chmod 600 deploy_key diff --git a/paddle/setup.py.in b/paddle/setup.py.in index 38621af065..382d5be6ec 100644 --- a/paddle/setup.py.in +++ b/paddle/setup.py.in @@ -72,6 +72,7 @@ setup(name="py_paddle", packages=['py_paddle'], include_dirs = include_dirs, install_requires = [ + 'nltk>=3.2.2', 'numpy>=1.8.0', # The numpy is required. 'protobuf>=3.0.0' # The paddle protobuf version ], diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp index 13aa28ae5d..80664fa877 100644 --- a/paddle/trainer/Tester.cpp +++ b/paddle/trainer/Tester.cpp @@ -208,7 +208,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch, return 0.0; // In this case, there is no meaning to calculate cost } - return Argument::sumCosts(outArgs); + return Argument::sum(outArgs); } void Tester::testOnePassBatch(int passId) { diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp index bd84545375..b68e29cd5e 100644 --- a/paddle/trainer/Trainer.cpp +++ b/paddle/trainer/Trainer.cpp @@ -310,7 +310,7 @@ real Trainer::checkGradient() { std::vector outArgs; trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC); - real cost = Argument::sumCosts(outArgs); + real cost = Argument::sum(outArgs); LOG(INFO) << "original cost=" << cost; trainerInternal_.getGradientMachine()->backward(); @@ -340,7 +340,7 @@ real Trainer::checkGradient() { parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara); parameter->setValueUpdated(); trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC); - real newCost1 = Argument::sumCosts(outArgs); + real newCost1 = Argument::sum(outArgs); for (size_t i = 0; i < dim; ++i) { newp[i] = oldp[i] - step * d[i]; @@ -349,7 +349,7 @@ real Trainer::checkGradient() { parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara); parameter->setValueUpdated(); trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC); - real newCost2 = Argument::sumCosts(outArgs); + real newCost2 = Argument::sum(outArgs); real trueDelta = 0.5 * (newCost1 - newCost2); real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1; @@ -575,7 +575,7 @@ real Trainer::calcGradient(const DataBatch& dataBatch, trainerInternal_.getGradientMachine()->forwardBackward( inArgs, &outArgs, PASS_TRAIN); - real cost = Argument::sumCosts(outArgs); + real cost = Argument::sum(outArgs); offset = 0; for (auto& para : parameters) { diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp index f3b465b444..4c5d4a0913 100644 --- a/paddle/trainer/TrainerInternal.cpp +++ b/paddle/trainer/TrainerInternal.cpp @@ -134,7 +134,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId, real cost = 0; { REGISTER_TIMER("sumCost"); - cost = Argument::sumCosts(*outArgs); + cost = Argument::sum(*outArgs); } if (batchId % intconfig_->log_period == 0) { diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py index bd24c68b6f..4e3c4db853 100644 --- a/python/paddle/trainer/PyDataProvider2.py +++ b/python/paddle/trainer/PyDataProvider2.py 
@@ -65,14 +65,18 @@ def sparse_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE): return InputType(dim, seq_type, DataType.SparseValue) -def index_slot(dim, seq_type=SequenceType.NO_SEQUENCE): - return InputType(dim, seq_type, DataType.Index) +def index_slot(value_range, seq_type=SequenceType.NO_SEQUENCE): + """Data type of integer. + :param value_range: range of this integer. + """ + return InputType(value_range, seq_type, DataType.Index) dense_vector = dense_slot sparse_binary_vector = sparse_non_value_slot sparse_vector = sparse_value_slot integer_value = index_slot +integer_value.__doc__ = index_slot.__doc__ def dense_vector_sequence(dim): @@ -99,8 +103,11 @@ def sparse_vector_sub_sequence(dim): return sparse_vector(dim, seq_type=SequenceType.SUB_SEQUENCE) -def integer_value_sequence(dim): - return integer_value(dim, seq_type=SequenceType.SEQUENCE) +def integer_value_sequence(value_range): + """Data type of a sequence of integer. + :param value_range: range of each element. + """ + return integer_value(value_range, seq_type=SequenceType.SEQUENCE) def integer_value_sub_sequence(dim): @@ -108,6 +115,7 @@ def integer_value_sub_sequence(dim): integer_sequence = integer_value_sequence +integer_sequence.__doc__ = integer_value_sequence.__doc__ class SingleSlotWrapper(object): diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index d548d1adaa..f663ef735d 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -20,18 +20,19 @@ import event import data_type import topology import data_feeder +import networks from . import dataset from . import reader import attr import pooling -import inferencer +import inference import networks import py_paddle.swig_paddle as api __all__ = [ 'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer', 'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader', - 'topology', 'networks', 'inferencer', 'infer' + 'topology', 'networks', 'infer' ] @@ -43,4 +44,4 @@ def init(**kwargs): api.initPaddle(*args) -infer = inferencer.infer +infer = inference.infer diff --git a/python/paddle/v2/config_base.py b/python/paddle/v2/config_base.py index 035f96b0f2..fa2ccec6c3 100644 --- a/python/paddle/v2/config_base.py +++ b/python/paddle/v2/config_base.py @@ -22,6 +22,7 @@ class Layer(object): def __init__(self, name=None, parent_layers=None): assert isinstance(parent_layers, dict) self.name = name + self.__contex__ = {} self.__parent_layers__ = parent_layers def to_proto(self, context): @@ -39,16 +40,38 @@ class Layer(object): self.__parent_layers__[layer_name]) kwargs[layer_name] = v1_layer - if self.name is None: + if self.context_name() is None: return self.to_proto_impl(**kwargs) - elif self.name not in context: - context[self.name] = self.to_proto_impl(**kwargs) - - return context[self.name] + elif self.context_name() not in context: + context[self.context_name()] = self.to_proto_impl(**kwargs) + self.__contex__ = context + if self.use_context_name(): + return context[self.context_name()] + else: + return context[self.name] def to_proto_impl(self, **kwargs): raise NotImplementedError() + def context_name(self): + """ + Context name means the context which stores `to_proto_impl` result. + If multiple layer share same context_name, the `to_proto_impl` of them + will be invoked only once. + """ + return self.name + + def use_context_name(self): + return False + + def calculate_size(self): + """ + lazy calculate size of the layer, should be called when to_proto_impl of + this layer is called. 
+ :return: + """ + return self.__contex__[self.context_name()].size + def __convert_to_v2__(method_name, parent_names, is_default_name=True): if is_default_name: diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py index 9647e98503..6c371d3c9b 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/v2/dataset/__init__.py @@ -1,3 +1,28 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import mnist +import imikolov +import imdb +import cifar +import movielens +import conll05 +import uci_housing +import sentiment +import wmt14 -__all__ = ['mnist'] +__all__ = [ + 'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment', + 'uci_housing', 'wmt14' +] diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index 77c54bd268..5c6f5d8556 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -1,6 +1,20 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ CIFAR dataset: https://www.cs.toronto.edu/~kriz/cifar.html """ + import cPickle import itertools import numpy diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index fcf4437ffa..3021b68ddb 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -1,7 +1,22 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ import requests import hashlib import os import shutil +import sys __all__ = ['DATA_HOME', 'download', 'md5file'] @@ -27,9 +42,24 @@ def download(url, module_name, md5sum): filename = os.path.join(dirname, url.split('/')[-1]) if not (os.path.exists(filename) and md5file(filename) == md5sum): + print "Cache file %s not found, downloading %s" % (filename, url) r = requests.get(url, stream=True) - with open(filename, 'w') as f: - shutil.copyfileobj(r.raw, f) + total_length = r.headers.get('content-length') + + if total_length is None: + with open(filename, 'w') as f: + shutil.copyfileobj(r.raw, f) + else: + with open(filename, 'w') as f: + dl = 0 + total_length = int(total_length) + for data in r.iter_content(chunk_size=4096): + dl += len(data) + f.write(data) + done = int(50 * dl / total_length) + sys.stdout.write("\r[%s%s]" % ('=' * done, + ' ' * (50 - done))) + sys.stdout.flush() return filename diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py new file mode 100644 index 0000000000..e96a701c1a --- /dev/null +++ b/python/paddle/v2/dataset/conll05.py @@ -0,0 +1,196 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tarfile +import gzip +import itertools +from common import download + +__all__ = ['test', 'get_dict', 'get_embedding'] +""" +Conll 2005 dataset. Paddle semantic role labeling Book and demo use this +dataset as an example. Because Conll 2005 is not free to the public, the default +download URL is the test set of Conll 2005 (which is public). Users can change +URL and MD5 to their Conll dataset. +""" + +DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' +DATA_MD5 = '387719152ae52d60422c016e92a742fc' +WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' +WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' +VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' +VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' +TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' +TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' +EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' + +UNK_IDX = 0 + + +def load_dict(filename): + d = dict() + with open(filename, 'r') as f: + for i, line in enumerate(f): + d[line.strip()] = i + return d + + +def corpus_reader(data_path, words_name, props_name): + """ + Read one corpus. It returns an iterator. Each element of + this iterator is a tuple including sentence and labels. The sentence + consists of a list of word IDs. The labels include a list of label IDs. + :return: an iterator of data.
+ :rtype: iterator + """ + + def reader(): + tf = tarfile.open(data_path) + wf = tf.extractfile(words_name) + pf = tf.extractfile(props_name) + with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile( + fileobj=pf) as props_file: + sentences = [] + labels = [] + one_seg = [] + for word, label in itertools.izip(words_file, props_file): + word = word.strip() + label = label.strip().split() + + if len(label) == 0: # end of sentence + for i in xrange(len(one_seg[0])): + a_kind_lable = [x[i] for x in one_seg] + labels.append(a_kind_lable) + + if len(labels) >= 1: + verb_list = [] + for x in labels[0]: + if x != '-': + verb_list.append(x) + + for i, lbl in enumerate(labels[1:]): + cur_tag = 'O' + is_in_bracket = False + lbl_seq = [] + verb_word = '' + for l in lbl: + if l == '*' and is_in_bracket == False: + lbl_seq.append('O') + elif l == '*' and is_in_bracket == True: + lbl_seq.append('I-' + cur_tag) + elif l == '*)': + lbl_seq.append('I-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') != -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') == -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = True + else: + raise RuntimeError('Unexpected label: %s' % + l) + + yield sentences, verb_list[i], lbl_seq + + sentences = [] + labels = [] + one_seg = [] + else: + sentences.append(word) + one_seg.append(label) + + pf.close() + wf.close() + tf.close() + + return reader + + +def reader_creator(corpus_reader, + word_dict=None, + predicate_dict=None, + label_dict=None): + def reader(): + for sentence, predicate, labels in corpus_reader(): + + sen_len = len(sentence) + + verb_index = labels.index('B-V') + mark = [0] * len(labels) + if verb_index > 0: + mark[verb_index - 1] = 1 + ctx_n1 = sentence[verb_index - 1] + else: + ctx_n1 = 'bos' + + if verb_index > 1: + mark[verb_index - 2] = 1 + ctx_n2 = sentence[verb_index - 2] + else: + ctx_n2 = 'bos' + + mark[verb_index] = 1 + ctx_0 = sentence[verb_index] + + if verb_index < len(labels) - 1: + mark[verb_index + 1] = 1 + ctx_p1 = sentence[verb_index + 1] + else: + ctx_p1 = 'eos' + + if verb_index < len(labels) - 2: + mark[verb_index + 2] = 1 + ctx_p2 = sentence[verb_index + 2] + else: + ctx_p2 = 'eos' + + word_idx = [word_dict.get(w, UNK_IDX) for w in sentence] + + ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len + ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len + ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len + ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len + ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len + + pred_idx = [predicate_dict.get(predicate)] * sen_len + label_idx = [label_dict.get(w) for w in labels] + + yield word_idx, ctx_n2_idx, ctx_n1_idx, \ + ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx + + return reader + + +def get_dict(): + word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)) + verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)) + label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)) + return word_dict, verb_dict, label_dict + + +def get_embedding(): + return download(EMB_URL, 'conll05st', EMB_MD5) + + +def test(): + word_dict, verb_dict, label_dict = get_dict() + reader = corpus_reader( + download(DATA_URL, 'conll05st', DATA_MD5), + words_name='conll05st-release/test.wsj/words/test.wsj.words.gz', + props_name='conll05st-release/test.wsj/props/test.wsj.props.gz') + return 
reader_creator(reader, word_dict, verb_dict, label_dict) diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index 433e37380f..f27756a38a 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -1,6 +1,3 @@ -# /usr/bin/env python -# -*- coding:utf-8 -*- - # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,6 +14,7 @@ """ IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz """ + import paddle.v2.dataset.common import tarfile import Queue @@ -118,3 +116,8 @@ def test(word_idx): return reader_creator( re.compile("aclImdb/test/pos/.*\.txt$"), re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000) + + +def word_dict(): + return build_dict( + re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index b3791ddad6..deb556942d 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -1,10 +1,23 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/ """ import paddle.v2.dataset.common import tarfile -__all__ = ['train', 'test'] +__all__ = ['train', 'test', 'build_dict'] URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' MD5 = '30177ea32e27c525793142b6bf2c8e2d' @@ -24,7 +37,9 @@ def word_count(f, word_freq=None): return word_freq -def build_dict(train_filename, test_filename): +def build_dict(): + train_filename = './simple-examples/data/ptb.train.txt' + test_filename = './simple-examples/data/ptb.valid.txt' with tarfile.open( paddle.v2.dataset.common.download( paddle.v2.dataset.imikolov.URL, 'imikolov', @@ -32,27 +47,22 @@ def build_dict(train_filename, test_filename): trainf = tf.extractfile(train_filename) testf = tf.extractfile(test_filename) word_freq = word_count(testf, word_count(trainf)) + if '' in word_freq: + # remove for now, since we will set it as last index + del word_freq[''] TYPO_FREQ = 50 word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items()) - dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0])) - words, _ = list(zip(*dictionary)) + word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*word_freq_sorted)) word_idx = dict(zip(words, xrange(len(words)))) word_idx[''] = len(words) return word_idx -word_idx = {} - - -def reader_creator(filename, n): - global word_idx - if len(word_idx) == 0: - word_idx = build_dict('./simple-examples/data/ptb.train.txt', - './simple-examples/data/ptb.valid.txt') - +def reader_creator(filename, word_idx, n): def reader(): with tarfile.open( paddle.v2.dataset.common.download( @@ -71,9 +81,9 @@ def reader_creator(filename, n): return reader -def train(n): - return reader_creator('./simple-examples/data/ptb.train.txt', n) +def train(word_idx, n): + 
return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n) -def test(n): - return reader_creator('./simple-examples/data/ptb.valid.txt', n) +def test(word_idx, n): + return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n) diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index ebcdff78b3..6a621a2aaa 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -1,3 +1,16 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ MNIST dataset. """ diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index dcffcff2f5..c22bcfa38b 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import zipfile from common import download import re diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py new file mode 100644 index 0000000000..cbd08fa736 --- /dev/null +++ b/python/paddle/v2/dataset/sentiment.py @@ -0,0 +1,126 @@ +# /usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The script fetch and preprocess movie_reviews data set + +that provided by NLTK +""" + +import common +import collections +import nltk +import numpy as np +from itertools import chain +from nltk.corpus import movie_reviews + +__all__ = ['train', 'test', 'get_word_dict'] +NUM_TRAINING_INSTANCES = 1600 +NUM_TOTAL_INSTANCES = 2000 + + +def download_data_if_not_yet(): + """ + Download the data set, if the data set is not download. 
+ """ + try: + # make sure that nltk can find the data + if common.DATA_HOME not in nltk.data.path: + nltk.data.path.append(common.DATA_HOME) + movie_reviews.categories() + except LookupError: + print "Downloading movie_reviews data set, please wait....." + nltk.download('movie_reviews', download_dir=common.DATA_HOME) + print "Download data set success....." + print "Path is " + nltk.data.find('corpora/movie_reviews').path + + +def get_word_dict(): + """ + Sorted the words by the frequency of words which occur in sample + :return: + words_freq_sorted + """ + words_freq_sorted = list() + word_freq_dict = collections.defaultdict(int) + download_data_if_not_yet() + + for category in movie_reviews.categories(): + for field in movie_reviews.fileids(category): + for words in movie_reviews.words(field): + word_freq_dict[words] += 1 + words_sort_list = word_freq_dict.items() + words_sort_list.sort(cmp=lambda a, b: b[1] - a[1]) + for index, word in enumerate(words_sort_list): + words_freq_sorted.append((word[0], index)) + return words_freq_sorted + + +def sort_files(): + """ + Sorted the sample for cross reading the sample + :return: + files_list + """ + files_list = list() + neg_file_list = movie_reviews.fileids('neg') + pos_file_list = movie_reviews.fileids('pos') + files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list))) + return files_list + + +def load_sentiment_data(): + """ + Load the data set + :return: + data_set + """ + data_set = list() + download_data_if_not_yet() + words_ids = dict(get_word_dict()) + for sample_file in sort_files(): + words_list = list() + category = 0 if 'neg' in sample_file else 1 + for word in movie_reviews.words(sample_file): + words_list.append(words_ids[word.lower()]) + data_set.append((words_list, category)) + return data_set + + +def reader_creator(data): + """ + Reader creator, generate an iterator for data set + :param data: + train data set or test data set + """ + for each in data: + yield each[0], each[1] + + +def train(): + """ + Default train set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[0:NUM_TRAINING_INSTANCES]) + + +def test(): + """ + Default test set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[NUM_TRAINING_INSTANCES:]) diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py index a2af45ecf5..e0e18229da 100644 --- a/python/paddle/v2/dataset/tests/cifar_test.py +++ b/python/paddle/v2/dataset/tests/cifar_test.py @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle.v2.dataset.cifar import unittest diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py index 7d8406171b..5babcef0eb 100644 --- a/python/paddle/v2/dataset/tests/common_test.py +++ b/python/paddle/v2/dataset/tests/common_test.py @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle.v2.dataset.common import unittest import tempfile diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py index e887af1663..c4d82f2689 100644 --- a/python/paddle/v2/dataset/tests/imdb_test.py +++ b/python/paddle/v2/dataset/tests/imdb_test.py @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle.v2.dataset.imdb import unittest import re diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py index 9b1748eaaa..009e55243a 100644 --- a/python/paddle/v2/dataset/tests/imikolov_test.py +++ b/python/paddle/v2/dataset/tests/imikolov_test.py @@ -1,6 +1,8 @@ import paddle.v2.dataset.imikolov import unittest +WORD_DICT = paddle.v2.dataset.imikolov.build_dict() + class TestMikolov(unittest.TestCase): def check_reader(self, reader, n): @@ -9,11 +11,15 @@ class TestMikolov(unittest.TestCase): def test_train(self): n = 5 - self.check_reader(paddle.v2.dataset.imikolov.train(n), n) + self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n) def test_test(self): n = 5 - self.check_reader(paddle.v2.dataset.imikolov.test(n), n) + self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n) + + def test_total(self): + _, idx = zip(*WORD_DICT.items()) + self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1) if __name__ == '__main__': diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py index b4408cc2f5..1d344cac3e 100644 --- a/python/paddle/v2/dataset/tests/mnist_test.py +++ b/python/paddle/v2/dataset/tests/mnist_test.py @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import paddle.v2.dataset.mnist import unittest diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py new file mode 100644 index 0000000000..4074052907 --- /dev/null +++ b/python/paddle/v2/dataset/tests/test_sentiment.py @@ -0,0 +1,55 @@ +# /usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import nltk +import paddle.v2.dataset.sentiment as st +from nltk.corpus import movie_reviews + + +class TestSentimentMethods(unittest.TestCase): + def test_get_word_dict(self): + word_dict = st.get_word_dict()[0:10] + test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3), + (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7), + (u'is', 8), (u'in', 9)] + for idx, each in enumerate(word_dict): + self.assertEqual(each, test_word_list[idx]) + self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path) + + def test_sort_files(self): + last_label = '' + for sample_file in st.sort_files(): + current_label = sample_file.split("/")[0] + self.assertNotEqual(current_label, last_label) + last_label = current_label + + def test_data_set(self): + data_set = st.load_sentiment_data() + last_label = -1 + for each in st.test(): + self.assertNotEqual(each[1], last_label) + last_label = each[1] + self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES) + self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES) + self.assertEqual( + len(list(st.test())), + (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py new file mode 100644 index 0000000000..b5a0537af6 --- /dev/null +++ b/python/paddle/v2/dataset/uci_housing.py @@ -0,0 +1,86 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import os +from common import download + +__all__ = ['train', 'test'] + +URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data' +MD5 = 'd4accdce7a25600298819f8e28e8d593' +feature_names = [ + 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', + 'PTRATIO', 'B', 'LSTAT' +] + +UCI_TRAIN_DATA = None +UCI_TEST_DATA = None + + +def feature_range(maximums, minimums): + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt + fig, ax = plt.subplots() + feature_num = len(maximums) + ax.bar(range(feature_num), maximums - minimums, color='r', align='center') + ax.set_title('feature scale') + plt.xticks(range(feature_num), feature_names) + plt.xlim([-1, feature_num]) + fig.set_figheight(6) + fig.set_figwidth(10) + if not os.path.exists('./image'): + os.makedirs('./image') + fig.savefig('image/ranges.png', dpi=48) + plt.close(fig) + + +def load_data(filename, feature_num=14, ratio=0.8): + global UCI_TRAIN_DATA, UCI_TEST_DATA + if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None: + return + + data = np.fromfile(filename, sep=' ') + data = data.reshape(data.shape[0] / feature_num, feature_num) + maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( + axis=0) / data.shape[0] + feature_range(maximums[:-1], minimums[:-1]) + for i in xrange(feature_num - 1): + data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) + offset = int(data.shape[0] * ratio) + UCI_TRAIN_DATA = data[:offset] + UCI_TEST_DATA = data[offset:] + + +def train(): + global UCI_TRAIN_DATA + load_data(download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TRAIN_DATA: + yield d[:-1], d[-1:] + + return reader + + +def test(): + global UCI_TEST_DATA + load_data(download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TEST_DATA: + yield d[:-1], d[-1:] + + return reader diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py new file mode 100644 index 0000000000..9904848b5d --- /dev/null +++ b/python/paddle/v2/dataset/wmt14.py @@ -0,0 +1,142 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +wmt14 dataset +""" +import paddle.v2.dataset.common +import tarfile +import os.path +import itertools + +__all__ = ['train', 'test', 'build_dict'] + +URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' +MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' +URL_TRAIN = 'http://localhost:8000/train.tgz' +MD5_TRAIN = '72de99da2830ea5a3a2c4eb36092bbc7' + + +def word_count(f, word_freq=None): + add = paddle.v2.dataset.common.dict_add + if word_freq == None: + word_freq = {} + + for l in f: + for w in l.strip().split(): + add(word_freq, w) + add(word_freq, '') + add(word_freq, '') + + return word_freq + + +def get_word_dix(word_freq): + TYPO_FREQ = 50 + word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items()) + word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*word_freq_sorted)) + word_idx = dict(zip(words, xrange(len(words)))) + word_idx[''] = len(words) + return word_idx + + +def get_word_freq(train, dev): + word_freq = word_count(train, word_count(dev)) + if '' in word_freq: + # remove for now, since we will set it as last index + del word_freq[''] + return word_freq + + +def build_dict(): + base_dir = './wmt14-data' + train_en_filename = base_dir + '/train/train.en' + train_fr_filename = base_dir + '/train/train.fr' + dev_en_filename = base_dir + '/dev/ntst1213.en' + dev_fr_filename = base_dir + '/dev/ntst1213.fr' + + if not os.path.exists(train_en_filename) or not os.path.exists( + train_fr_filename): + with tarfile.open( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', + MD5_TRAIN)) as tf: + tf.extractall(base_dir) + + if not os.path.exists(dev_en_filename) or not os.path.exists( + dev_fr_filename): + with tarfile.open( + paddle.v2.dataset.common.download(URL_DEV_TEST, 'wmt14', + MD5_DEV_TEST)) as tf: + tf.extractall(base_dir) + + f_en = open(train_en_filename) + f_fr = open(train_fr_filename) + f_en_dev = open(dev_en_filename) + f_fr_dev = open(dev_fr_filename) + + word_freq_en = get_word_freq(f_en, f_en_dev) + word_freq_fr = get_word_freq(f_fr, f_fr_dev) + + f_en.close() + f_fr.close() + f_en_dev.close() + f_fr_dev.close() + + return get_word_dix(word_freq_en), get_word_dix(word_freq_fr) + + +def reader_creator(directory, path_en, path_fr, URL, MD5, dict_en, dict_fr): + def reader(): + if not os.path.exists(path_en) or not os.path.exists(path_fr): + with tarfile.open( + paddle.v2.dataset.common.download(URL, 'wmt14', MD5)) as tf: + tf.extractall(directory) + + f_en = open(path_en) + f_fr = open(path_fr) + UNK_en = dict_en[''] + UNK_fr = dict_fr[''] + + for en, fr in itertools.izip(f_en, f_fr): + src_ids = [dict_en.get(w, UNK_en) for w in en.strip().split()] + tar_ids = [ + dict_fr.get(w, UNK_fr) + for w in [''] + fr.strip().split() + [''] + ] + + # remove sequence whose length > 80 in training mode + if len(src_ids) == 0 or len(tar_ids) <= 1 or len( + src_ids) > 80 or len(tar_ids) > 80: + continue + + yield src_ids, tar_ids[:-1], tar_ids[1:] + + f_en.close() + f_fr.close() + + return reader + + +def train(dict_en, dict_fr): + directory = './wmt14-data' + return reader_creator(directory, directory + '/train/train.en', + directory + '/train/train.fr', URL_TRAIN, MD5_TRAIN, + dict_en, dict_fr) + + +def test(dict_en, dict_fr): + directory = './wmt14-data' + return reader_creator(directory, directory + '/dev/ntst1213.en', + directory + '/dev/ntst1213.fr', URL_DEV_TEST, + MD5_DEV_TEST, dict_en, dict_fr) diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py index a78bcf076c..a429e36b63 
100644 --- a/python/paddle/v2/event.py +++ b/python/paddle/v2/event.py @@ -34,8 +34,9 @@ class WithMetric(object): class TestResult(WithMetric): - def __init__(self, evaluator): + def __init__(self, evaluator, cost): super(TestResult, self).__init__(evaluator) + self.cost = cost class BeginPass(object): diff --git a/python/paddle/v2/inferencer.py b/python/paddle/v2/inference.py similarity index 98% rename from python/paddle/v2/inferencer.py rename to python/paddle/v2/inference.py index ac03b016c9..476fd3fa45 100644 --- a/python/paddle/v2/inferencer.py +++ b/python/paddle/v2/inference.py @@ -5,7 +5,7 @@ from data_feeder import DataFeeder import itertools import numpy -__all__ = ['Inference', 'infer'] +__all__ = ['infer'] class Inference(object): diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py index 67111f1315..711226d659 100644 --- a/python/paddle/v2/layer.py +++ b/python/paddle/v2/layer.py @@ -12,72 +12,42 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Before this new package paddle.v2.layer, users would need to use functions -in paddle.trainer_config_helpers.layers to configure networks. - -The Old Way: -========= -This old way requires that the creation of a network be defined in a Python -function, say network_config, and that this Python function being passed to -paddle.trainer_config_helpers.parse_network_config for the creation of -protobuf message description of this network. - -```python -def network_config(): - img = paddle.trainer_config_helpers.data_layer(name="pixel", size=784) - inference = paddle.trainer_config_helpers.fc_layer( - input=img, - size=10, - act=paddle.trainer_config_helpers.SoftmaxActivation()) - cost = paddle.trainer_config_helpers.classification_cost( - input=inference, - label=paddle.trainer_config_helpers.data_layer(name="label", size=10)) - -proto_desc = parse_network_config(network_config) -``` - -When parse_network_config executes network_config, those layer definition -functions like data_layer and fc_layer would change some Python global variables, -so that after the execution, parse_network_config could collect information from -these global variables and generates the protobuf message. - - - -The New Way: -========= -In this PR, we define a function in paddle.v2.layer which creates a Python -class for each layer creation function in paddle.trainer_config_helpers.layers. -Users can use create a network as follows: - -```python -img = paddle.v2.layer.data(name="pixel", size=784) -inference = paddle.v2.layer.fc(input=img, size=10, act=paddle.v2.layer.Softmax()) -cost = paddle.v2.layer.classification( - input=inference, - label=paddle.v2.layer.data(name="label", size=10)) - -parameters = paddle.v2.parameters.create(cost) -``` - -This new way doesn't require those invocations to layer definition functions -to be in a Python function but could be anywhere. - -Also, the creation of a protobuf message is hidden in the invocation of -paddle.v2.parameters.create, no longer exposed to users. +`paddle.v2.layer` is a part of model config packages in paddle.v2. In API v2, +we want to make Paddle a plain Python package. The model config package defined +the way how to configure a neural network topology in Paddle Python code. + +The primary usage shows below. + +.. 
code-block:: python + + import paddle.v2 as paddle + + img = paddle.layer.data(name='img', type=paddle.data_type.dense_vector(784)) + hidden = paddle.layer.fc(input=img, size=200) + prediction = paddle.layer.fc(input=hidden, size=10, + act=paddle.activation.Softmax()) + + # use prediction instance where needed. + parameters = paddle.v2.parameters.create(cost) """ + +import collections +import inspect from config_base import Layer, __convert_to_v2__ import paddle.trainer_config_helpers as conf_helps from paddle.trainer_config_helpers.config_parser_utils import \ parse_network_config as __parse__ - -from paddle.trainer_config_helpers.default_decorators import wrap_name_default from paddle.trainer_config_helpers.default_decorators import wrap_act_default from paddle.trainer_config_helpers.default_decorators import \ wrap_bias_attr_default +from paddle.trainer_config_helpers.default_decorators import wrap_name_default from paddle.trainer_config_helpers.layers import layer_support +from paddle.trainer.config_parser import \ + RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \ + RecurrentLayerGroupEnd, model_type -import data_type import activation +import data_type __all__ = ['parse_network', 'data'] @@ -130,6 +100,137 @@ class DataLayerV2(Layer): return getattr(conf_helps, self.__method_name__)(name=self.name, **args) +class WithExtraParent(Layer): + def extra_parent(self): + return self.__extra_parent__ + + def __init__(self, name=None, parent_layers=None): + self.__extra_parent__ = [] + super(WithExtraParent, self).__init__( + name=name, parent_layers=parent_layers) + + def append_extra_parent(self, parent): + self.__extra_parent__.append(parent) + + def to_proto(self, context): + """ + function to set proto attribute + """ + kwargs = dict() + for p in self.__extra_parent__: + p.to_proto(context=context) + + for layer_name in self.__parent_layers__: + if not isinstance(self.__parent_layers__[layer_name], + collections.Sequence): + v1_layer = self.__parent_layers__[layer_name].to_proto( + context=context) + else: + v1_layer = map(lambda x: x.to_proto(context=context), + self.__parent_layers__[layer_name]) + kwargs[layer_name] = v1_layer + + if self.context_name() is None: + return self.to_proto_impl(context=context, **kwargs) + elif self.context_name() not in context: + context[self.context_name()] = self.to_proto_impl( + context=context, **kwargs) + + if self.use_context_name(): + return context[self.context_name()] + else: + return context[self.name] + + +class MemoryV2(WithExtraParent): + def __init__(self, name, **kwargs): + self.name = name + super(MemoryV2, self).__init__(name=name, parent_layers=dict()) + self.__kwargs__ = kwargs + self.__boot_layer_name__ = None + if 'boot_layer' in kwargs: + begin_of_current_rnn = [] + # TODO(yuyang18): Fix inspect, it could be wrong when user invoke a + # function inside step. 
+ st = inspect.stack() + for i in xrange(len(st)): + locs = inspect.stack()[i][0].f_locals + keys = locs.keys() + for key in keys: + val = locs[key] + if isinstance(val, RecurrentLayerInput): + begin_of_current_rnn.append(val) + elif isinstance(val, collections.Sequence): + for v in val: + if isinstance(v, RecurrentLayerInput): + begin_of_current_rnn.append(v) + + if begin_of_current_rnn: + break + assert begin_of_current_rnn is not None + for extra in begin_of_current_rnn: + self.append_extra_parent(extra) + assert isinstance(extra, WithExtraParent) + extra.append_extra_parent(kwargs['boot_layer']) + self.__boot_layer_name__ = kwargs['boot_layer'].name + + def to_proto_impl(self, context, **kwargs): + args = dict() + for each in kwargs: + args[each] = kwargs[each] + for each in self.__kwargs__: + args[each] = self.__kwargs__[each] + + if self.__boot_layer_name__ is not None: + args['boot_layer'] = context[self.__boot_layer_name__] + + size = args.get('size', None) + if size is not None: + if callable(size): + real_size = size() + else: + real_size = size + args['size'] = real_size + return conf_helps.memory(name=self.name, **args) + + def context_name(self): + return self.name + "#memory" + + def use_context_name(self): + """ + memory layer will have the same name with some layer + :return: + """ + return True + + +class LayerOutputV2(Layer): + """ + LayerOutputV2 is used to store the result of LayerOutput in v1 api. + It will not store it's parents because layer_output has been parsed already. + """ + + def __init__(self, layer_output): + assert isinstance(layer_output, conf_helps.LayerOutput) + self.layer_output = layer_output + super(LayerOutputV2, self).__init__( + name=layer_output.name, parent_layers=dict()) + + def to_proto_impl(self): + return self.layer_output + + +class StaticInputV2(object): + def __init__(self, input, is_seq=False, size=None): + assert isinstance(input, LayerV2) + self.name = input.name + self.input = input + self.is_seq = is_seq + self.size = size + # TODO(add size check) + # assert input.size is not None or size is not None + + class MixedLayerV2(Layer): """ This class is use to support `with` grammar. 
If not, the following code @@ -161,7 +262,6 @@ class MixedLayerV2(Layer): other_kwargs['act'] = act other_kwargs['bias_attr'] = bias_attr other_kwargs['layer_attr'] = layer_attr - parent_layers = {"input": self.__inputs__} super(MixedLayerV2, self).__init__(name, parent_layers) self.__other_kwargs__ = other_kwargs @@ -171,7 +271,7 @@ class MixedLayerV2(Layer): self.__inputs__.append(other) return self else: - raise MixedLayerTypeV2.AddToSealedMixedLayerExceptionV2() + raise MixedLayerV2.AddToSealedMixedLayerExceptionV2() def __enter__(self): assert len(self.__inputs__) == 0 @@ -186,6 +286,13 @@ class MixedLayerV2(Layer): args[each] = kwargs[each] for each in self.__other_kwargs__: args[each] = self.__other_kwargs__[each] + size = args.get('size', None) + if size is not None: + if callable(size): + real_size = size() + else: + real_size = size + args['size'] = real_size return getattr(conf_helps, self.__method_name__)(**args) @@ -202,14 +309,51 @@ def mixed(size=0, return MixedLayerV2(size, input, name, act, bias_attr, layer_attr) +class RecurrentLayerInput(WithExtraParent): + def __init__(self, recurrent_name, index, parent_layers): + assert len(parent_layers) == 1 + self.__parents__ = parent_layers.values()[0] + super(RecurrentLayerInput, self).__init__( + name=self.__parents__[index].name, parent_layers=parent_layers) + self.__recurrent_name__ = recurrent_name + + def context_name(self): + return self.__recurrent_name__ + ".begin" + + def to_proto_impl(self, context, **kwargs): + model_type('recurrent_nn') + RecurrentLayerGroupWithoutOutLinksBegin( + name=self.__recurrent_name__, + in_links=map(lambda x: x.name, self.__parents__)) + return self + + +class RecurrentLayerOutput(Layer): + def __init__(self, recurrent_name, index, parent_layers): + assert len(parent_layers) == 1 + self.__parents__ = parent_layers.values()[0] + super(RecurrentLayerOutput, self).__init__( + name=self.__parents__[index].name, parent_layers=parent_layers) + self.__recurrent_name__ = recurrent_name + + def context_name(self): + return self.__recurrent_name__ + ".end" + + def to_proto_impl(self, **kwargs): + for l in self.__parents__: + RecurrentLayerGroupSetOutLink(l.name) + RecurrentLayerGroupEnd(name=self.__recurrent_name__) + + LayerV2 = Layer data = DataLayerV2 AggregateLevel = conf_helps.layers.AggregateLevel ExpandLevel = conf_helps.layers.ExpandLevel +memory = MemoryV2 def __layer_name_mapping__(inname): - if inname in ['data_layer', 'memory', 'mixed_layer']: + if inname in ['data_layer', 'memory', 'mixed_layer', 'recurrent_group']: # Do Not handle these layers return elif inname == 'maxid_layer': @@ -231,8 +375,10 @@ def __layer_name_mapping__(inname): def __layer_name_mapping_parent_names__(inname): all_args = getattr(conf_helps, inname).argspec.args return filter( - lambda x: x in ['input1', 'input2','label', 'input', 'a', 'b', 'expand_as', - 'weights', 'vectors', 'weight', 'score', 'left', 'right'], + lambda x: x in ['input1', 'input2', 'label', 'input', 'a', 'b', + 'expand_as', + 'weights', 'vectors', 'weight', 'score', 'left', + 'right', 'output_mem'], all_args) @@ -267,3 +413,54 @@ operator_list = [ for op in operator_list: globals()[op[0]] = __convert_to_v2__( op[0], parent_names=op[1], is_default_name=False) + + +@wrap_name_default() +def recurrent_group(step, input, name=None): + if not isinstance(input, collections.Sequence): + input = [input] + + non_static_inputs = filter(lambda x: not isinstance(x, StaticInputV2), + input) + actual_input = [ + RecurrentLayerInput( + recurrent_name=name, + 
index=i, + parent_layers={'recurrent_inputs': non_static_inputs}) + for i in xrange(len(non_static_inputs)) + ] + + def __real_step__(*args): + rnn_input = list(args) + static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input) + for static_input in static_inputs: + mem_name = "__%s_memory__" % static_input.input.name + mem = memory( + name=mem_name, + is_seq=static_input.is_seq, + size=static_input.input.calculate_size, + boot_layer=static_input.input) + with mixed( + name=mem_name, + size=static_input.input.calculate_size, + act=activation.Identity()) as mix: + mix += identity_projection(input=mem) + rnn_input.insert(input.index(static_input), mix) + return step(*rnn_input) + + actual_output = __real_step__(*actual_input) + + if not isinstance(actual_output, collections.Sequence): + actual_output = [actual_output] + + retv = [ + RecurrentLayerOutput( + recurrent_name=name, + index=i, + parent_layers={'recurrent_outputs': actual_output}) + for i in xrange(len(actual_output)) + ] + if len(retv) == 1: + return retv[0] + else: + return retv diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt index 46b5d08b87..572deaff35 100644 --- a/python/paddle/v2/tests/CMakeLists.txt +++ b/python/paddle/v2/tests/CMakeLists.txt @@ -1,12 +1,16 @@ +add_test(NAME test_v2_api + COMMAND bash ${PROJ_ROOT}/python/paddle/v2/tests/run_tests.sh ${PYTHON_EXECUTABLE}) + add_test(NAME test_v2_layer COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_layer.py WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle) -add_test(NAME test_v2_api - COMMAND bash ${PROJ_ROOT}/python/paddle/v2/tests/run_tests.sh ${PYTHON_EXECUTABLE}) +add_test(NAME test_v2_rnn_layer + COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ + ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_rnn_layer.py) -add_test(NAME topology_test +add_test(NAME test_topology COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_topology.py WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle) diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py index ab2bc5df76..71eb3bf314 100644 --- a/python/paddle/v2/tests/test_data_feeder.py +++ b/python/paddle/v2/tests/test_data_feeder.py @@ -110,14 +110,14 @@ class DataFeederTest(unittest.TestCase): self.assertAlmostEqual(value.all(), w[i].all()) def test_integer(self): - dim = 100 + value_range = 100 batch_size = 32 index = [] for i in xrange(batch_size): each_sample = [] - each_sample.append(np.random.randint(dim)) + each_sample.append(np.random.randint(value_range)) index.append(each_sample) - feeder = DataFeeder([('input', data_type.integer_value(dim))], + feeder = DataFeeder([('input', data_type.integer_value(value_range))], {'input': 0}) arg = feeder(index) output = arg.getSlotIds(0).copyToNumpyArray() @@ -125,7 +125,7 @@ class DataFeederTest(unittest.TestCase): self.assertEqual(output.all(), index.flatten().all()) def test_integer_sequence(self): - dim = 10000 + value_range = 10000 batch_size = 32 start = [0] data = [] @@ -133,11 +133,12 @@ class DataFeederTest(unittest.TestCase): each_sample = [] each_sample.append( self.sparse_binary_reader( - dim, 30, non_empty=True)) + value_range, 30, non_empty=True)) data.append(each_sample) start.append(len(each_sample[0]) + start[-1]) - feeder = DataFeeder([('input', data_type.integer_value_sequence(dim))], 
- {'input': 0}) + feeder = DataFeeder( + [('input', data_type.integer_value_sequence(value_range))], + {'input': 0}) arg = feeder(data) output_data = arg.getSlotIds(0).copyToNumpyArray() output_start = arg.getSlotSequenceStartPositions(0).copyToNumpyArray() diff --git a/python/paddle/v2/tests/test_rnn_layer.py b/python/paddle/v2/tests/test_rnn_layer.py new file mode 100644 index 0000000000..5fbbd20eb7 --- /dev/null +++ b/python/paddle/v2/tests/test_rnn_layer.py @@ -0,0 +1,155 @@ +# Copyright PaddlePaddle contributors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import difflib +import unittest + +import paddle.trainer_config_helpers as conf_helps +import paddle.v2.activation as activation +import paddle.v2.data_type as data_type +import paddle.v2.layer as layer +from paddle.trainer_config_helpers.config_parser_utils import \ + parse_network_config as parse_network + + +class RNNTest(unittest.TestCase): + def test_simple_rnn(self): + dict_dim = 10 + word_dim = 8 + hidden_dim = 8 + + def parse_old_rnn(): + def step(y): + mem = conf_helps.memory(name="rnn_state", size=hidden_dim) + out = conf_helps.fc_layer( + input=[y, mem], + size=hidden_dim, + act=activation.Tanh(), + bias_attr=True, + name="rnn_state") + return out + + def test(): + data = conf_helps.data_layer(name="word", size=dict_dim) + embd = conf_helps.embedding_layer(input=data, size=word_dim) + conf_helps.recurrent_group(name="rnn", step=step, input=embd) + + return str(parse_network(test)) + + def parse_new_rnn(): + def new_step(y): + mem = layer.memory(name="rnn_state", size=hidden_dim) + out = layer.fc(input=[y, mem], + size=hidden_dim, + act=activation.Tanh(), + bias_attr=True, + name="rnn_state") + return out + + data = layer.data( + name="word", type=data_type.integer_value(dict_dim)) + embd = layer.embedding(input=data, size=word_dim) + rnn_layer = layer.recurrent_group( + name="rnn", step=new_step, input=embd) + return str(layer.parse_network(rnn_layer)) + + diff = difflib.unified_diff(parse_old_rnn().splitlines(1), + parse_new_rnn().splitlines(1)) + print ''.join(diff) + + def test_sequence_rnn_multi_input(self): + dict_dim = 10 + word_dim = 8 + hidden_dim = 8 + label_dim = 3 + + def parse_old_rnn(): + def test(): + data = conf_helps.data_layer(name="word", size=dict_dim) + label = conf_helps.data_layer(name="label", size=label_dim) + emb = conf_helps.embedding_layer(input=data, size=word_dim) + boot_layer = conf_helps.data_layer(name="boot", size=10) + boot_layer = conf_helps.fc_layer( + name='boot_fc', input=boot_layer, size=10) + + def step(y, wid): + z = conf_helps.embedding_layer(input=wid, size=word_dim) + mem = conf_helps.memory( + name="rnn_state", + size=hidden_dim, + boot_layer=boot_layer) + out = conf_helps.fc_layer( + input=[y, z, mem], + size=hidden_dim, + act=conf_helps.TanhActivation(), + bias_attr=True, + name="rnn_state") + return out + + out = conf_helps.recurrent_group( + name="rnn", step=step, input=[emb, data]) + + rep = conf_helps.last_seq(input=out) + prob = conf_helps.fc_layer( 
+ size=label_dim, + input=rep, + act=conf_helps.SoftmaxActivation(), + bias_attr=True) + + conf_helps.outputs( + conf_helps.classification_cost( + input=prob, label=label)) + + return str(parse_network(test)) + + def parse_new_rnn(): + data = layer.data( + name="word", type=data_type.dense_vector(dict_dim)) + label = layer.data( + name="label", type=data_type.dense_vector(label_dim)) + emb = layer.embedding(input=data, size=word_dim) + boot_layer = layer.data( + name="boot", type=data_type.dense_vector(10)) + boot_layer = layer.fc(name='boot_fc', input=boot_layer, size=10) + + def step(y, wid): + z = layer.embedding(input=wid, size=word_dim) + mem = layer.memory( + name="rnn_state", size=hidden_dim, boot_layer=boot_layer) + out = layer.fc(input=[y, z, mem], + size=hidden_dim, + act=activation.Tanh(), + bias_attr=True, + name="rnn_state") + return out + + out = layer.recurrent_group( + name="rnn", step=step, input=[emb, data]) + + rep = layer.last_seq(input=out) + prob = layer.fc(size=label_dim, + input=rep, + act=activation.Softmax(), + bias_attr=True) + + cost = layer.classification_cost(input=prob, label=label) + + return str(layer.parse_network(cost)) + + diff = difflib.unified_diff(parse_old_rnn().splitlines(1), + parse_new_rnn().splitlines(1)) + print ''.join(diff) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py index 4c21125431..f0679c5675 100644 --- a/python/paddle/v2/topology.py +++ b/python/paddle/v2/topology.py @@ -17,6 +17,7 @@ import collections from paddle.proto.ModelConfig_pb2 import ModelConfig import layer as v2_layer +from layer import WithExtraParent __all__ = ['Topology'] @@ -40,7 +41,10 @@ def __bfs_travel__(callback, *layers): __break__ = callback(each_layer) if __break__: return - __bfs_travel__(callback, *each_layer.__parent_layers__.values()) + __layers__ = each_layer.__parent_layers__.values() + if isinstance(each_layer, WithExtraParent): + __layers__ = __layers__ + each_layer.extra_parent() + __bfs_travel__(callback, *__layers__) class Topology(object): diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 709566ca44..e878ea6e3b 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -8,7 +8,7 @@ from . import event as v2_event from . import optimizer as v2_optimizer from . import parameters as v2_parameters -__all__ = ['ITrainer', 'SGD'] +__all__ = ['SGD'] def default_event_handler(event): @@ -22,26 +22,7 @@ def default_event_handler(event): pass -class ITrainer(object): - """ - The interface of Trainer. The only exposed method is `train`. - """ - - def train(self, reader, topology, parameters, event_handler=None): - """ - train method. - - :param reader: - :param topology: - :param parameters: - :param event_handler: - :return: - """ - - raise NotImplementedError() - - -class SGD(ITrainer): +class SGD(): def __init__(self, cost, parameters, update_equation): """ Simple SGD Trainer. 
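The trainer.py changes above drop the `ITrainer` interface, leaving `SGD` as the only trainer entry point; the hunks that follow rework its inner loop to use `getNonStaticParameters()` and `out_args.sumCosts()`. A minimal sketch of driving the refactored trainer, assuming only the constructor arguments shown in this diff (`cost`, `parameters`, `update_equation`); the `train_reader`, the choice of `Momentum` as the update rule, and the `BeginPass` check in the handler are illustrative assumptions, not part of this change:

```python
import paddle.v2 as paddle

# Assumed to be defined earlier: `cost` (a v2 cost layer) and `train_reader`.
parameters = paddle.parameters.create(cost)
optimizer = paddle.optimizer.Momentum(momentum=0.9)  # hypothetical update rule

trainer = paddle.trainer.SGD(cost=cost,
                             parameters=parameters,
                             update_equation=optimizer)


def event_handler(event):
    # Any callable taking an event works; BeginPass is one of the v2 events.
    if isinstance(event, paddle.event.BeginPass):
        print "starting a new pass"


trainer.train(reader=train_reader, event_handler=event_handler)
```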
@@ -108,9 +89,6 @@ class SGD(ITrainer): pass_evaluator.start() updater.startPass() for batch_id, data_batch in enumerate(reader()): - pass_type = updater.startBatch(len(data_batch)) - self.__gradient_machine__.forwardBackward( - feeder(data_batch), out_args, pass_type) batch_evaluator.start() event_handler( v2_event.BeginIteration( @@ -120,12 +98,11 @@ class SGD(ITrainer): feeder(data_batch), out_args, pass_type) self.__gradient_machine__.eval(pass_evaluator) self.__gradient_machine__.eval(batch_evaluator) - for each_param in self.__gradient_machine__.getParameters(): + for each_param in self.__gradient_machine__.getNonStaticParameters( + ): updater.update(each_param) - # Get cost. We use numpy to calculate total cost for this batch. - cost_vec = out_args.getSlotValue(0) - cost_vec = cost_vec.copyToNumpyMat() - cost = cost_vec.sum() / len(data_batch) + cost_sum = out_args.sumCosts() + cost = cost_sum / len(data_batch) updater.finishBatch(cost) batch_evaluator.finish() event_handler( @@ -154,13 +131,18 @@ class SGD(ITrainer): evaluator = self.__gradient_machine__.makeEvaluator() out_args = api.Arguments.createArguments(0) evaluator.start() + total_cost = 0 + num_samples = 0.0 for data_batch in reader(): + num_samples += len(data_batch) self.__gradient_machine__.forward( feeder(data_batch), out_args, api.PASS_TEST) + total_cost += out_args.sumCosts() self.__gradient_machine__.eval(evaluator) evaluator.finish() - return v2_event.TestResult(evaluator=evaluator) + return v2_event.TestResult( + evaluator=evaluator, cost=total_cost / num_samples) def __check_train_args__(reader, event_handler, **kwargs):
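For reference, a condensed, self-contained sketch of the new-style RNN configuration that the added `test_rnn_layer.py` exercises; it assumes only the `paddle.v2` package layout introduced in this change (the data, embedding, fc and memory layers plus `recurrent_group` and `parse_network`):

```python
import paddle.v2 as paddle

dict_dim, word_dim, hidden_dim = 10, 8, 8


def step(y):
    # `memory` reads this layer's output from the previous time step.
    mem = paddle.layer.memory(name="rnn_state", size=hidden_dim)
    return paddle.layer.fc(input=[y, mem],
                           size=hidden_dim,
                           act=paddle.activation.Tanh(),
                           bias_attr=True,
                           name="rnn_state")


word = paddle.layer.data(name="word",
                         type=paddle.data_type.integer_value(dict_dim))
embedding = paddle.layer.embedding(input=word, size=word_dim)
rnn = paddle.layer.recurrent_group(name="rnn", step=step, input=embedding)

# Protobuf generation stays hidden until the topology is actually needed.
print paddle.layer.parse_network(rnn)
```

Compared with the v1 path in the same test (`conf_helps.recurrent_group` wrapped in a config function passed to `parse_network_config`), the step function here is ordinary Python, and the protobuf description is produced only when `parse_network` (or `paddle.parameters.create`) is invoked.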