Merge branch 'develop' of github.com:baidu/Paddle into feature/refine_doc_drnn

8 years ago · ff5ca6927f
parent 634576128c f8ec510a63
commit ff5ca6927f
183 changed files with 2058 additions and 796 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -42,7 +42,7 @@ addons:
 before_install:
  - |
    if [ ${JOB} == "BUILD_AND_TEST" ]; then
-      if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)'
+      if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
      then
        echo "Only markdown docs were updated, stopping build process."
        exit
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -36,6 +36,7 @@ option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
 option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND})
 option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND})
 option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
+option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
 option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
 option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
 option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
@ -115,7 +116,6 @@ else()
    endif(WITH_AVX)

    if(WITH_DSO)
-        set(CUDA_LIBRARIES "")
        add_definitions(-DPADDLE_USE_DSO)
    endif(WITH_DSO)

@ -135,6 +135,10 @@ if(NOT WITH_TIMER)
    add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)

+if(NOT WITH_PROFILER)
+    add_definitions(-DPADDLE_DISABLE_PROFILER)
+endif(NOT WITH_PROFILER)
+
 if(WITH_AVX)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
--- a/demo/image_classification/train.sh
+++ b/demo/image_classification/train.sh
@ -24,7 +24,7 @@ paddle train \
 --test_all_data_in_one_period=1 \
 --use_gpu=1 \
 --trainer_count=1 \
--num_passes=200 \
+--num_passes=300 \
 --save_dir=$output \
 2>&1 | tee $log

--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
@ -18,7 +18,5 @@ set -x
 # download the dictionary and pretrained model 
 for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
 do 
-  # following is the google drive address
-  # you can also directly download from https://pan.baidu.com/s/1o8q577s
-  wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/$file --no-check-certificate
+  wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
 done
--- a/demo/model_zoo/resnet/get_model.sh
+++ b/demo/model_zoo/resnet/get_model.sh
@ -24,9 +24,7 @@ echo "Downloading ResNet models..."

 for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz 
 do 
-  # following is the google drive address
-  # you can also directly download from https://pan.baidu.com/s/1o8q577s
-  wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/imagenet/$file --no-check-certificate
+  wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file
  tar -xvf $file 
  rm $file
 done
--- a/demo/quick_start/data/README.md
+++ b/demo/quick_start/data/README.md
@ -0,0 +1,9 @@
+This dataset consists of electronics product reviews associated with
+binary labels (positive/negative) for sentiment classification.
+
+The preprocessed data can be downloaded by script `get_data.sh`.
+The data was derived from reviews_Electronics_5.json.gz at
+
+http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+
+If you want to process the raw data, you can use the script `proc_from_raw_data/get_data.sh`.
--- a/demo/quick_start/data/get_data.sh
+++ b/demo/quick_start/data/get_data.sh
@ -17,14 +17,11 @@ set -e
 DIR="$( cd "$(dirname "$0")" ; pwd -P )"
 cd $DIR

-echo "Downloading Amazon Electronics reviews data..."
-# http://jmcauley.ucsd.edu/data/amazon/
-wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+# Download the preprocessed data
+wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz

-echo "Downloading mosesdecoder..."
-#https://github.com/moses-smt/mosesdecoder
-wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
+# Extract package
+tar zxvf preprocessed_data.tar.gz

-unzip master.zip
-rm master.zip
-echo "Done."
+# Remove compressed package
+rm preprocessed_data.tar.gz
--- a/demo/quick_start/data/pred.list
+++ b/demo/quick_start/data/pred.list
@ -1 +0,0 @@
-./data/pred.txt
--- a/demo/quick_start/data/pred.txt
+++ b/demo/quick_start/data/pred.txt
@ -1,2 +0,0 @@
-the device is cute , but that &apos;s just about all that &apos;s good. the specs are what you &apos;d expect : it &apos;s a wifi mic , with some noise filter options. the app has the option to upload your baby &apos;s name and photo , which is a cutesy touch. but the app is otherwise unstable and useless unless you upgrade for $ 60 / year.set up involves downloading the app , turning on the mic , switching your phone to the wifi network of the mic , telling the app your wifi settings , switching your wifi back to your home router. the app is then directly connected to your mic.the app is adware ! the main screen says &quot; cry notifications on / off : upgrade to evoz premium and receive a text message of email when your baby is crying &quot; .but the adware points out an important limitation , this monitor is only intended to be used from your home network. if you want to access it remotely , get a webcam. this app would make a lot more sense of the premium features were included with the hardware .
-don &apos;t be fooled by my one star rating. if there was a zero , i would have selected it. this product was a waste of my money.it has never worked like the company said it supposed to. i only have one device , an iphone 4gs. after charging the the iphone mid way , the i.sound portable power max 16,000 mah is completely drained. the led light no longer lit up. when plugging the isound portable power max into a wall outlet to charge , it would charge for about 20-30 minutes and then all four battery led indicator lit up showing a full charge. i would leave it on to charge for the full 8 hours or more but each time with the same result upon using. don &apos;t buy this thing. put your money to good use elsewhere .
--- a/demo/quick_start/data/proc_from_raw_data/get_data.sh
+++ b/demo/quick_start/data/proc_from_raw_data/get_data.sh
@ -16,10 +16,26 @@
 # 1. size of pos : neg = 1:1.
 # 2. size of testing set = min(25k, len(all_data) * 0.1), the rest is the training set.
 # 3. distinct train set and test set.
-# 4. build dict

 set -e

+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd $DIR
+
+# Download data
+echo "Downloading Amazon Electronics reviews data..."
+# http://jmcauley.ucsd.edu/data/amazon/
+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+echo "Downloading mosesdecoder..."
+# https://github.com/moses-smt/mosesdecoder
+wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
+
+unzip master.zip
+rm master.zip
+
+##################
+# Preprocess data 
+echo "Preprocess data..."
 export LC_ALL=C
 UNAME_STR=`uname`

@ -29,11 +45,11 @@ else
  SHUF_PROG='gshuf'
 fi

-mkdir -p data/tmp
-python preprocess.py -i data/reviews_Electronics_5.json.gz
+mkdir -p tmp
+python preprocess.py -i reviews_Electronics_5.json.gz
 # uniq and shuffle
-cd data/tmp
-echo 'uniq and shuffle...'
+cd tmp
+echo 'Uniq and shuffle...'
 cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
 cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed

@ -53,11 +69,11 @@ cat train.pos train.neg | ${SHUF_PROG} >../train.txt
 cat test.pos test.neg | ${SHUF_PROG} >../test.txt

 cd -
-echo 'data/train.txt' > data/train.list
-echo 'data/test.txt' > data/test.list
+echo 'train.txt' > train.list
+echo 'test.txt' > test.list

 # use 30k dict
-rm -rf data/tmp
-mv data/dict.txt data/dict_all.txt
-cat data/dict_all.txt | head -n 30001 > data/dict.txt
-echo 'preprocess finished'
+rm -rf tmp
+mv dict.txt dict_all.txt
+cat dict_all.txt | head -n 30001 > dict.txt
+echo 'Done.'
--- a/demo/quick_start/data/proc_from_raw_data/preprocess.py
+++ b/demo/quick_start/data/proc_from_raw_data/preprocess.py
@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-1. (remove HTML before or not)tokensizing
+1. Tokenize the words and punctuation 
 2. pos sample : rating score 5; neg sample: rating score 1-2.

 Usage:
@ -76,7 +76,11 @@ def tokenize(sentences):
    sentences : a list of input sentences.
    return: a list of processed text.
    """
-    dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
+    dir = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
+    if not os.path.exists(dir):
+        sys.exit(
+            "The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exists."
+        )
    tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
    assert isinstance(sentences, list)
    text = "\n".join(sentences)
@ -104,7 +108,7 @@ def tokenize_batch(id):
        num_batch, instance, pre_fix = parse_queue.get()
        if num_batch == -1:  ### parse_queue finished
            tokenize_queue.put((-1, None, None))
-            sys.stderr.write("tokenize theread %s finish\n" % (id))
+            sys.stderr.write("Thread %s finish\n" % (id))
            break
        tokenize_instance = tokenize(instance)
        tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
--- a/demo/semantic_role_labeling/data/get_data.sh
+++ b/demo/semantic_role_labeling/data/get_data.sh
@ -14,10 +14,10 @@
 # limitations under the License.
 set -e
 wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/verbDict.txt --no-check-certificate
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/targetDict.txt --no-check-certificate
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/wordDict.txt --no-check-certificate
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/emb --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt 
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt 
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb
 tar -xzvf conll05st-tests.tar.gz
 rm conll05st-tests.tar.gz
 cp ./conll05st-release/test.wsj/words/test.wsj.words.gz  .
--- a/demo/semantic_role_labeling/dataprovider.py
+++ b/demo/semantic_role_labeling/dataprovider.py
@ -25,12 +25,13 @@ def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
    #all inputs are integral and sequential type
    settings.slots = [
        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(predicate_dict)),
        integer_value_sequence(len(word_dict)),
        integer_value_sequence(len(word_dict)),
        integer_value_sequence(len(word_dict)),
        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)), integer_value_sequence(2),
+        integer_value_sequence(len(word_dict)), 
+        integer_value_sequence(len(predicate_dict)),
+        integer_value_sequence(2),
        integer_value_sequence(len(label_dict))
    ]

@ -63,5 +64,5 @@ def process(settings, file_name):

            label_list = label.split()
            label_slot = [settings.label_dict.get(w) for w in label_list]
-            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
-                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
+            yield word_slot, ctx_n2_slot, ctx_n1_slot, \
+                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot, label_slot
--- a/demo/semantic_role_labeling/predict.py
+++ b/demo/semantic_role_labeling/predict.py
@ -55,18 +55,14 @@ class Prediction():

        slots = [
            integer_value_sequence(len_dict),
-            integer_value_sequence(len_pred),
            integer_value_sequence(len_dict),
            integer_value_sequence(len_dict),
            integer_value_sequence(len_dict),
            integer_value_sequence(len_dict),
            integer_value_sequence(len_dict), 
+            integer_value_sequence(len_pred),
            integer_value_sequence(2)
            ]
-            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), integer_value_sequence(2)
-        ]
        self.converter = DataProviderConverter(slots)

    def load_dict_label(self, dict_file, label_file, predicate_dict_file):
@ -104,8 +100,8 @@ class Prediction():
                marks = mark.split()
                mark_slot = [int(w) for w in marks]
                
-                yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
-                      ctx_0_slot, ctx_p1_slot, ctx_p2_slot,  mark_slot
+                yield word_slot, ctx_n2_slot, ctx_n1_slot, \
+                      ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot

    def predict(self, data_file, output_file):
        """
--- a/demo/semantic_role_labeling/predict.sh
+++ b/demo/semantic_role_labeling/predict.sh
@ -18,7 +18,7 @@ set -e
 function get_best_pass() {
  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
  sed  -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
-  sort | head -n 1
+  sort -n | head -n 1
 }   

 log=train.log
--- a/demo/semantic_role_labeling/test.sh
+++ b/demo/semantic_role_labeling/test.sh
@ -18,7 +18,7 @@ set -e
 function get_best_pass() {
  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
  sed  -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
-  sort | head -n 1
+  sort -n | head -n 1
 }

 log=train.log
--- a/demo/sentiment/test.sh
+++ b/demo/sentiment/test.sh
@ -17,7 +17,7 @@ set -e
 function get_best_pass() {
  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
  sed  -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
-  sort | head -n 1
+  sort -n | head -n 1
 }

 log=train.log
--- a/demo/seqToseq/data/paraphrase_data.sh
+++ b/demo/seqToseq/data/paraphrase_data.sh
@ -16,9 +16,7 @@ set -e
 set -x

 # download the in-house paraphrase dataset
-# following is the google drive address
-# you can also directly download from https://pan.baidu.com/s/1o8q577s
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/paraphrase.tar.gz --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz

 # untar the dataset
 tar -zxvf paraphrase.tar.gz
--- a/demo/seqToseq/data/wmt14_model.sh
+++ b/demo/seqToseq/data/wmt14_model.sh
@ -16,9 +16,7 @@ set -e
 set -x

 # download the pretrained model
-# following is the google drive address
-# you can also directly download from https://pan.baidu.com/s/1o8q577s
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/wmt14_model.tar.gz --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz

 # untar the model
 tar -zxvf wmt14_model.tar.gz
--- a/doc/about/index.rst
+++ b/doc/about/index.rst
@ -0,0 +1,14 @@
+ABOUT
+=======
+
+PaddlePaddle is an easy-to-use, efficient, flexible and scalable deep learning platform,
+which was originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu.
+
+PaddlePaddle is now open source but far from complete; it is intended to be built upon, improved, scaled, and extended.
+We hope to build an active open source community whose members both provide feedback and actively contribute to the source code.
+
+
+Credits
+--------
+
+We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/blob/develop/authors>`_ of PaddlePaddle!
--- a/doc/algorithm/index.rst
+++ b/doc/algorithm/index.rst
@ -1,7 +0,0 @@
-Algorithm Tutorial
-==================
-
-..  toctree::
-  :maxdepth: 1
-
-  rnn/rnn.rst
--- a/doc/algorithm/rnn/bi_lstm.jpg
+++ b/doc/algorithm/rnn/bi_lstm.jpg
@ -1 +0,0 @@
-../../demo/sentiment_analysis/bi_lstm.jpg
--- a/doc/algorithm/rnn/encoder-decoder-attention-model.png
+++ b/doc/algorithm/rnn/encoder-decoder-attention-model.png
@ -1 +0,0 @@
-../../demo/text_generation/encoder-decoder-attention-model.png
--- a/doc/api/data_provider/index.rst
+++ b/doc/api/data_provider/index.rst
@ -1,5 +1,5 @@
-DataProvider Introduction
-=========================
+Introduction
+==============
 DataProvider is a module that loads training or testing data into cpu or gpu
 memory for the following training or testing process.

--- a/doc/api/data_provider/pydataprovider2.rst
+++ b/doc/api/data_provider/pydataprovider2.rst
@ -1,5 +1,5 @@
-How to use PyDataProvider2
-==========================
+PyDataProvider2
+=================

 We highly recommend users to use PyDataProvider2 to provide training or testing
 data to PaddlePaddle. The user only needs to focus on how to read a single
--- a/Show More
+++ b/Show More
				`@ -1 +0,0 @@`
				`../../demo/text_generation/encoder-decoder-attention-model.png`