speedup preprocess in quick start

ISSUE=4575209 git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1430 1ad973e4-5ce8-4261-8a94-b56d1f490c56
9 years ago · 5db9e5900f
parent 2afe66093a
commit 5db9e5900f
2 changed files with 181 additions and 104 deletions
--- a/demo/quick_start/preprocess.py
+++ b/demo/quick_start/preprocess.py
--- a/demo/quick_start/preprocess.sh
+++ b/demo/quick_start/preprocess.sh
@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 # Copyright (c) 2016 Baidu, Inc. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@ -12,10 +12,41 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -e

+# 1. size of pos : neg = 1:1.
+# 2. size of testing set = min(25k, len(all_data) * 0.1); the rest is the training set.
+# 3. distinct train set and test set.
+# 4. build dict
+
+
+mkdir data/tmp
 python preprocess.py -i data/reviews_Electronics_5.json.gz
+# uniq and shuffle
+cd data/tmp
+cat pos_*|sort|uniq|shuf> pos.shuffed
+cat neg_*|sort|uniq|shuf> neg.shuffed
+
+# Size both classes by the SMALLER file. Using only neg.shuffed would make
+# the head/tail slices of pos.shuffed overlap (test rows leaking into the
+# training set) whenever pos has fewer lines than neg.
+# $(( $(wc -l) )) normalises any padding wc emits and yields 0 for an
+# empty file.
+pos_len=$(( $(wc -l < pos.shuffed) ))
+neg_len=$(( $(wc -l < neg.shuffed) ))
+min_len=$neg_len
+if [ "$pos_len" -lt "$neg_len" ]; then
+  min_len=$pos_len
+fi
+# POSIX $(( )) rather than bash-only (( )): the script runs under #!/bin/sh.
+# Per-class test size is 10% of the class, capped at 12500 lines.
+test_num=$((min_len / 10))
+if [ "$test_num" -gt 12500 ]; then
+  test_num=12500
+fi
+train_num=$((min_len - test_num))
+
+head -n$train_num pos.shuffed >train.pos
+head -n$train_num neg.shuffed >train.neg
+tail -n$test_num pos.shuffed >test.pos
+tail -n$test_num neg.shuffed >test.neg
+
+cat train.pos train.neg|shuf>../train.txt
+cat test.pos test.neg|shuf>../test.txt
+
+cd -
+echo 'data/train.txt' > data/train.list
+echo 'data/test.txt' > data/test.list

 # use 30k dict
+rm -rf data/tmp
 mv data/dict.txt data/dict_all.txt
 cat data/dict_all.txt | head -n 30001 > data/dict.txt
+echo 'preprocess finished'