增加垃圾词语分类

pull/1/head
lidapeng 5 years ago
parent 395718cd11
commit de0ed10dc7

@ -6,8 +6,8 @@
<sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" />
<module name="myBrain" />
<module name="ImageMarket" />
<module name="myBrain" />
</profile>
</annotationProcessing>
</component>

@ -6,7 +6,7 @@
<groupId>com.github</groupId>
<artifactId>ImageMarket</artifactId>
<version>1.0.0</version>
<version>1.0.2</version>
<name>myBrain</name>
<!-- FIXME change it to the project's website -->

@ -2,6 +2,7 @@ package org.wlld.naturalLanguage;
import org.wlld.randomForest.RandomForest;
import org.wlld.tools.ArithUtil;
import java.util.ArrayList;
import java.util.List;
@ -16,7 +17,8 @@ public class Talk {
private RandomForest randomForest = WordTemple.get().getRandomForest();//获取随机森林模型
private List<List<String>> wordTimes = WordTemple.get().getWordTimes();
public void talk(String sentence) throws Exception {
public List<Integer> talk(String sentence) throws Exception {
List<Integer> typeList = new ArrayList<>();
String rgm = null;
if (sentence.indexOf(",") > -1) {
rgm = ",";
@ -41,29 +43,38 @@ public class Talk {
if (randomForest != null) {
for (Sentence sentence1 : sentences) {
List<Integer> features = sentence1.getFeatures();
List<String> keyWords = sentence1.getKeyWords();
List<String> keyWords = sentence1.getKeyWords();//拆分的关键词
int wrong = 0;
int wordNumber = keyWords.size();
for (int i = 0; i < 8; i++) {
int nub = 0;
if (keyWords.size() > i) {
List<String> words = wordTimes.get(i);
nub = getNub(words, keyWords.get(i));
if (nub == 0) {//出现了不认识的词
wrong++;
}
}
features.add(nub);
}
LangBody langBody = new LangBody();
langBody.setA1(features.get(0));
langBody.setA2(features.get(1));
langBody.setA3(features.get(2));
langBody.setA4(features.get(3));
langBody.setA5(features.get(4));
langBody.setA6(features.get(5));
langBody.setA7(features.get(6));
langBody.setA8(features.get(7));
int type = randomForest.forest(langBody);
System.out.println("type==" + type);
int type = 0;
if (ArithUtil.div(wrong, wordNumber) < WordTemple.get().getGarbageTh()) {
LangBody langBody = new LangBody();
langBody.setA1(features.get(0));
langBody.setA2(features.get(1));
langBody.setA3(features.get(2));
langBody.setA4(features.get(3));
langBody.setA5(features.get(4));
langBody.setA6(features.get(5));
langBody.setA7(features.get(6));
langBody.setA8(features.get(7));
type = randomForest.forest(langBody);
}
typeList.add(type);
}
return typeList;
} else {
System.out.println("随机森林没有训练");
throw new Exception("forest is not study");
}
}

@ -69,7 +69,7 @@ public class Tokenizer extends Frequency {
DataTable dataTable = new DataTable(column);
dataTable.setKey("key");
//初始化随机森林
RandomForest randomForest = new RandomForest(5);
RandomForest randomForest = new RandomForest(7);
WordTemple.get().setRandomForest(randomForest);//保存随机森林到模版
randomForest.init(dataTable);
for (Sentence sentence : sentences) {

@ -2,19 +2,9 @@ package org.wlld.naturalLanguage;
public class WordConst {
public static double Word_Noise = 0.7;// degree of shrinkage
public static final int SHOP = 1;// purchase type
public static final int FOOD = 3;// food type
public static final int DRINK = 4;// drink type
public static final int OTHER = 5;// household daily goods (oil, salt, soy sauce, vinegar, toilet paper and the like)
public static final int SMOKE = 10;// tobacco
public static final int ADD = 6;// order: add (the original comment also carried the number 0.5037412492, which looks like a stray paste)
public static final int DEL = 7;// order: delete
public static final int UPDATE = 8;// order: update
public static final int SELECT = 9;// order: query
public static final int TALK = 2;// chat type
public static final int ALL = 11;// full text
public static final int CHANGE = 12;// classified text
public static final int DROP = 13;// eliminated text
public static final int CURD = 14;// create/update/read/delete operations on an order
public static final int ANS = 0;// chat reply
public static final int Water = 2;// water delivery
public static final int Nanny = 3;// nanny
public static final int Unlock = 4;// lock opening
public static final int Express = 5;// express delivery
public static final int Telephone = 6;// phone credit top-up
}

@ -16,6 +16,15 @@ public class WordTemple {
private List<WorldBody> allWorld = new ArrayList<>();//所有词集合
private List<List<String>> wordTimes = new ArrayList<>();//词编号
private RandomForest randomForest;//保存的随机森林模型
private double garbageTh = 0.5;// garbage-classification threshold; the default set here is 0.5
/** Returns the current garbage-classification threshold. */
public double getGarbageTh() {
return garbageTh;
}
/** Replaces the garbage-classification threshold with the given value (no validation is performed here). */
public void setGarbageTh(double garbageTh) {
this.garbageTh = garbageTh;
}
public RandomForest getRandomForest() {
return randomForest;

@ -48,6 +48,7 @@ public class RandomForest {
for (int i = 0; i < forest.length; i++) {
Tree tree = forest[i];
int type = tree.judge(object);
//System.out.println(type);
if (map.containsKey(type)) {
map.put(type, map.get(type) + 1);
} else {
@ -81,7 +82,7 @@ public class RandomForest {
public void study() throws Exception {//学习
for (int i = 0; i < forest.length; i++) {
System.out.println("开始学习==" + i + ",treeNub==" + forest.length);
//System.out.println("开始学习==" + i + ",treeNub==" + forest.length);
Tree tree = forest[i];
tree.study();
}

@ -4,6 +4,8 @@ import org.wlld.naturalLanguage.IOConst;
import org.wlld.naturalLanguage.Talk;
import org.wlld.naturalLanguage.TemplateReader;
import java.util.List;
/**
* @author lidapeng
* @description
@ -16,11 +18,9 @@ public class LangTest {
public static void test() throws Exception {
TemplateReader templateReader = new TemplateReader();
templateReader.read("/Users/lidapeng/Desktop/myDocment/a2.txt", "UTF-8", IOConst.NOT_WIN);
templateReader.read("/Users/lidapeng/Desktop/myDocment/a1.txt", "UTF-8", IOConst.NOT_WIN);
Talk talk = new Talk();
talk.talk("我要吃面包");
talk.talk("我渴了");
talk.talk("我要去看望你");
talk.talk("我买两盒烟");
List<Integer> list = talk.talk("我草尼玛");
System.out.println(list);
}
}

Loading…
Cancel
Save