增加垃圾词语分类

pull/1/head
lidapeng 5 years ago
parent 395718cd11
commit de0ed10dc7

@ -6,8 +6,8 @@
<sourceOutputDir name="target/generated-sources/annotations" /> <sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" /> <sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" /> <outputRelativeToContentRoot value="true" />
<module name="myBrain" />
<module name="ImageMarket" /> <module name="ImageMarket" />
<module name="myBrain" />
</profile> </profile>
</annotationProcessing> </annotationProcessing>
</component> </component>

@ -6,7 +6,7 @@
<groupId>com.github</groupId> <groupId>com.github</groupId>
<artifactId>ImageMarket</artifactId> <artifactId>ImageMarket</artifactId>
<version>1.0.0</version> <version>1.0.2</version>
<name>myBrain</name> <name>myBrain</name>
<!-- FIXME change it to the project's website --> <!-- FIXME change it to the project's website -->

@ -2,6 +2,7 @@ package org.wlld.naturalLanguage;
import org.wlld.randomForest.RandomForest; import org.wlld.randomForest.RandomForest;
import org.wlld.tools.ArithUtil;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -16,7 +17,8 @@ public class Talk {
private RandomForest randomForest = WordTemple.get().getRandomForest();//获取随机森林模型 private RandomForest randomForest = WordTemple.get().getRandomForest();//获取随机森林模型
private List<List<String>> wordTimes = WordTemple.get().getWordTimes(); private List<List<String>> wordTimes = WordTemple.get().getWordTimes();
public void talk(String sentence) throws Exception { public List<Integer> talk(String sentence) throws Exception {
List<Integer> typeList = new ArrayList<>();
String rgm = null; String rgm = null;
if (sentence.indexOf(",") > -1) { if (sentence.indexOf(",") > -1) {
rgm = ","; rgm = ",";
@ -41,15 +43,22 @@ public class Talk {
if (randomForest != null) { if (randomForest != null) {
for (Sentence sentence1 : sentences) { for (Sentence sentence1 : sentences) {
List<Integer> features = sentence1.getFeatures(); List<Integer> features = sentence1.getFeatures();
List<String> keyWords = sentence1.getKeyWords(); List<String> keyWords = sentence1.getKeyWords();//拆分的关键词
int wrong = 0;
int wordNumber = keyWords.size();
for (int i = 0; i < 8; i++) { for (int i = 0; i < 8; i++) {
int nub = 0; int nub = 0;
if (keyWords.size() > i) { if (keyWords.size() > i) {
List<String> words = wordTimes.get(i); List<String> words = wordTimes.get(i);
nub = getNub(words, keyWords.get(i)); nub = getNub(words, keyWords.get(i));
if (nub == 0) {//出现了不认识的词
wrong++;
}
} }
features.add(nub); features.add(nub);
} }
int type = 0;
if (ArithUtil.div(wrong, wordNumber) < WordTemple.get().getGarbageTh()) {
LangBody langBody = new LangBody(); LangBody langBody = new LangBody();
langBody.setA1(features.get(0)); langBody.setA1(features.get(0));
langBody.setA2(features.get(1)); langBody.setA2(features.get(1));
@ -59,11 +68,13 @@ public class Talk {
langBody.setA6(features.get(5)); langBody.setA6(features.get(5));
langBody.setA7(features.get(6)); langBody.setA7(features.get(6));
langBody.setA8(features.get(7)); langBody.setA8(features.get(7));
int type = randomForest.forest(langBody); type = randomForest.forest(langBody);
System.out.println("type==" + type); }
typeList.add(type);
} }
return typeList;
} else { } else {
System.out.println("随机森林没有训练"); throw new Exception("forest is not study");
} }
} }

@ -69,7 +69,7 @@ public class Tokenizer extends Frequency {
DataTable dataTable = new DataTable(column); DataTable dataTable = new DataTable(column);
dataTable.setKey("key"); dataTable.setKey("key");
//初始化随机森林 //初始化随机森林
RandomForest randomForest = new RandomForest(5); RandomForest randomForest = new RandomForest(7);
WordTemple.get().setRandomForest(randomForest);//保存随机森林到模版 WordTemple.get().setRandomForest(randomForest);//保存随机森林到模版
randomForest.init(dataTable); randomForest.init(dataTable);
for (Sentence sentence : sentences) { for (Sentence sentence : sentences) {

@ -2,19 +2,9 @@ package org.wlld.naturalLanguage;
public class WordConst { public class WordConst {
public static double Word_Noise = 0.7;//收缩程度 public static double Word_Noise = 0.7;//收缩程度
public static final int SHOP = 1;//购买类型 public static final int Water = 2;//送水
public static final int FOOD = 3;//食物类型 public static final int Nanny = 3;//保姆
public static final int DRINK = 4;//饮品类型 public static final int Unlock = 4;//开锁
public static final int OTHER = 5;//家庭日用(油盐酱醋卫生纸之类的) public static final int Express = 5;//快递
public static final int SMOKE = 10;//烟草 public static final int Telephone = 6;//充话费
public static final int ADD = 6;//订单增
public static final int DEL = 7;//订单删
public static final int UPDATE = 8;//订单改
public static final int SELECT = 9;//订单查
public static final int TALK = 2;//聊天类型
public static final int ALL = 11;//全文本
public static final int CHANGE = 12;//分类文本
public static final int DROP = 13;//消文本
public static final int CURD = 14;//对订单增删改查类型
public static final int ANS = 0;//聊天回复
} }

@ -16,6 +16,15 @@ public class WordTemple {
private List<WorldBody> allWorld = new ArrayList<>();//所有词集合 private List<WorldBody> allWorld = new ArrayList<>();//所有词集合
private List<List<String>> wordTimes = new ArrayList<>();//词编号 private List<List<String>> wordTimes = new ArrayList<>();//词编号
private RandomForest randomForest;//保存的随机森林模型 private RandomForest randomForest;//保存的随机森林模型
private double garbageTh = 0.5;//垃圾分类的阈值默认0.5
public double getGarbageTh() {
return garbageTh;
}
public void setGarbageTh(double garbageTh) {
this.garbageTh = garbageTh;
}
public RandomForest getRandomForest() { public RandomForest getRandomForest() {
return randomForest; return randomForest;

@ -48,6 +48,7 @@ public class RandomForest {
for (int i = 0; i < forest.length; i++) { for (int i = 0; i < forest.length; i++) {
Tree tree = forest[i]; Tree tree = forest[i];
int type = tree.judge(object); int type = tree.judge(object);
//System.out.println(type);
if (map.containsKey(type)) { if (map.containsKey(type)) {
map.put(type, map.get(type) + 1); map.put(type, map.get(type) + 1);
} else { } else {
@ -81,7 +82,7 @@ public class RandomForest {
public void study() throws Exception {//学习 public void study() throws Exception {//学习
for (int i = 0; i < forest.length; i++) { for (int i = 0; i < forest.length; i++) {
System.out.println("开始学习==" + i + ",treeNub==" + forest.length); //System.out.println("开始学习==" + i + ",treeNub==" + forest.length);
Tree tree = forest[i]; Tree tree = forest[i];
tree.study(); tree.study();
} }

@ -4,6 +4,8 @@ import org.wlld.naturalLanguage.IOConst;
import org.wlld.naturalLanguage.Talk; import org.wlld.naturalLanguage.Talk;
import org.wlld.naturalLanguage.TemplateReader; import org.wlld.naturalLanguage.TemplateReader;
import java.util.List;
/** /**
* @author lidapeng * @author lidapeng
* @description * @description
@ -16,11 +18,9 @@ public class LangTest {
public static void test() throws Exception { public static void test() throws Exception {
TemplateReader templateReader = new TemplateReader(); TemplateReader templateReader = new TemplateReader();
templateReader.read("/Users/lidapeng/Desktop/myDocment/a2.txt", "UTF-8", IOConst.NOT_WIN); templateReader.read("/Users/lidapeng/Desktop/myDocment/a1.txt", "UTF-8", IOConst.NOT_WIN);
Talk talk = new Talk(); Talk talk = new Talk();
talk.talk("我要吃面包"); List<Integer> list = talk.talk("我草尼玛");
talk.talk("我渴了"); System.out.println(list);
talk.talk("我要去看望你");
talk.talk("我买两盒烟");
} }
} }

Loading…
Cancel
Save