diff --git a/.idea/compiler.xml b/.idea/compiler.xml
index 6aa88ff..d280c68 100644
--- a/.idea/compiler.xml
+++ b/.idea/compiler.xml
@@ -6,8 +6,8 @@
-
+
diff --git a/src/main/java/org/wlld/naturalLanguage/IOConst.java b/src/main/java/org/wlld/naturalLanguage/IOConst.java
new file mode 100644
index 0000000..dad856d
--- /dev/null
+++ b/src/main/java/org/wlld/naturalLanguage/IOConst.java
@@ -0,0 +1,10 @@
+package org.wlld.naturalLanguage;
+
+public class IOConst {// byte values and platform flags; TemplateReader compares input bytes against them
+ public static final byte TYPE_Symbol = 0x23;// ASCII '#', the separator TemplateReader looks for before the category digits
+ public static final byte STOP_END = 10;// ASCII line feed (LF)
+ public static final byte STOP_NEXT = 13;// ASCII carriage return (CR)
+ public static final byte WIN = 1;// Windows system
+ public static final byte NOT_WIN = 2;// non-Windows system
+ public static final byte CORE_Number = 6;// number of cores
+}
diff --git a/src/main/java/org/wlld/naturalLanguage/KeyWord.java b/src/main/java/org/wlld/naturalLanguage/KeyWord.java
new file mode 100644
index 0000000..038ea3e
--- /dev/null
+++ b/src/main/java/org/wlld/naturalLanguage/KeyWord.java
@@ -0,0 +1,22 @@
+package org.wlld.naturalLanguage;
+
+public class KeyWord {// mutable pair of a Word and a completion flag
+ private Word word;// the keyword
+ private boolean isOk;// whether this keyword has been completed
+
+ public Word getWord() {
+ return word;
+ }
+
+ public void setWord(Word word) {
+ this.word = word;
+ }
+
+ public boolean isOk() {
+ return isOk;
+ }
+
+ public void setOk(boolean ok) {
+ isOk = ok;
+ }
+}
diff --git a/src/main/java/org/wlld/naturalLanguage/Sentence.java b/src/main/java/org/wlld/naturalLanguage/Sentence.java
new file mode 100644
index 0000000..4035cf8
--- /dev/null
+++ b/src/main/java/org/wlld/naturalLanguage/Sentence.java
@@ -0,0 +1,95 @@
+package org.wlld.naturalLanguage;
+
+import org.omg.Messaging.SYNC_WITH_TRANSPORT;// NOTE(review): unused in this class; the org.omg packages were removed from the JDK in Java 11
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A sentence stored as a singly linked chain of {@link Word} nodes.
+ * <p>
+ * Each call to {@link #setWord(String)} appends one node to the end of the chain and
+ * records it in the list returned by {@link #getWaitWords()}.
+ *
+ * @author lidapeng
+ * @date 8:01 AM 2020/2/23
+ */
+public class Sentence {
+    private Word firstWord;// head of the word chain; null until the first word is added
+    private final List<Word> waitWords = new ArrayList<>();// every word added so far, in insertion order
+    private List<String> keyWords;// segmentation result, indexed in sentence order
+    private int key;// category id of this sentence
+
+    public Sentence() {
+    }
+
+    /**
+     * @param key category id; a negative value is ignored and the key stays 0
+     */
+    public Sentence(int key) {
+        if (key > -1) {
+            this.key = key;
+        }
+    }
+
+    public List<String> getKeyWords() {
+        return keyWords;
+    }
+
+    public void setKeyWords(List<String> keyWords) {
+        this.keyWords = keyWords;
+    }
+
+    /**
+     * @return the live list of words added through {@link #setWord(String)}
+     */
+    public List<Word> getWaitWords() {
+        return waitWords;
+    }
+
+    /**
+     * @return the head of the word chain, or {@code null} if no word has been added
+     */
+    public Word getFirstWord() {
+        return firstWord;
+    }
+
+    public int getKey() {
+        return key;
+    }
+
+    public void setKey(int key) {
+        this.key = key;
+    }
+
+    /**
+     * Appends a new word to the end of the chain and to the wait list.
+     *
+     * @param word text of the new word
+     */
+    public void setWord(String word) {
+        Word node = new Word();
+        node.setWord(word);
+        lineWord(node);
+        waitWords.add(node);
+    }
+
+    /**
+     * Links {@code wordSon} behind the current last node of the chain and numbers it.
+     * The first node gets level 1, every later node the level of its predecessor plus one.
+     * The chain is walked with a loop, so its length is not limited by the call stack.
+     */
+    private void lineWord(Word wordSon) {
+        if (firstWord == null) {
+            firstWord = wordSon;
+            firstWord.setLv(1);
+            return;
+        }
+        Word tail = firstWord;
+        while (tail.getSon() != null) {
+            tail = tail.getSon();
+        }
+        wordSon.setLv(tail.getLv() + 1);
+        tail.setSon(wordSon);
+    }
+}
diff --git a/src/main/java/org/wlld/naturalLanguage/Talk.java b/src/main/java/org/wlld/naturalLanguage/Talk.java
new file mode 100644
index 0000000..c1cb75d
--- /dev/null
+++ b/src/main/java/org/wlld/naturalLanguage/Talk.java
@@ -0,0 +1,87 @@
+package org.wlld.naturalLanguage;
+
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Segments an input sentence into keywords using the word tree held by {@link WordTemple}.
+ *
+ * @author lidapeng
+ * @description sentence classification
+ * @date 4:14 PM 2020/2/23
+ */
+public class Talk {
+    // Clause separators: ASCII comma and full-width comma (U+FF0C), written as an escape
+    // so the pattern does not depend on the encoding of this source file.
+    private static final String CLAUSE_SEPARATORS = "[,\uFF0C]";
+    private List<WorldBody> allWorld = WordTemple.get().getAllWorld();// all words (top level of the word tree)
+
+    /**
+     * Splits {@code sentence} into clauses at commas, segments every clause and prints the
+     * keyword list of each clause to standard output.
+     *
+     * @param sentence text to segment
+     */
+    public void talk(String sentence) {
+        String[] sens = sentence.split(CLAUSE_SEPARATORS);
+        List<Sentence> sentences = new ArrayList<>();
+        for (String clause : sens) {
+            Sentence sentenceWords = new Sentence();
+            catchSentence(clause, sentenceWords);// segment the clause itself, not the whole input again
+            sentences.add(sentenceWords);
+        }
+        restructure(sentences);
+        for (Sentence sentence1 : sentences) {
+            System.out.println(sentence1.getKeyWords());
+        }
+    }
+
+    /**
+     * Adds every prefix of {@code sentence} (length 1, 2, ... up to the full text) to
+     * {@code sentenceWords}.
+     */
+    private void catchSentence(String sentence, Sentence sentenceWords) {
+        int len = sentence.length();
+        for (int i = 0; i < len; i++) {
+            sentenceWords.setWord(sentence.substring(0, i + 1));
+        }
+    }
+
+    /**
+     * Copies the stored frequency of each prefix onto the sentence's words, then lets the
+     * tokenizer cut every sentence into keywords.
+     */
+    private void restructure(List<Sentence> sentences) {
+        for (Sentence words : sentences) {
+            List<WorldBody> listWord = allWorld;
+            List<Word> waitWorld = words.getWaitWords();
+            for (Word word : waitWorld) {
+                WorldBody body = getBody(word.getWord(), listWord);
+                if (body == null) {
+                    // Prefix not in the word tree: stop descending instead of dereferencing
+                    // null; the remaining words keep their default frequency of 0.
+                    break;
+                }
+                listWord = body.getWorldBodies();
+                word.setWordFrequency(body.getWordFrequency());
+            }
+        }
+        Tokenizer tokenizer = new Tokenizer();
+        for (Sentence words : sentences) {
+            tokenizer.radiation(words);
+        }
+    }
+
+    /**
+     * @return the entry of {@code worlds} whose name equals {@code word}, or {@code null} if none
+     */
+    private WorldBody getBody(String word, List<WorldBody> worlds) {
+        for (WorldBody body : worlds) {
+            if (body.getWordName().equals(word)) {
+                return body;
+            }
+        }
+        return null;
+    }
+}
diff --git a/src/main/java/org/wlld/naturalLanguage/TemplateReader.java b/src/main/java/org/wlld/naturalLanguage/TemplateReader.java
new file mode 100644
index 0000000..02212c9
--- /dev/null
+++ b/src/main/java/org/wlld/naturalLanguage/TemplateReader.java
@@ -0,0 +1,103 @@
+package org.wlld.naturalLanguage;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.*;
+
+/**
+ * Template reader: reads a training file byte by byte and groups its lines by category.
+ * <p>
+ * On each line the bytes before {@code '#'} form the sentence and the decimal digits after
+ * it form the category id; a line without digits is stored under category 0.
+ */
+public class TemplateReader {
+    private final Map<Integer, List<String>> model = new HashMap<>();// training template: category id -> sentences
+    private String charsetName;// charset used to decode the collected line bytes
+
+    /**
+     * Reads the file, adds its lines to the model and then runs the tokenizer on the model.
+     *
+     * @param url         path of the template file
+     * @param charsetName name of the charset used to decode each line
+     * @param sys         {@link IOConst#WIN} if a line break may consist of two bytes (CR LF),
+     *                    any other value to treat every CR or LF byte as its own line break
+     * @throws Exception if the file cannot be read or the charset is not supported
+     */
+    public void read(String url, String charsetName, byte sys) throws Exception {
+        this.charsetName = charsetName;
+        LinkedList<Byte> span = new LinkedList<>();// sentence bytes of the current line
+        int again = 0;// on Windows: 1 after the first line-break byte until the second one arrives
+        int upNub = 0;// category id parsed so far on the current line
+        boolean isSymbol = false;// whether the separator has been seen on the current line
+        // try-with-resources closes the stream even when reading or decoding fails
+        try (InputStream is = new FileInputStream(new File(url))) {
+            int i;
+            while ((i = is.read()) > -1) {
+                if (i == IOConst.TYPE_Symbol) {// separator: what follows is the category id
+                    isSymbol = true;
+                } else if (i == IOConst.STOP_END || i == IOConst.STOP_NEXT) {
+                    isSymbol = false;
+                    again = again << 1 | 1;
+                    if (again == 1) {// first line-break byte: the line is complete
+                        addLine(upNub, span);
+                        upNub = 0;
+                        if (sys != IOConst.WIN) {
+                            again = 0;
+                        }
+                    } else {// second line-break byte on Windows: line was already stored
+                        again = 0;
+                    }
+                } else if (isSymbol) {
+                    if (i >= '0' && i <= '9') {// other bytes after the separator are ignored
+                        upNub = upNub * 10 + (i - '0');
+                    }
+                } else {
+                    span.add((byte) i);
+                }
+            }
+        }
+        if (!span.isEmpty() || upNub != 0) {
+            addLine(upNub, span);// last line of a file that does not end with a line break
+        }
+        word();
+    }
+
+    /**
+     * Runs a new {@link Tokenizer} on the lines collected so far.
+     */
+    public void word() {
+        Tokenizer tokenizer = new Tokenizer();
+        tokenizer.start(model);
+    }
+
+    /**
+     * Decodes the bytes in {@code mod} with the charset passed to the last {@code read} call.
+     * The list is emptied in the process.
+     *
+     * @param mod bytes to decode, in order
+     * @return the decoded string
+     * @throws UnsupportedEncodingException if the charset name is not supported
+     */
+    public String LinkToString(LinkedList<Byte> mod) throws UnsupportedEncodingException {
+        int b = mod.size();
+        byte[] be = new byte[b];
+        for (int i = 0; i < b; i++) {
+            be[i] = mod.poll();
+        }
+        return new String(be, charsetName);
+    }
+
+    /**
+     * Decodes {@code span} and appends the resulting sentence to the list of {@code category}.
+     */
+    private void addLine(int category, LinkedList<Byte> span) throws UnsupportedEncodingException {
+        List<String> lines = model.get(category);
+        if (lines == null) {
+            lines = new ArrayList<>();
+            model.put(category, lines);
+        }
+        lines.add(LinkToString(span));
+    }
+}
diff --git a/src/main/java/org/wlld/naturalLanguage/Tokenizer.java b/src/main/java/org/wlld/naturalLanguage/Tokenizer.java
new file mode 100644
index 0000000..5481285
--- /dev/null
+++ b/src/main/java/org/wlld/naturalLanguage/Tokenizer.java
@@ -0,0 +1,234 @@
+package org.wlld.naturalLanguage;
+
+import org.wlld.tools.ArithUtil;
+import org.wlld.tools.Frequency;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Tokenizer: counts every prefix of the training sentences in a shared word tree and cuts
+ * sentences into keywords by comparing the frequencies of consecutive prefixes.
+ *
+ * @author lidapeng
+ * @description tokenizer
+ * @date 7:42 AM 2020/2/23
+ */
+public class Tokenizer extends Frequency {
+    private List<Sentence> sentences = WordTemple.get().getSentences();// all sentences
+    private List<WorldBody> allWorld = WordTemple.get().getAllWorld();// all words (top level of the word tree)
+    private Word nowWord;// word at which the previous keyword of the current sentence ended
+
+    /**
+     * Counts the prefixes of the given sentences, then segments every sentence collected so far.
+     *
+     * @param model category id -> sentences of that category; category 0 is skipped
+     */
+    public void start(Map<Integer, List<String>> model) {
+        for (Map.Entry<Integer, List<String>> mod : model.entrySet()) {
+            int key = mod.getKey();// category
+            if (key == 0) {
+                continue;
+            }
+            for (String sentence : mod.getValue()) {
+                Sentence sentenceWords = new Sentence(key);
+                catchSentence(sentence, sentenceWords);
+                Word word = sentenceWords.getFirstWord();
+                if (word != null) {
+                    worldMuch(word, allWorld, key);// count the sentence's prefixes in the word tree
+                }
+            }
+        }
+        restructure();
+    }
+
+    private void test() {// debugging aid: prints the segmentation of every sentence
+        for (Sentence sentence : sentences) {
+            System.out.println(sentence.getKeyWords());
+        }
+    }
+
+    /**
+     * Copies the counted frequency of each prefix onto the words of every sentence, then
+     * segments every sentence.
+     */
+    private void restructure() {
+        for (Sentence words : sentences) {
+            List<WorldBody> listWord = allWorld;
+            List<Word> waitWorld = words.getWaitWords();
+            for (Word word : waitWorld) {
+                WorldBody body = getBody(word.getWord(), listWord);
+                if (body == null) {// prefix missing from the tree: no child list to descend into
+                    break;
+                }
+                listWord = body.getWorldBodies();
+                word.setWordFrequency(body.getWordFrequency());
+            }
+        }
+        for (Sentence words : sentences) {
+            radiation(words);
+        }
+    }
+
+    /**
+     * Cuts a sentence into keywords and stores them with {@link Sentence#setKeyWords}.
+     * <p>
+     * Starting at the first prefix, {@link #keyWord} advances along the chain until it decides
+     * to cut; the text added since the previous cut becomes the next keyword.
+     *
+     * @param sentenceWords sentence whose words already carry their frequencies
+     */
+    public void radiation(Sentence sentenceWords) {
+        nowWord = null;
+        Word firstWord = sentenceWords.getFirstWord();
+        KeyWord word = new KeyWord();
+        word.setWord(firstWord);
+        word.setOk(false);
+        List<String> keyWords = new ArrayList<>();
+        while (word.getWord() != null) {
+            // NOTE(review): every segment is seeded with the frequency of the sentence's first
+            // prefix, not of the prefix the segment starts at -- confirm this is intended.
+            word = keyWord(-1, word, new double[]{firstWord.getWordFrequency()});
+            Word myWord = word.getWord();
+            String wordT = myWord.getWord();// prefix up to the cut
+            String keyWord;
+            if (nowWord == null) {// first keyword of the sentence: the prefix itself
+                keyWord = wordT;
+            } else {// later keyword: drop the part covered by earlier keywords
+                keyWord = wordT.substring(nowWord.getWord().length());
+            }
+            keyWords.add(keyWord);
+            nowWord = myWord;
+            word.setOk(false);
+            word.setWord(word.getWord().getSon());
+        }
+        sentenceWords.setKeyWords(keyWords);
+    }
+
+    /**
+     * @return a copy of {@code diff} with the frequency of {@code word} appended
+     */
+    private double[] getDiff(double[] diff, Word word) {
+        double[] diffef = new double[diff.length + 1];
+        System.arraycopy(diff, 0, diffef, 0, diff.length);
+        diffef[diff.length] = word.getWordFrequency();
+        return diffef;
+    }
+
+    /**
+     * Advances {@code words} along the chain until the current word should end a keyword.
+     * <p>
+     * The walk stops (and marks {@code words} as ok) when the current word has no son, when the
+     * scaled frequency drop to the son exceeds the average drop over the rest of the chain, or
+     * when the scaled dispersion coefficient of the frequencies collected so far exceeds the
+     * one of the previous step.
+     *
+     * @param dm    dispersion coefficient of the previous step; -1 on the first step
+     * @param words cursor holding the current word
+     * @param diff  frequencies collected so far
+     * @return {@code words}, positioned on the last word of the keyword
+     */
+    private KeyWord keyWord(double dm, KeyWord words, double[] diff) {
+        if (words.isOk()) {
+            return words;
+        }
+        Word word = words.getWord();
+        Word son = word.getSon();
+        if (son == null) {// end of the chain: cut here
+            words.setOk(true);
+            return words;
+        }
+        double db = wordEnd(word);// average frequency drop from here to the end of the chain
+        // the scaled drop to the son must not exceed the average drop ahead
+        boolean isAvgOk = ArithUtil.mul(word.getWordFrequency() - son.getWordFrequency(), WordConst.Word_Noise) <= db;
+        if (!isAvgOk) {// cut here
+            words.setOk(true);
+            return words;
+        }
+        diff = getDiff(diff, son);
+        double right = dc(diff);
+        if (dm > -1 && !(ArithUtil.mul(right, WordConst.Word_Noise) <= dm)) {// cut here
+            words.setOk(true);
+            return words;
+        }
+        // first step, or dispersion still within bounds: keep exploring
+        words.setOk(false);
+        words.setWord(son);
+        return keyWord(right, words, diff);
+    }
+
+    /**
+     * @return the average frequency drop between consecutive words from {@code word} to the end
+     * of its chain
+     */
+    private double wordEnd(Word word) {
+        List<Integer> drops = new ArrayList<>();
+        for (Word son = word.getSon(); son != null; son = son.getSon()) {
+            drops.add(word.getWordFrequency() - son.getWordFrequency());
+            word = son;
+        }
+        double[] allNub = new double[drops.size()];
+        for (int i = 0; i < allNub.length; i++) {
+            allNub[i] = drops.get(i);
+        }
+        return average(allNub);
+    }
+
+    /**
+     * @return the entry of {@code worlds} whose name equals {@code word}, or {@code null} if none
+     */
+    private WorldBody getBody(String word, List<WorldBody> worlds) {
+        for (WorldBody body : worlds) {
+            if (body.getWordName().equals(word)) {
+                return body;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Adds every prefix of {@code sentence} to {@code sentenceWords} and registers the sentence
+     * in the shared sentence list.
+     */
+    private void catchSentence(String sentence, Sentence sentenceWords) {
+        int len = sentence.length();
+        for (int i = 0; i < len; i++) {
+            sentenceWords.setWord(sentence.substring(0, i + 1));
+        }
+        sentences.add(sentenceWords);
+    }
+
+    /**
+     * Counts {@code word} and the words chained behind it in the tree level {@code worldBodies},
+     * creating the missing nodes.
+     */
+    private void worldMuch(Word word, List<WorldBody> worldBodies, int type) {
+        WorldBody myWorld = getBody(word.getWord(), worldBodies);
+        if (myWorld == null) {// not in the tree yet: create this node and all nodes below it
+            saveList(word, worldBodies, type);
+            return;
+        }
+        myWorld.addNub(type);
+        if (word.getSon() != null) {// not at the last level yet
+            worldMuch(word.getSon(), myWorld.getWorldBodies(), type);
+        }
+    }
+
+    /**
+     * Creates a node for {@code word} in {@code myWorld} and, recursively, for every word
+     * chained behind it, counting each once.
+     */
+    private void saveList(Word word, List<WorldBody> myWorld, int type) {
+        WorldBody body = new WorldBody();
+        List<WorldBody> list = new ArrayList<>();
+        body.setWordName(word.getWord());
+        body.addNub(type);
+        body.setWorldBodies(list);
+        body.setWord(word);
+        myWorld.add(body);
+        if (word.getSon() != null) {
+            saveList(word.getSon(), list, type);
+        }
+    }
+}
diff --git a/src/main/java/org/wlld/naturalLanguage/Word.java b/src/main/java/org/wlld/naturalLanguage/Word.java
new file mode 100644
index 0000000..0fa610f
--- /dev/null
+++ b/src/main/java/org/wlld/naturalLanguage/Word.java
@@ -0,0 +1,41 @@
+package org.wlld.naturalLanguage;
+
+
+public class Word {// one node of a sentence's word chain
+ private String word;// text of this node
+ private Word son;// next node in the chain, or null at the end
+ private int wordFrequency;// word frequency
+ private int lv;// position of this word in the time series (sequence order)
+
+ public int getLv() {
+ return lv;
+ }
+
+ public void setLv(int lv) {
+ this.lv = lv;
+ }
+
+ public String getWord() {
+ return word;
+ }
+
+ public void setWord(String word) {
+ this.word = word;
+ }
+
+ public Word getSon() {
+ return son;
+ }
+
+ public void setSon(Word son) {
+ this.son = son;
+ }
+
+ public int getWordFrequency() {
+ return wordFrequency;
+ }
+
+ public void setWordFrequency(int wordFrequency) {
+ this.wordFrequency = wordFrequency;
+ }
+}
diff --git a/src/main/java/org/wlld/naturalLanguage/WordConst.java b/src/main/java/org/wlld/naturalLanguage/WordConst.java
new file mode 100644
index 0000000..27e0a15
--- /dev/null
+++ b/src/main/java/org/wlld/naturalLanguage/WordConst.java
@@ -0,0 +1,5 @@
+package org.wlld.naturalLanguage;
+
+public class WordConst {// tuning values for word segmentation
+ public static double Word_Noise = 0.7;// degree of shrinkage; Tokenizer.keyWord multiplies by it before its threshold comparisons
+}
diff --git a/src/main/java/org/wlld/naturalLanguage/WordTemple.java b/src/main/java/org/wlld/naturalLanguage/WordTemple.java
new file mode 100644
index 0000000..8a9df9f
--- /dev/null
+++ b/src/main/java/org/wlld/naturalLanguage/WordTemple.java
@@ -0,0 +1,43 @@
+package org.wlld.naturalLanguage;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Singleton holding the sentence list and the word tree that Tokenizer and Talk read.
+ *
+ * @author lidapeng
+ * @description word segmentation template
+ * @date 4:15 PM 2020/2/23
+ */
+public class WordTemple {
+    private static final WordTemple Word_Temple = new WordTemple();
+    private List<Sentence> sentences = new ArrayList<>();// all sentences
+    private List<WorldBody> allWorld = new ArrayList<>();// all words (top level of the word tree)
+
+    private WordTemple() {
+    }
+
+    /**
+     * @return the single shared instance
+     */
+    public static WordTemple get() {
+        return Word_Temple;
+    }
+
+    public List<Sentence> getSentences() {
+        return sentences;
+    }
+
+    public void setSentences(List<Sentence> sentences) {
+        this.sentences = sentences;
+    }
+
+    public List<WorldBody> getAllWorld() {
+        return allWorld;
+    }
+
+    public void setAllWorld(List<WorldBody> allWorld) {
+        this.allWorld = allWorld;
+    }
+}
diff --git a/src/main/java/org/wlld/naturalLanguage/WorldBody.java b/src/main/java/org/wlld/naturalLanguage/WorldBody.java
new file mode 100644
index 0000000..bf462d7
--- /dev/null
+++ b/src/main/java/org/wlld/naturalLanguage/WorldBody.java
@@ -0,0 +1,58 @@
+package org.wlld.naturalLanguage;
+
+import java.util.List;
+
+/**
+ * Node of the word tree: a word, how often it was counted and its child nodes.
+ */
+public class WorldBody {
+    private String wordName;// the word
+    private int wordFrequency;// word frequency: number of addNub calls for this word
+    private List<WorldBody> worldBodies;// child nodes ("radiation" set)
+    private Word word;
+    private int type = 0;// category recorded by addNub; 0 means none yet
+
+    public String getWordName() {
+        return wordName;
+    }
+
+    public void setWordName(String wordName) {
+        this.wordName = wordName;
+    }
+
+    public int getWordFrequency() {
+        return wordFrequency;
+    }
+
+    /**
+     * Counts one more occurrence of this word.
+     * <p>
+     * The category is stored the first time only. The previous check ({@code this.type != 0})
+     * could never pass because {@code type} starts at 0 and is assigned nowhere else, so the
+     * category was never recorded.
+     *
+     * @param type category of the sentence the occurrence came from
+     */
+    public void addNub(int type) {
+        if (this.type == 0) {
+            this.type = type;
+        }
+        wordFrequency++;
+    }
+
+    public List<WorldBody> getWorldBodies() {
+        return worldBodies;
+    }
+
+    public void setWorldBodies(List<WorldBody> worldBodies) {
+        this.worldBodies = worldBodies;
+    }
+
+    public Word getWord() {
+        return word;
+    }
+
+    public void setWord(Word word) {
+        this.word = word;
+    }
+}
diff --git a/src/main/java/org/wlld/randomForest/RandomForest.java b/src/main/java/org/wlld/randomForest/RandomForest.java
index b7e1145..7e4f97d 100644
--- a/src/main/java/org/wlld/randomForest/RandomForest.java
+++ b/src/main/java/org/wlld/randomForest/RandomForest.java
@@ -79,14 +79,14 @@ public class RandomForest {
}
}
- public void study() throws Exception {
+ public void study() throws Exception {// learn: calls study() on every tree in the forest
for (int i = 0; i < forest.length; i++) {
Tree tree = forest[i];
tree.study();
}
}
- public void insert(Object object) {
+ public void insert(Object object) {//添加学习参数
for (int i = 0; i < forest.length; i++) {
Tree tree = forest[i];
tree.getDataTable().insert(object);
diff --git a/src/main/java/org/wlld/tools/Frequency.java b/src/main/java/org/wlld/tools/Frequency.java
new file mode 100644
index 0000000..eb77b24
--- /dev/null
+++ b/src/main/java/org/wlld/tools/Frequency.java
@@ -0,0 +1,53 @@
+package org.wlld.tools;
+
+public abstract class Frequency {// frequency statistics helpers
+
+ public double average(double... m) {// compute the mean: sum of m divided by m.length via ArithUtil.div
+ int len = m.length;
+ double allNub = 0;
+ for (int i = 0; i < len; i++) {
+ allNub = allNub + m[i];
+ }
+ allNub = ArithUtil.div(allNub, len);
+ return allNub;
+ }
+
+ public double getPointLength(double x, double y, double i, double j) {// Euclidean distance between the 2-D points (x, y) and (i, j)
+ return Math.sqrt(ArithUtil.add(Math.pow(ArithUtil.sub(x, i), 2), Math.pow(ArithUtil.sub(y, j), 2)));
+ }
+
+ public double variance(double... m) {// compute the variance (sum of squared deviations divided by m.length)
+ double ave = average(m);// compute the mean first
+ double allNub = 0;
+ for (int i = 0; i < m.length; i++) {
+ allNub = allNub + Math.pow(m[i] - ave, 2);
+ }
+ double var = ArithUtil.div(allNub, m.length);
+ return var;
+ }
+
+ public double sd(double... m) {// compute the standard deviation (square root of the variance)
+ double var = variance(m);
+ return Math.sqrt(var);
+ }
+
+ public double dc(double... m) {// compute the dispersion coefficient (standard deviation divided by the mean)
+ double ave = average(m);// compute the mean first
+ double allNub = 0;
+ for (int i = 0; i < m.length; i++) {
+ allNub = allNub + Math.pow(m[i] - ave, 2);
+ }
+ double dc = ArithUtil.div(Math.sqrt(ArithUtil.div(allNub, m.length)), ave);// dispersion coefficient
+ return dc;
+ }
+
+ public double softMax(int t, double... m) {// index and array: exp(m[t]) divided by the sum of exp over all elements of m
+ double my = Math.exp(m[t]);
+ double all = 0.0;
+ int allLength = m.length;
+ for (int i = 0; i < allLength; i++) {
+ all = all + Math.exp(m[i]);
+ }
+ return ArithUtil.div(my, all);
+ }
+}
diff --git a/src/test/java/org/wlld/LangTest.java b/src/test/java/org/wlld/LangTest.java
new file mode 100644
index 0000000..47b776b
--- /dev/null
+++ b/src/test/java/org/wlld/LangTest.java
@@ -0,0 +1,23 @@
+package org.wlld;
+
+import org.wlld.naturalLanguage.IOConst;
+import org.wlld.naturalLanguage.Talk;
+import org.wlld.naturalLanguage.TemplateReader;
+
+/**
+ * @author lidapeng
+ * @description manual smoke test: reads a template file, then segments one sample sentence
+ * @date 2:07 PM 2020/2/23
+ */
+public class LangTest {
+ public static void main(String[] args) throws Exception {
+ test();
+ }
+
+ public static void test() throws Exception {// needs the template file at the hard-coded path below
+ TemplateReader templateReader = new TemplateReader();
+ templateReader.read("/Users/lidapeng/Desktop/myDocment/a.txt", "UTF-8", IOConst.NOT_WIN);// read() also runs the tokenizer on what it read
+ Talk talk = new Talk();
+ talk.talk("我要吃面包");// prints the keyword list of each clause
+ }
+}