diff --git a/.idea/compiler.xml b/.idea/compiler.xml index 6aa88ff..d280c68 100644 --- a/.idea/compiler.xml +++ b/.idea/compiler.xml @@ -6,8 +6,8 @@ - + diff --git a/src/main/java/org/wlld/naturalLanguage/IOConst.java b/src/main/java/org/wlld/naturalLanguage/IOConst.java new file mode 100644 index 0000000..dad856d --- /dev/null +++ b/src/main/java/org/wlld/naturalLanguage/IOConst.java @@ -0,0 +1,10 @@ +package org.wlld.naturalLanguage; + +public class IOConst { + public static final byte TYPE_Symbol = 0x23;//#号键 + public static final byte STOP_END = 10; + public static final byte STOP_NEXT = 13; + public static final byte WIN = 1;//windows系统 + public static final byte NOT_WIN = 2;//非Windows系统 + public static final byte CORE_Number = 6;//核心数 +} diff --git a/src/main/java/org/wlld/naturalLanguage/KeyWord.java b/src/main/java/org/wlld/naturalLanguage/KeyWord.java new file mode 100644 index 0000000..038ea3e --- /dev/null +++ b/src/main/java/org/wlld/naturalLanguage/KeyWord.java @@ -0,0 +1,22 @@ +package org.wlld.naturalLanguage; + +public class KeyWord { + private Word word;//关键字 + private boolean isOk;//是否完成此关键字 + + public Word getWord() { + return word; + } + + public void setWord(Word word) { + this.word = word; + } + + public boolean isOk() { + return isOk; + } + + public void setOk(boolean ok) { + isOk = ok; + } +} diff --git a/src/main/java/org/wlld/naturalLanguage/Sentence.java b/src/main/java/org/wlld/naturalLanguage/Sentence.java new file mode 100644 index 0000000..4035cf8 --- /dev/null +++ b/src/main/java/org/wlld/naturalLanguage/Sentence.java @@ -0,0 +1,73 @@ +package org.wlld.naturalLanguage; + +import org.omg.Messaging.SYNC_WITH_TRANSPORT; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author lidapeng + * @description + * @date 8:01 上午 2020/2/23 + */ +public class Sentence { + private Word firstWord; + private List waitWords = new ArrayList<>();//词 + private List keyWords;//分词结果下标按照时间序列排序 + private int key; + + public List getKeyWords() { + return keyWords; + } + + public void setKeyWords(List keyWords) { + this.keyWords = keyWords; + } + + public List getWaitWords() { + return waitWords; + } + + public Word getFirstWord() { + return firstWord; + } + + public Sentence() { + + } + + public Sentence(int key) { + if (key > -1) { + this.key = key; + } + } + + public int getKey() { + return key; + } + + public void setKey(int key) { + this.key = key; + } + + private void lineWord(Word word, Word wordSon) {//给词连线 + if (firstWord != null) { + if (word.getSon() != null) {//右连接不是空的 + lineWord(word.getSon(), wordSon); + } else {//右连接是空的 + wordSon.setLv(word.getLv() + 1); + word.setSon(wordSon); + } + } else { + firstWord = wordSon; + firstWord.setLv(1); + } + } + + public void setWord(String word) {//编号 + Word word1 = new Word(); + word1.setWord(word); + lineWord(firstWord, word1);//词之间做连线 + waitWords.add(word1); + } +} diff --git a/src/main/java/org/wlld/naturalLanguage/Talk.java b/src/main/java/org/wlld/naturalLanguage/Talk.java new file mode 100644 index 0000000..c1cb75d --- /dev/null +++ b/src/main/java/org/wlld/naturalLanguage/Talk.java @@ -0,0 +1,77 @@ +package org.wlld.naturalLanguage; + + +import java.util.ArrayList; +import java.util.List; + +/** + * @author lidapeng + * @description 语句分类 + * @date 4:14 下午 2020/2/23 + */ +public class Talk { + private List allWorld = WordTemple.get().getAllWorld();//所有词集合 + + public void talk(String sentence) { + String rgm = null; + if (sentence.indexOf(",") > -1) { + rgm = ","; + } else if (sentence.indexOf(",") > -1) { + rgm = ","; + } + String[] sens; + if (rgm != null) { + sens = sentence.split(rgm); + } else { + sens = new String[]{sentence}; + } + //拆词 + List sentences = new ArrayList<>(); + for (int i = 0; i < sens.length; i++) { + Sentence sentenceWords = new Sentence(); + catchSentence(sentence, sentenceWords); + sentences.add(sentenceWords); + } + restructure(sentences); + for (Sentence sentence1 : sentences) { + System.out.println(sentence1.getKeyWords()); + } + } + + private void catchSentence(String sentence, Sentence sentenceWords) {//把句子拆开 + int len = sentence.length(); + for (int i = 0; i < len; i++) { + String word = sentence.substring(0, i + 1); + sentenceWords.setWord(word); + } + + } + + private void restructure(List sentences) {//对句子里面的Word进行词频统计 + for (Sentence words : sentences) { + List listWord = allWorld; + List waitWorld = words.getWaitWords(); + for (Word word : waitWorld) { + String myWord = word.getWord(); + WorldBody body = getBody(myWord, listWord); + listWord = body.getWorldBodies(); + word.setWordFrequency(body.getWordFrequency()); + } + } + Tokenizer tokenizer = new Tokenizer(); + for (Sentence words : sentences) { + tokenizer.radiation(words); + } + } + + private WorldBody getBody(String word, List worlds) { + WorldBody myBody = null; + for (WorldBody body : worlds) { + if (body.getWordName().hashCode() == word.hashCode() && body.getWordName().equals(word)) { + myBody = body; + break; + } + } + return myBody; + } +} diff --git a/src/main/java/org/wlld/naturalLanguage/TemplateReader.java b/src/main/java/org/wlld/naturalLanguage/TemplateReader.java new file mode 100644 index 0000000..02212c9 --- /dev/null +++ b/src/main/java/org/wlld/naturalLanguage/TemplateReader.java @@ -0,0 +1,81 @@ +package org.wlld.naturalLanguage; + +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.util.*; + +public class TemplateReader {//模板读取类 + private Map> model = new HashMap<>();//训练模板 + private String charsetName; + + public void read(String url, String charsetName, byte sys) throws Exception { + this.charsetName = charsetName; + File file = new File(url); + InputStream is = new FileInputStream(file); + int i; + LinkedList span = new LinkedList<>(); + int hang = 0; + int again = 0; + int upNub = 0; + boolean isSymbol = false;//是否遇到分隔符 + while ((i = is.read()) > -1) { + if (i == IOConst.TYPE_Symbol) {//遇到分隔符号 + isSymbol = true; + } else { + if (i == IOConst.STOP_END || i == IOConst.STOP_NEXT) { + isSymbol = false; + again = again << 1 | 1; + if (again == 1) {//第一次进入 + List lr = model.get(upNub); + //addEnd(span); + if (lr != null) { + lr.add(LinkToString(span)); + } else { + List lis = new ArrayList<>(); + lis.add(LinkToString(span)); + model.put(upNub, lis); + } + upNub = 0; + hang++; + if (sys != IOConst.WIN) { + again = 0; + } + } else { + again = 0; + } + } else { + if (isSymbol) { + int type = i; + if (type >= 48 && type <= 57) { + type = type - 48; + if (upNub == 0) { + upNub = type; + } else { + upNub = upNub * 10 + type; + } + } + } else { + span.add((byte) i); + } + } + } + } + word(); + } + + public void word() { + Tokenizer tokenizer = new Tokenizer(); + tokenizer.start(model); + } + + public String LinkToString(LinkedList mod) throws UnsupportedEncodingException { + int b = mod.size(); + byte[] be = new byte[b]; + for (int i = 0; i < b; i++) { + be[i] = mod.poll(); + } + return new String(be, charsetName); + } +} diff --git a/src/main/java/org/wlld/naturalLanguage/Tokenizer.java b/src/main/java/org/wlld/naturalLanguage/Tokenizer.java new file mode 100644 index 0000000..5481285 --- /dev/null +++ b/src/main/java/org/wlld/naturalLanguage/Tokenizer.java @@ -0,0 +1,204 @@ +package org.wlld.naturalLanguage; + +import org.wlld.tools.ArithUtil; +import org.wlld.tools.Frequency; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * @author lidapeng + * @description 分词器 + * @date 7:42 上午 2020/2/23 + */ +public class Tokenizer extends Frequency { + private List sentences = WordTemple.get().getSentences();//所有断句 + private List allWorld = WordTemple.get().getAllWorld();//所有词集合 + private Word nowWord;//上一次出现的关键字 + + public void start(Map> model) { + //model的主键是类别,值是该类别语句的集合 + for (Map.Entry> mod : model.entrySet()) { + if (mod.getKey() != 0) { + List st = mod.getValue();//语句 + int key = mod.getKey();//类别 + for (String sentence : st) {//遍历每个类别的每个语句 + Sentence sentenceWords = new Sentence(key); + catchSentence(sentence, sentenceWords); + Word word = sentenceWords.getFirstWord(); + if (word != null) { + worldMuch(word, allWorld, key);//构建句子内的层级关系并添加词频 + } + } + } + } + restructure();//对集合中的词进行词频统计 + //这里分词已经结束,对词进行编号 + //test(); + } + + private void test() {//分词测试类 + for (Sentence sentence : sentences) { + System.out.println(sentence.getKeyWords()); + } + } + + private void restructure() {//对句子里面的Word进行词频统计 + for (Sentence words : sentences) { + List listWord = allWorld; + List waitWorld = words.getWaitWords(); + for (Word word : waitWorld) { + String myWord = word.getWord(); + WorldBody body = getBody(myWord, listWord); + listWord = body.getWorldBodies(); + word.setWordFrequency(body.getWordFrequency()); + } + } + for (Sentence words : sentences) { + radiation(words); + } + } + + public void radiation(Sentence sentenceWords) {//对句子中的词开始辐射延伸 + //首先词与它自己的右节点和左节点进行比较 + nowWord = null; + Word firstWord = sentenceWords.getFirstWord(); + KeyWord word = new KeyWord(); + word.setWord(firstWord); + word.setOk(false); + List keyWords = new ArrayList<>(); + while (word.getWord() != null) { + word = keyWord(-1, word, new double[]{firstWord.getWordFrequency()}); + Word myWord = word.getWord(); + String wordT = myWord.getWord();//当前截取到的分词串 + String keyWord; + if (nowWord == null) {//这句话的第一个分词还没有产生***** + //此时的分词结果就是WordT + keyWord = wordT; + } else {//之前产生了分词 + keyWord = wordT.substring(nowWord.getWord().length()); + } + keyWords.add(keyWord); + nowWord = myWord; + word.setOk(false); + word.setWord(word.getWord().getSon()); + } + sentenceWords.setKeyWords(keyWords); + } + + private double[] getDiff(double[] diff, Word word) { + double[] diffef = new double[diff.length + 1]; + for (int i = 0; i < diffef.length; i++) { + if (i == diffef.length - 1) { + diffef[i] = word.getWordFrequency(); + } else { + diffef[i] = diff[i]; + } + } + return diffef; + } + + private KeyWord keyWord(double dm, KeyWord words, double[] diff) {//平均差值,离散系数,是否为关键字 + double right = 0; + boolean bm = words.isOk(); + if (!bm) { + Word word = words.getWord(); + if (word.getSon() != null) { + double db = wordEnd(word, new ArrayList<>(), 0);//计算身前平均值 + //与它儿子词频的差要小于辐射向前的词频差的平均值 + boolean isAvgOk = (ArithUtil.mul(word.getWordFrequency() - word.getSon().getWordFrequency(), WordConst.Word_Noise)) <= db; + if (isAvgOk) {//平均值检测 + diff = getDiff(diff, word.getSon()); + right = dc(diff); + if (dm > -1) { + if (ArithUtil.mul(right, WordConst.Word_Noise) <= dm) {//继续向下探索 + words.setOk(false); + words.setWord(word.getSon()); + words = keyWord(right, words, diff); + } else {//截断,停止探索 + words.setOk(true);//是关键字 + } + } else {//第一次 继续向下探索 + words.setOk(false); + words.setWord(word.getSon()); + words = keyWord(right, words, diff); + } + } else {//截断 停止探索 + words.setOk(true); + } + } else {//截断 停止探索 + words.setOk(true); + } + } + return words; + } + + private double wordEnd(Word word, List av, double a) {//对一句话中的词进行处理 + //先取全句平均差值 + Word son = word.getSon(); + if (son != null) { + av.add(word.getWordFrequency() - son.getWordFrequency()); + a = wordEnd(son, av, a); + } else {//最后计算平均值 + double[] allNub = new double[av.size()]; + for (int i = 0; i < av.size(); i++) { + allNub[i] = av.get(i); + } + a = average(allNub);//平均差值 + } + return a; + } + + private WorldBody getBody(String word, List worlds) { + WorldBody myBody = null; + for (WorldBody body : worlds) { + if (body.getWordName().hashCode() == word.hashCode() && body.getWordName().equals(word)) { + myBody = body; + break; + } + } + return myBody; + } + + private void catchSentence(String sentence, Sentence sentenceWords) {//把句子拆开 + int len = sentence.length(); + for (int i = 0; i < len; i++) { + String word = sentence.substring(0, i + 1); + sentenceWords.setWord(word); + } + sentences.add(sentenceWords); + } + + private void worldMuch(Word word, List worldBodies, int type) {//分类词频处理 + boolean bm = false; + String check = word.getWord(); + for (WorldBody myWorld : worldBodies) { + String waitCheck = myWorld.getWordName(); + if (waitCheck.hashCode() == check.hashCode() && waitCheck.equals(check)) { + bm = true; + myWorld.addNub(type); + if (word.getSon() != null) {//没有找到最后一级了 + worldMuch(word.getSon(), myWorld.getWorldBodies(), type); + } + break; + } + } + if (!bm) {//找不到了 + saveList(word, worldBodies, type); + } + } + + private void saveList(Word word, List myWorld, int type) {//保存新词 + WorldBody body = new WorldBody(); + List list = new ArrayList<>(); + body.setWordName(word.getWord()); + body.addNub(type); + body.setWorldBodies(list); + body.setWord(word); + myWorld.add(body); + if (word.getSon() != null) { + saveList(word.getSon(), list, type); + } + } +} diff --git a/src/main/java/org/wlld/naturalLanguage/Word.java b/src/main/java/org/wlld/naturalLanguage/Word.java new file mode 100644 index 0000000..0fa610f --- /dev/null +++ b/src/main/java/org/wlld/naturalLanguage/Word.java @@ -0,0 +1,41 @@ +package org.wlld.naturalLanguage; + + +public class Word { + private String word; + private Word son; + private int wordFrequency;//词频 + private int lv;//该词的时间序列 + + public int getLv() { + return lv; + } + + public void setLv(int lv) { + this.lv = lv; + } + + public String getWord() { + return word; + } + + public void setWord(String word) { + this.word = word; + } + + public Word getSon() { + return son; + } + + public void setSon(Word son) { + this.son = son; + } + + public int getWordFrequency() { + return wordFrequency; + } + + public void setWordFrequency(int wordFrequency) { + this.wordFrequency = wordFrequency; + } +} diff --git a/src/main/java/org/wlld/naturalLanguage/WordConst.java b/src/main/java/org/wlld/naturalLanguage/WordConst.java new file mode 100644 index 0000000..27e0a15 --- /dev/null +++ b/src/main/java/org/wlld/naturalLanguage/WordConst.java @@ -0,0 +1,5 @@ +package org.wlld.naturalLanguage; + +public class WordConst { + public static double Word_Noise = 0.7;//收缩程度 +} diff --git a/src/main/java/org/wlld/naturalLanguage/WordTemple.java b/src/main/java/org/wlld/naturalLanguage/WordTemple.java new file mode 100644 index 0000000..8a9df9f --- /dev/null +++ b/src/main/java/org/wlld/naturalLanguage/WordTemple.java @@ -0,0 +1,38 @@ +package org.wlld.naturalLanguage; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author lidapeng + * @description分词模版 + * @date 4:15 下午 2020/2/23 + */ +public class WordTemple { + private static WordTemple Word_Temple = new WordTemple(); + private List sentences = new ArrayList<>();//所有断句 + private List allWorld = new ArrayList<>();//所有词集合 + + private WordTemple() { + } + + public static WordTemple get() { + return Word_Temple; + } + + public List getSentences() { + return sentences; + } + + public void setSentences(List sentences) { + this.sentences = sentences; + } + + public List getAllWorld() { + return allWorld; + } + + public void setAllWorld(List allWorld) { + this.allWorld = allWorld; + } +} diff --git a/src/main/java/org/wlld/naturalLanguage/WorldBody.java b/src/main/java/org/wlld/naturalLanguage/WorldBody.java new file mode 100644 index 0000000..bf462d7 --- /dev/null +++ b/src/main/java/org/wlld/naturalLanguage/WorldBody.java @@ -0,0 +1,46 @@ +package org.wlld.naturalLanguage; + +import java.util.List; + +public class WorldBody { + private String wordName;//词 + private int wordFrequency;//词频 + private List worldBodies;//辐射集合 + private Word word; + private int type = 0; + + public String getWordName() { + return wordName; + } + + public void setWordName(String wordName) { + this.wordName = wordName; + } + + public int getWordFrequency() { + return wordFrequency; + } + + public void addNub(int type) { + if (this.type != 0) { + this.type = type; + } + wordFrequency++; + } + + public List getWorldBodies() { + return worldBodies; + } + + public void setWorldBodies(List worldBodies) { + this.worldBodies = worldBodies; + } + + public Word getWord() { + return word; + } + + public void setWord(Word word) { + this.word = word; + } +} diff --git a/src/main/java/org/wlld/randomForest/RandomForest.java b/src/main/java/org/wlld/randomForest/RandomForest.java index b7e1145..7e4f97d 100644 --- a/src/main/java/org/wlld/randomForest/RandomForest.java +++ b/src/main/java/org/wlld/randomForest/RandomForest.java @@ -79,14 +79,14 @@ public class RandomForest { } } - public void study() throws Exception { + public void study() throws Exception {//学习 for (int i = 0; i < forest.length; i++) { Tree tree = forest[i]; tree.study(); } } - public void insert(Object object) { + public void insert(Object object) {//添加学习参数 for (int i = 0; i < forest.length; i++) { Tree tree = forest[i]; tree.getDataTable().insert(object); diff --git a/src/main/java/org/wlld/tools/Frequency.java b/src/main/java/org/wlld/tools/Frequency.java new file mode 100644 index 0000000..eb77b24 --- /dev/null +++ b/src/main/java/org/wlld/tools/Frequency.java @@ -0,0 +1,53 @@ +package org.wlld.tools; + +public abstract class Frequency {//统计频数 + + public double average(double... m) {//计算平均值 + int len = m.length; + double allNub = 0; + for (int i = 0; i < len; i++) { + allNub = allNub + m[i]; + } + allNub = ArithUtil.div(allNub, len); + return allNub; + } + + public double getPointLength(double x, double y, double i, double j) {//获取两个二维坐标之间的欧式距离 + return Math.sqrt(ArithUtil.add(Math.pow(ArithUtil.sub(x, i), 2), Math.pow(ArithUtil.sub(y, j), 2))); + } + + public double variance(double... m) {//计算方差 + double ave = average(m);//先计算出平均值 + double allNub = 0; + for (int i = 0; i < m.length; i++) { + allNub = allNub + Math.pow(m[i] - ave, 2); + } + double var = ArithUtil.div(allNub, m.length); + return var; + } + + public double sd(double... m) {//计算标准差 + double var = variance(m); + return Math.sqrt(var); + } + + public double dc(double... m) {//计算离散系数 + double ave = average(m);//先计算出平均值 + double allNub = 0; + for (int i = 0; i < m.length; i++) { + allNub = allNub + Math.pow(m[i] - ave, 2); + } + double dc = ArithUtil.div(Math.sqrt(ArithUtil.div(allNub, m.length)), ave);//离散系数 + return dc; + } + + public double softMax(int t, double... m) {//下标和数组 + double my = Math.exp(m[t]); + double all = 0.0; + int allLength = m.length; + for (int i = 0; i < allLength; i++) { + all = all + Math.exp(m[i]); + } + return ArithUtil.div(my, all); + } +} diff --git a/src/test/java/org/wlld/LangTest.java b/src/test/java/org/wlld/LangTest.java new file mode 100644 index 0000000..47b776b --- /dev/null +++ b/src/test/java/org/wlld/LangTest.java @@ -0,0 +1,23 @@ +package org.wlld; + +import org.wlld.naturalLanguage.IOConst; +import org.wlld.naturalLanguage.Talk; +import org.wlld.naturalLanguage.TemplateReader; + +/** + * @author lidapeng + * @description + * @date 2:07 下午 2020/2/23 + */ +public class LangTest { + public static void main(String[] args) throws Exception { + test(); + } + + public static void test() throws Exception { + TemplateReader templateReader = new TemplateReader(); + templateReader.read("/Users/lidapeng/Desktop/myDocment/a.txt", "UTF-8", IOConst.NOT_WIN); + Talk talk = new Talk(); + talk.talk("我要吃面包"); + } +}