增加分词器

5 years ago · c94423ace7
parent 8abe63b1fc
commit c94423ace7
14 changed files with 676 additions and 3 deletions
--- a/.idea/compiler.xml
+++ b/.idea/compiler.xml
@ -6,8 +6,8 @@
        <sourceOutputDir name="target/generated-sources/annotations" />
        <sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
        <outputRelativeToContentRoot value="true" />
-        <module name="myBrain" />
        <module name="ImageMarket" />
+        <module name="myBrain" />
      </profile>
    </annotationProcessing>
  </component>
--- a/src/main/java/org/wlld/naturalLanguage/IOConst.java
+++ b/src/main/java/org/wlld/naturalLanguage/IOConst.java
@ -0,0 +1,10 @@
+package org.wlld.naturalLanguage;
+
+/**
+ * Byte-level constants used when reading template files.
+ */
+public class IOConst {
+    /** The '#' key (0x23). */
+    public static final byte TYPE_Symbol = '#';
+    /** Line feed (10). */
+    public static final byte STOP_END = '\n';
+    /** Carriage return (13). */
+    public static final byte STOP_NEXT = '\r';
+    /** Windows system. */
+    public static final byte WIN = 1;
+    /** Non-Windows system. */
+    public static final byte NOT_WIN = 2;
+    /** Number of cores. */
+    public static final byte CORE_Number = 6;
+}
--- a/src/main/java/org/wlld/naturalLanguage/KeyWord.java
+++ b/src/main/java/org/wlld/naturalLanguage/KeyWord.java
@ -0,0 +1,22 @@
+package org.wlld.naturalLanguage;
+
+/**
+ * Pairs a {@link Word} with a flag telling whether that keyword is finished.
+ */
+public class KeyWord {
+    /** The keyword. */
+    private Word word;
+    /** Whether this keyword has been completed. */
+    private boolean isOk;
+
+    /** @return whether this keyword has been completed */
+    public boolean isOk() {
+        return this.isOk;
+    }
+
+    /** @param ok new value of the completion flag */
+    public void setOk(boolean ok) {
+        this.isOk = ok;
+    }
+
+    /** @return the keyword currently held, possibly {@code null} */
+    public Word getWord() {
+        return this.word;
+    }
+
+    /** @param word the keyword to hold */
+    public void setWord(Word word) {
+        this.word = word;
+    }
+}
--- a/src/main/java/org/wlld/naturalLanguage/Sentence.java
+++ b/src/main/java/org/wlld/naturalLanguage/Sentence.java
@ -0,0 +1,73 @@
+package org.wlld.naturalLanguage;
+
+import org.omg.Messaging.SYNC_WITH_TRANSPORT;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A sentence stored as a singly linked chain of {@link Word}s (via
+ * {@link Word#getSon()}) together with the flat list of those words and the
+ * segmentation result.
+ *
+ * @author lidapeng
+ * @date 8:01 AM 2020/2/23
+ */
+public class Sentence {
+    /** Head of the word chain; {@code null} until the first word is added. */
+    private Word firstWord;
+    /** The words, in the order they were added. */
+    private List<Word> waitWords = new ArrayList<>();
+    /** Segmentation result; the index follows the time sequence. */
+    private List<String> keyWords;
+    private int key;
+
+    public Sentence() {
+
+    }
+
+    /**
+     * @param key value to store as the key; negative values are ignored and
+     *            the key keeps its default of 0
+     */
+    public Sentence(int key) {
+        if (key >= 0) {
+            this.key = key;
+        }
+    }
+
+    public int getKey() {
+        return key;
+    }
+
+    public void setKey(int key) {
+        this.key = key;
+    }
+
+    public Word getFirstWord() {
+        return firstWord;
+    }
+
+    public List<Word> getWaitWords() {
+        return waitWords;
+    }
+
+    public List<String> getKeyWords() {
+        return keyWords;
+    }
+
+    public void setKeyWords(List<String> keyWords) {
+        this.keyWords = keyWords;
+    }
+
+    /**
+     * Creates a {@link Word} for the given text, links it to the end of the
+     * chain and records it in the word list.
+     *
+     * @param word text of the new word
+     */
+    public void setWord(String word) {
+        Word created = new Word();
+        created.setWord(word);
+        lineWord(firstWord, created);// link the words together
+        waitWords.add(created);
+    }
+
+    /**
+     * Links {@code wordSon} behind the last word reachable from {@code word}.
+     * When the sentence is still empty, {@code wordSon} becomes the head with
+     * level 1; otherwise it gets the level of the current tail plus one.
+     */
+    private void lineWord(Word word, Word wordSon) {
+        if (firstWord == null) {
+            firstWord = wordSon;
+            firstWord.setLv(1);
+            return;
+        }
+        // walk to the word whose right link is still empty
+        Word tail = word;
+        while (tail.getSon() != null) {
+            tail = tail.getSon();
+        }
+        wordSon.setLv(tail.getLv() + 1);
+        tail.setSon(wordSon);
+    }
+}
--- a/src/main/java/org/wlld/naturalLanguage/Talk.java
+++ b/src/main/java/org/wlld/naturalLanguage/Talk.java
@ -0,0 +1,77 @@
+package org.wlld.naturalLanguage;
+
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Segments an input sentence using the word frequencies already collected in
+ * the shared {@link WordTemple}.
+ *
+ * @author lidapeng
+ * @description sentence classification
+ * @date 4:14 PM 2020/2/23
+ */
+public class Talk {
+    private List<WorldBody> allWorld = WordTemple.get().getAllWorld();// all known words
+
+    /**
+     * Splits {@code sentence} into clauses on ASCII or full-width commas,
+     * segments every clause and prints the resulting keyword list of each
+     * clause to standard output.
+     *
+     * @param sentence the text to segment
+     */
+    public void talk(String sentence) {
+        // Either comma style is a clause boundary; input without a comma yields one clause.
+        String[] sens = sentence.split("[,，]");
+        // break every clause into its prefixes
+        List<Sentence> sentences = new ArrayList<>();
+        for (String clause : sens) {
+            Sentence sentenceWords = new Sentence();
+            // Use the clause itself; passing the whole sentence here would
+            // produce the same segmentation once per clause.
+            catchSentence(clause, sentenceWords);
+            sentences.add(sentenceWords);
+        }
+        restructure(sentences);
+        for (Sentence sentence1 : sentences) {
+            System.out.println(sentence1.getKeyWords());
+        }
+    }
+
+    /**
+     * Adds every prefix of {@code sentence} (length 1, 2, ... full length) to
+     * {@code sentenceWords} as a word.
+     */
+    private void catchSentence(String sentence, Sentence sentenceWords) {
+        int len = sentence.length();
+        for (int i = 0; i < len; i++) {
+            String word = sentence.substring(0, i + 1);
+            sentenceWords.setWord(word);
+        }
+
+    }
+
+    /**
+     * Copies the known frequency onto every word of every sentence, then lets
+     * a {@link Tokenizer} compute the keywords of each sentence.
+     */
+    private void restructure(List<Sentence> sentences) {
+        for (Sentence words : sentences) {
+            List<WorldBody> listWord = allWorld;
+            for (Word word : words.getWaitWords()) {
+                WorldBody body = getBody(word.getWord(), listWord);
+                if (body == null) {
+                    // Unknown prefix: the lookup descends one nested level per
+                    // prefix, so no longer prefix can be found either. The
+                    // remaining words keep their default frequency of 0.
+                    // NOTE(review): confirm Tokenizer.radiation tolerates zero frequencies.
+                    break;
+                }
+                listWord = body.getWorldBodies();
+                word.setWordFrequency(body.getWordFrequency());
+            }
+        }
+        Tokenizer tokenizer = new Tokenizer();
+        for (Sentence words : sentences) {
+            tokenizer.radiation(words);
+        }
+    }
+
+    /**
+     * @return the first entry of {@code worlds} whose name equals
+     *         {@code word}, or {@code null} when there is none
+     */
+    private WorldBody getBody(String word, List<WorldBody> worlds) {
+        for (WorldBody body : worlds) {
+            if (body.getWordName().equals(word)) {
+                return body;
+            }
+        }
+        return null;
+    }
+}
--- a/src/main/java/org/wlld/naturalLanguage/TemplateReader.java
+++ b/src/main/java/org/wlld/naturalLanguage/TemplateReader.java
@ -0,0 +1,81 @@
+package org.wlld.naturalLanguage;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.*;
+
+/**
+ * Reads a training template file byte by byte and groups its lines by the
+ * number that follows the '#' separator on each line, then hands the result
+ * to a {@link Tokenizer}.
+ */
+public class TemplateReader {
+    private Map<Integer, List<String>> model = new HashMap<>();// training template: number after '#' -> lines
+    private String charsetName;
+
+    /**
+     * Parses the file and starts tokenizer training on its content.
+     * <p>
+     * On each line, bytes before {@link IOConst#TYPE_Symbol} are the text and
+     * the decimal digits after it form the key the line is stored under
+     * (0 when there is none). A line ends at {@link IOConst#STOP_END} or
+     * {@link IOConst#STOP_NEXT}.
+     *
+     * @param url         path of the template file
+     * @param charsetName charset used to decode the collected line bytes
+     * @param sys         {@link IOConst#WIN} to treat two consecutive
+     *                    terminator bytes as a single line end; any other
+     *                    value ends a line on every terminator byte
+     * @throws Exception if the file cannot be read or decoded
+     */
+    public void read(String url, String charsetName, byte sys) throws Exception {
+        this.charsetName = charsetName;
+        LinkedList<Byte> span = new LinkedList<>();
+        int again = 0;
+        int upNub = 0;
+        boolean isSymbol = false;// whether the separator has been seen on this line
+        // try-with-resources closes the stream even when reading fails;
+        // buffering avoids one file-system read per byte.
+        try (InputStream is = new BufferedInputStream(new FileInputStream(new File(url)))) {
+            int i;
+            while ((i = is.read()) > -1) {
+                if (i == IOConst.TYPE_Symbol) {// separator found
+                    isSymbol = true;
+                } else if (i == IOConst.STOP_END || i == IOConst.STOP_NEXT) {
+                    isSymbol = false;
+                    again = again << 1 | 1;
+                    if (again == 1) {// first terminator byte of this line end
+                        storeLine(upNub, span);
+                        upNub = 0;
+                        if (sys != IOConst.WIN) {
+                            again = 0;
+                        }
+                    } else {// second terminator byte of the same line end
+                        again = 0;
+                    }
+                } else if (isSymbol) {
+                    // only ASCII digits ('0'..'9') after the separator contribute to the key
+                    if (i >= 48 && i <= 57) {
+                        upNub = upNub * 10 + (i - 48);
+                    }
+                } else {
+                    span.add((byte) i);
+                }
+            }
+        }
+        if (!span.isEmpty()) {
+            // the file ended without a line terminator: keep the last line
+            storeLine(upNub, span);
+        }
+        word();
+    }
+
+    /** Decodes the pending bytes and appends the text to the list stored under {@code key}. */
+    private void storeLine(int key, LinkedList<Byte> span) throws UnsupportedEncodingException {
+        List<String> lines = model.get(key);
+        if (lines == null) {
+            lines = new ArrayList<>();
+            model.put(key, lines);
+        }
+        lines.add(LinkToString(span));
+    }
+
+    /** Feeds everything read so far to a new {@link Tokenizer}. */
+    public void word() {
+        Tokenizer tokenizer = new Tokenizer();
+        tokenizer.start(model);
+    }
+
+    /**
+     * Drains {@code mod} and decodes its bytes with the charset given to
+     * {@link #read}.
+     *
+     * @param mod pending bytes; empty after the call
+     * @return the decoded text
+     * @throws UnsupportedEncodingException if the charset name is not supported
+     */
+    public String LinkToString(LinkedList<Byte> mod) throws UnsupportedEncodingException {
+        byte[] be = new byte[mod.size()];
+        for (int i = 0; i < be.length; i++) {
+            be[i] = mod.poll();
+        }
+        return new String(be, charsetName);
+    }
+}
--- a/src/main/java/org/wlld/naturalLanguage/Tokenizer.java
+++ b/src/main/java/org/wlld/naturalLanguage/Tokenizer.java
@ -0,0 +1,204 @@
+package org.wlld.naturalLanguage;
+
+import org.wlld.tools.ArithUtil;
+import org.wlld.tools.Frequency;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Word segmenter. Training ({@link #start}) records every prefix of every
+ * template sentence in the nested {@link WorldBody} lists shared through
+ * {@link WordTemple}; segmentation ({@link #radiation}) then cuts a sentence
+ * where the frequencies of consecutive prefixes stop passing the threshold
+ * checks in {@code keyWord}.
+ *
+ * @author lidapeng
+ * @description tokenizer
+ * @date 7:42 AM 2020/2/23
+ */
+public class Tokenizer extends Frequency {
+    private List<Sentence> sentences = WordTemple.get().getSentences();// all segmented sentences (list owned by WordTemple)
+    private List<WorldBody> allWorld = WordTemple.get().getAllWorld();// all words (list owned by WordTemple)
+    private Word nowWord;// the word at which the previous segment ended
+
+    /**
+     * Trains on the given template and then segments every stored sentence.
+     *
+     * @param model category -> sentences of that category; entries under key 0 are skipped
+     */
+    public void start(Map<Integer, List<String>> model) {
+        // the key of model is the category, the value is the list of sentences of that category
+        for (Map.Entry<Integer, List<String>> mod : model.entrySet()) {
+            if (mod.getKey() != 0) {
+                List<String> st = mod.getValue();// sentences
+                int key = mod.getKey();// category
+                for (String sentence : st) {// every sentence of every category
+                    Sentence sentenceWords = new Sentence(key);
+                    catchSentence(sentence, sentenceWords);
+                    Word word = sentenceWords.getFirstWord();
+                    if (word != null) {
+                        worldMuch(word, allWorld, key);// build the nesting inside the sentence and add to the frequencies
+                    }
+                }
+            }
+        }
+        restructure();// copy the frequencies onto the words of the stored sentences
+        // segmentation is finished here; next step would be numbering the words
+        //test();
+    }
+
+    /** Segmentation test helper: prints the keywords of every stored sentence. */
+    private void test() {
+        for (Sentence sentence : sentences) {
+            System.out.println(sentence.getKeyWords());
+        }
+    }
+
+    /**
+     * Copies the recorded frequency onto every word of every stored sentence,
+     * then segments each sentence.
+     */
+    private void restructure() {
+        for (Sentence words : sentences) {
+            List<WorldBody> listWord = allWorld;
+            List<Word> waitWorld = words.getWaitWords();
+            for (Word word : waitWorld) {
+                String myWord = word.getWord();
+                // NOTE(review): getBody returns null for a word that was never recorded,
+                // which would make the next line throw — confirm every stored sentence was trained.
+                WorldBody body = getBody(myWord, listWord);
+                listWord = body.getWorldBodies();// descend one nesting level for the next, longer prefix
+                word.setWordFrequency(body.getWordFrequency());
+            }
+        }
+        for (Sentence words : sentences) {
+            radiation(words);
+        }
+    }
+
+    /**
+     * Segments one sentence and stores the result with
+     * {@link Sentence#setKeyWords}. Word frequencies must already be set.
+     *
+     * @param sentenceWords the sentence to segment; a sentence without words gets an empty result
+     */
+    public void radiation(Sentence sentenceWords) {
+        // start by comparing a word with its own right and left neighbours
+        nowWord = null;
+        Word firstWord = sentenceWords.getFirstWord();
+        KeyWord word = new KeyWord();
+        word.setWord(firstWord);
+        word.setOk(false);
+        List<String> keyWords = new ArrayList<>();
+        while (word.getWord() != null) {
+            // NOTE(review): every segment is seeded with firstWord's frequency, not the
+            // frequency of the word the segment starts at — confirm this is intended.
+            word = keyWord(-1, word, new double[]{firstWord.getWordFrequency()});
+            Word myWord = word.getWord();
+            String wordT = myWord.getWord();// the text cut so far (a prefix of the sentence)
+            String keyWord;
+            if (nowWord == null) {// the first segment of this sentence has not been produced yet
+                // the segment is wordT itself
+                keyWord = wordT;
+            } else {// a segment was produced before: strip the previous prefix
+                keyWord = wordT.substring(nowWord.getWord().length());
+            }
+            keyWords.add(keyWord);
+            nowWord = myWord;
+            word.setOk(false);
+            word.setWord(word.getWord().getSon());// continue behind the segment; null ends the loop
+        }
+        sentenceWords.setKeyWords(keyWords);
+    }
+
+    /** Returns a copy of {@code diff} with the frequency of {@code word} appended. */
+    private double[] getDiff(double[] diff, Word word) {
+        double[] diffef = new double[diff.length + 1];
+        for (int i = 0; i < diffef.length; i++) {
+            if (i == diffef.length - 1) {
+                diffef[i] = word.getWordFrequency();
+            } else {
+                diffef[i] = diff[i];
+            }
+        }
+        return diffef;
+    }
+
+    /**
+     * Advances the cursor along the word chain until a segment boundary is found.
+     *
+     * @param dm    dispersion coefficient of the previous step, or -1 on the first step
+     * @param words cursor; on return its word is the last word of the segment and its flag is true
+     * @param diff  frequencies collected so far for the dispersion coefficient
+     * @return the same cursor object
+     */
+    private KeyWord keyWord(double dm, KeyWord words, double[] diff) {
+        double right = 0;
+        boolean bm = words.isOk();
+        if (!bm) {
+            Word word = words.getWord();
+            if (word.getSon() != null) {
+                double db = wordEnd(word, new ArrayList<>(), 0);// mean frequency drop from this word to the end of the chain
+                // the scaled drop to the son must not exceed that mean drop
+                // (ArithUtil.mul is defined outside this file; presumably a product)
+                boolean isAvgOk = (ArithUtil.mul(word.getWordFrequency() - word.getSon().getWordFrequency(), WordConst.Word_Noise)) <= db;
+                if (isAvgOk) {// mean check passed
+                    diff = getDiff(diff, word.getSon());
+                    right = dc(diff);// dispersion coefficient including the son
+                    if (dm > -1) {
+                        if (ArithUtil.mul(right, WordConst.Word_Noise) <= dm) {// keep exploring
+                            words.setOk(false);
+                            words.setWord(word.getSon());
+                            words = keyWord(right, words, diff);
+                        } else {// cut here, stop exploring
+                            words.setOk(true);// it is a keyword
+                        }
+                    } else {// first step: keep exploring
+                        words.setOk(false);
+                        words.setWord(word.getSon());
+                        words = keyWord(right, words, diff);
+                    }
+                } else {// cut here, stop exploring
+                    words.setOk(true);
+                }
+            } else {// end of the chain: cut here
+                words.setOk(true);
+            }
+        }
+        return words;
+    }
+
+    /**
+     * Walks from {@code word} to the end of the chain collecting the frequency
+     * drop between each word and its son, and returns the average of those drops.
+     *
+     * @param word start of the walk
+     * @param av   accumulator for the drops; pass an empty list
+     * @param a    carried through the recursion; overwritten with the average at the end
+     */
+    private double wordEnd(Word word, List<Integer> av, double a) {
+        // first collect the drops over the rest of the sentence
+        Word son = word.getSon();
+        if (son != null) {
+            av.add(word.getWordFrequency() - son.getWordFrequency());
+            a = wordEnd(son, av, a);
+        } else {// end reached: compute the average
+            double[] allNub = new double[av.size()];
+            for (int i = 0; i < av.size(); i++) {
+                allNub[i] = av.get(i);
+            }
+            a = average(allNub);// mean drop
+        }
+        return a;
+    }
+
+    /** Returns the first entry of {@code worlds} whose name equals {@code word}, or null when there is none. */
+    private WorldBody getBody(String word, List<WorldBody> worlds) {
+        WorldBody myBody = null;
+        for (WorldBody body : worlds) {
+            if (body.getWordName().hashCode() == word.hashCode() && body.getWordName().equals(word)) {
+                myBody = body;
+                break;
+            }
+        }
+        return myBody;
+    }
+
+    /**
+     * Adds every prefix of {@code sentence} to {@code sentenceWords} as a word
+     * and registers the sentence in the shared sentence list.
+     */
+    private void catchSentence(String sentence, Sentence sentenceWords) {
+        int len = sentence.length();
+        for (int i = 0; i < len; i++) {
+            String word = sentence.substring(0, i + 1);
+            sentenceWords.setWord(word);
+        }
+        sentences.add(sentenceWords);
+    }
+
+    /**
+     * Counts {@code word} in {@code worldBodies}: an existing entry is
+     * incremented and the walk continues with the son one nesting level
+     * deeper; a missing entry is created together with the rest of the chain.
+     */
+    private void worldMuch(Word word, List<WorldBody> worldBodies, int type) {
+        boolean bm = false;
+        String check = word.getWord();
+        for (WorldBody myWorld : worldBodies) {
+            String waitCheck = myWorld.getWordName();
+            if (waitCheck.hashCode() == check.hashCode() && waitCheck.equals(check)) {
+                bm = true;
+                myWorld.addNub(type);
+                if (word.getSon() != null) {// not at the last level yet
+                    worldMuch(word.getSon(), myWorld.getWorldBodies(), type);
+                }
+                break;
+            }
+        }
+        if (!bm) {// not found at this level
+            saveList(word, worldBodies, type);
+        }
+    }
+
+    /**
+     * Stores {@code word} as a new entry of {@code myWorld} with one count,
+     * then stores the rest of its chain in the new entry's own nested list.
+     */
+    private void saveList(Word word, List<WorldBody> myWorld, int type) {
+        WorldBody body = new WorldBody();
+        List<WorldBody> list = new ArrayList<>();
+        body.setWordName(word.getWord());
+        body.addNub(type);
+        body.setWorldBodies(list);
+        body.setWord(word);
+        myWorld.add(body);
+        if (word.getSon() != null) {
+            saveList(word.getSon(), list, type);
+        }
+    }
+}
--- a/src/main/java/org/wlld/naturalLanguage/Word.java
+++ b/src/main/java/org/wlld/naturalLanguage/Word.java
@ -0,0 +1,41 @@
+package org.wlld.naturalLanguage;
+
+
+/**
+ * One node of a sentence's word chain: its text, the next node, a frequency
+ * and its position in the chain.
+ */
+public class Word {
+    private String word;
+    /** Next word in the chain, or {@code null} at the end. */
+    private Word son;
+    /** Word frequency. */
+    private int wordFrequency;
+    /** Position of this word in the time sequence. */
+    private int lv;
+
+    /** @return the text of this word */
+    public String getWord() {
+        return this.word;
+    }
+
+    /** @param word the text of this word */
+    public void setWord(String word) {
+        this.word = word;
+    }
+
+    /** @return the next word in the chain, or {@code null} */
+    public Word getSon() {
+        return this.son;
+    }
+
+    /** @param son the next word in the chain */
+    public void setSon(Word son) {
+        this.son = son;
+    }
+
+    /** @return the word frequency */
+    public int getWordFrequency() {
+        return this.wordFrequency;
+    }
+
+    /** @param wordFrequency the word frequency */
+    public void setWordFrequency(int wordFrequency) {
+        this.wordFrequency = wordFrequency;
+    }
+
+    /** @return the position in the time sequence */
+    public int getLv() {
+        return this.lv;
+    }
+
+    /** @param lv the position in the time sequence */
+    public void setLv(int lv) {
+        this.lv = lv;
+    }
+}
--- a/src/main/java/org/wlld/naturalLanguage/WordConst.java
+++ b/src/main/java/org/wlld/naturalLanguage/WordConst.java
@ -0,0 +1,5 @@
+package org.wlld.naturalLanguage;
+
+/**
+ * Tunable constants of the word segmenter.
+ */
+public class WordConst {
+    // Degree of contraction. Not final, so it can be reassigned at runtime.
+    public static double Word_Noise = 0.7;
+}
--- a/src/main/java/org/wlld/naturalLanguage/WordTemple.java
+++ b/src/main/java/org/wlld/naturalLanguage/WordTemple.java
@ -0,0 +1,38 @@
+package org.wlld.naturalLanguage;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Process-wide holder of the segmentation data: the stored sentences and the
+ * list of all words. Obtain the single instance through {@link #get()}.
+ *
+ * @author lidapeng
+ * @description word segmentation template
+ * @date 4:15 PM 2020/2/23
+ */
+public class WordTemple {
+    /** The only instance, created when the class is initialized. */
+    private static final WordTemple INSTANCE = new WordTemple();
+    /** All segmented sentences. */
+    private List<Sentence> sentences = new ArrayList<>();
+    /** All words. */
+    private List<WorldBody> allWorld = new ArrayList<>();
+
+    private WordTemple() {
+    }
+
+    /** @return the shared instance */
+    public static WordTemple get() {
+        return INSTANCE;
+    }
+
+    /** @return the live list of all words (not a copy) */
+    public List<WorldBody> getAllWorld() {
+        return this.allWorld;
+    }
+
+    /** @param allWorld list that replaces the current word list */
+    public void setAllWorld(List<WorldBody> allWorld) {
+        this.allWorld = allWorld;
+    }
+
+    /** @return the live list of stored sentences (not a copy) */
+    public List<Sentence> getSentences() {
+        return this.sentences;
+    }
+
+    /** @param sentences list that replaces the current sentence list */
+    public void setSentences(List<Sentence> sentences) {
+        this.sentences = sentences;
+    }
+}
--- a/src/main/java/org/wlld/naturalLanguage/WorldBody.java
+++ b/src/main/java/org/wlld/naturalLanguage/WorldBody.java
@ -0,0 +1,46 @@
+package org.wlld.naturalLanguage;
+
+import java.util.List;
+
+/**
+ * One recorded word together with its frequency, the category it was seen
+ * under and the nested list of words recorded behind it.
+ */
+public class WorldBody {
+    private String wordName;// the word
+    private int wordFrequency;// word frequency
+    private List<WorldBody> worldBodies;// nested list of following words
+    private Word word;
+    private int type = 0;// category; 0 means none recorded yet
+
+    public String getWordName() {
+        return wordName;
+    }
+
+    public void setWordName(String wordName) {
+        this.wordName = wordName;
+    }
+
+    public int getWordFrequency() {
+        return wordFrequency;
+    }
+
+    /**
+     * Counts one more occurrence of this word.
+     *
+     * @param type category the occurrence was seen under; remembered only
+     *             when no category has been recorded yet
+     */
+    public void addNub(int type) {
+        // The previous guard (this.type != 0) could never be true because the
+        // field starts at 0, so the category was never stored.
+        // NOTE(review): "first category wins" is assumed here — confirm the intended rule.
+        if (this.type == 0) {
+            this.type = type;
+        }
+        wordFrequency++;
+    }
+
+    public List<WorldBody> getWorldBodies() {
+        return worldBodies;
+    }
+
+    public void setWorldBodies(List<WorldBody> worldBodies) {
+        this.worldBodies = worldBodies;
+    }
+
+    public Word getWord() {
+        return word;
+    }
+
+    public void setWord(Word word) {
+        this.word = word;
+    }
+}
--- a/src/main/java/org/wlld/randomForest/RandomForest.java
+++ b/src/main/java/org/wlld/randomForest/RandomForest.java
@ -79,14 +79,14 @@ public class RandomForest {
        }
    }

-    public void study() throws Exception {
+    public void study() throws Exception {//学习
        for (int i = 0; i < forest.length; i++) {
            Tree tree = forest[i];
            tree.study();
        }
    }

-    public void insert(Object object) {
+    public void insert(Object object) {//添加学习参数
        for (int i = 0; i < forest.length; i++) {
            Tree tree = forest[i];
            tree.getDataTable().insert(object);
--- a/src/main/java/org/wlld/tools/Frequency.java
+++ b/src/main/java/org/wlld/tools/Frequency.java
@ -0,0 +1,53 @@
+package org.wlld.tools;
+
+/**
+ * Basic descriptive statistics. Additions, subtractions and divisions go
+ * through {@code ArithUtil}.
+ */
+public abstract class Frequency {
+
+    /** @return the arithmetic mean of {@code m} */
+    public double average(double... m) {
+        double total = 0;
+        for (double value : m) {
+            total = total + value;
+        }
+        return ArithUtil.div(total, m.length);
+    }
+
+    /** @return the Euclidean distance between the 2D points (x, y) and (i, j) */
+    public double getPointLength(double x, double y, double i, double j) {
+        double dxSquared = Math.pow(ArithUtil.sub(x, i), 2);
+        double dySquared = Math.pow(ArithUtil.sub(y, j), 2);
+        return Math.sqrt(ArithUtil.add(dxSquared, dySquared));
+    }
+
+    /** @return the population variance of {@code m} */
+    public double variance(double... m) {
+        double mean = average(m);
+        return ArithUtil.div(squaredDeviations(m, mean), m.length);
+    }
+
+    /** @return the standard deviation of {@code m} */
+    public double sd(double... m) {
+        return Math.sqrt(variance(m));
+    }
+
+    /** @return the dispersion coefficient of {@code m}: standard deviation divided by the mean */
+    public double dc(double... m) {
+        double mean = average(m);
+        double deviation = Math.sqrt(ArithUtil.div(squaredDeviations(m, mean), m.length));
+        return ArithUtil.div(deviation, mean);
+    }
+
+    /**
+     * @param t index of the element of interest
+     * @param m the values
+     * @return exp(m[t]) divided by the sum of exp over all values
+     */
+    public double softMax(int t, double... m) {
+        double numerator = Math.exp(m[t]);
+        double denominator = 0.0;
+        for (double value : m) {
+            denominator = denominator + Math.exp(value);
+        }
+        return ArithUtil.div(numerator, denominator);
+    }
+
+    /** Sums the squared distance of every value from {@code mean}. */
+    private double squaredDeviations(double[] m, double mean) {
+        double total = 0;
+        for (double value : m) {
+            total = total + Math.pow(value - mean, 2);
+        }
+        return total;
+    }
+}
--- a/src/test/java/org/wlld/LangTest.java
+++ b/src/test/java/org/wlld/LangTest.java
@ -0,0 +1,23 @@
+package org.wlld;
+
+import org.wlld.naturalLanguage.IOConst;
+import org.wlld.naturalLanguage.Talk;
+import org.wlld.naturalLanguage.TemplateReader;
+
+/**
+ * Manual smoke test: trains the tokenizer from a template file and segments
+ * one sample sentence.
+ *
+ * @author lidapeng
+ * @date 2:07 PM 2020/2/23
+ */
+public class LangTest {
+    /** Template used when no path is supplied. */
+    private static final String DEFAULT_TEMPLATE = "/Users/lidapeng/Desktop/myDocment/a.txt";
+
+    /**
+     * @param args optional; {@code args[0]} overrides the template file path
+     */
+    public static void main(String[] args) throws Exception {
+        if (args.length > 0) {
+            test(args[0]);
+        } else {
+            test();
+        }
+    }
+
+    /** Runs the smoke test against the default template path. */
+    public static void test() throws Exception {
+        test(DEFAULT_TEMPLATE);
+    }
+
+    /**
+     * Runs the smoke test against the given template.
+     *
+     * @param templatePath path of the training template, read as UTF-8 with non-Windows line ends
+     */
+    public static void test(String templatePath) throws Exception {
+        TemplateReader templateReader = new TemplateReader();
+        templateReader.read(templatePath, "UTF-8", IOConst.NOT_WIN);
+        Talk talk = new Talk();
+        talk.talk("我要吃面包");
+    }
+}