增加分词器

pull/1/head
lidapeng 5 years ago
parent 8abe63b1fc
commit c94423ace7

@ -6,8 +6,8 @@
<sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" />
<module name="myBrain" />
<module name="ImageMarket" />
<module name="myBrain" />
</profile>
</annotationProcessing>
</component>

@ -0,0 +1,10 @@
package org.wlld.naturalLanguage;
/**
 * Byte-valued constants shared by the natural-language I/O code.
 */
public class IOConst {
    /** The '#' key (ASCII 0x23). */
    public static final byte TYPE_Symbol = (byte) '#';
    /** Line feed (ASCII 10). */
    public static final byte STOP_END = (byte) '\n';
    /** Carriage return (ASCII 13). */
    public static final byte STOP_NEXT = (byte) '\r';
    /** Marks a Windows system. */
    public static final byte WIN = (byte) 1;
    /** Marks a non-Windows system. */
    public static final byte NOT_WIN = (byte) 2;
    /** Number of cores. */
    public static final byte CORE_Number = (byte) 6;
}

@ -0,0 +1,22 @@
package org.wlld.naturalLanguage;
/**
 * Mutable holder that pairs a {@link Word} with a "finished" flag.
 */
public class KeyWord {
    /** Whether this keyword has been completed. */
    private boolean isOk;
    /** The keyword itself. */
    private Word word;

    /** @return whether this keyword has been completed */
    public boolean isOk() {
        return this.isOk;
    }

    /** @param ok new value of the completion flag */
    public void setOk(boolean ok) {
        this.isOk = ok;
    }

    /** @return the word currently held, possibly {@code null} */
    public Word getWord() {
        return this.word;
    }

    /** @param word the word to hold */
    public void setWord(Word word) {
        this.word = word;
    }
}

@ -0,0 +1,73 @@
package org.wlld.naturalLanguage;
import org.omg.Messaging.SYNC_WITH_TRANSPORT;
import java.util.ArrayList;
import java.util.List;
/**
 * One sentence: its words as a singly linked chain (via {@link Word#getSon()})
 * and, in the same order, as a list.
 *
 * @author lidapeng
 * @date 8:01 2020/2/23
 */
public class Sentence {
    /** Integer key of this sentence; stays 0 unless a non-negative key is supplied. */
    private int key;
    /** Head of the linked chain of words; {@code null} until the first word is added. */
    private Word firstWord;
    /** Every word added so far, in insertion order. */
    private List<Word> waitWords = new ArrayList<>();
    /** Segmentation result, ordered by time sequence; {@code null} until set. */
    private List<String> keyWords;

    public Sentence() {
    }

    /**
     * @param key key to store; negative values are ignored and the key stays 0
     */
    public Sentence(int key) {
        if (key >= 0) {
            this.key = key;
        }
    }

    public int getKey() {
        return key;
    }

    public void setKey(int key) {
        this.key = key;
    }

    public Word getFirstWord() {
        return firstWord;
    }

    public List<Word> getWaitWords() {
        return waitWords;
    }

    public List<String> getKeyWords() {
        return keyWords;
    }

    public void setKeyWords(List<String> keyWords) {
        this.keyWords = keyWords;
    }

    /**
     * Wraps {@code word} in a new {@link Word}, links it to the end of the chain
     * and records it in the word list.
     *
     * @param word text of the word to add
     */
    public void setWord(String word) {
        Word added = new Word();
        added.setWord(word);
        lineWord(firstWord, added);
        waitWords.add(added);
    }

    /**
     * Appends {@code wordSon} after the last node reachable from {@code word}.
     * When the chain is still empty, {@code wordSon} becomes the head with level 1;
     * otherwise its level is the previous tail's level plus one.
     */
    private void lineWord(Word word, Word wordSon) {
        if (firstWord == null) {
            firstWord = wordSon;
            firstWord.setLv(1);
            return;
        }
        // Walk to the node whose right link is still empty.
        Word tail = word;
        while (tail.getSon() != null) {
            tail = tail.getSon();
        }
        wordSon.setLv(tail.getLv() + 1);
        tail.setSon(wordSon);
    }
}

@ -0,0 +1,77 @@
package org.wlld.naturalLanguage;
import java.util.ArrayList;
import java.util.List;
/**
 * Splits an input string into clauses, segments every clause with the
 * {@link Tokenizer} and prints each clause's segmentation.
 *
 * @author lidapeng
 * @date 4:14 2020/2/23
 */
public class Talk {
    /** Clause separators: the ASCII comma and the full-width comma (U+FF0C). */
    private static final String CLAUSE_SEPARATORS = "[,\uFF0C]";

    private List<WorldBody> allWorld = WordTemple.get().getAllWorld();// all known words

    /**
     * Segments {@code sentence} clause by clause and prints the key words of each
     * clause to standard output.
     *
     * @param sentence text to segment; {@code null} or empty input does nothing
     */
    public void talk(String sentence) {
        if (sentence == null || sentence.isEmpty()) {
            return;
        }
        String[] sens = sentence.split(CLAUSE_SEPARATORS);
        // Break every clause into its prefix words.
        List<Sentence> sentences = new ArrayList<>();
        for (String clause : sens) {
            if (clause.isEmpty()) {// e.g. two separators in a row
                continue;
            }
            Sentence sentenceWords = new Sentence();
            catchSentence(clause, sentenceWords);
            sentences.add(sentenceWords);
        }
        restructure(sentences);
        for (Sentence sentence1 : sentences) {
            System.out.println(sentence1.getKeyWords());
        }
    }

    /**
     * Adds every prefix of {@code sentence} (length 1, 2, ... up to the whole
     * string) to {@code sentenceWords} as a word.
     */
    private void catchSentence(String sentence, Sentence sentenceWords) {
        int len = sentence.length();
        for (int end = 1; end <= len; end++) {
            sentenceWords.setWord(sentence.substring(0, end));
        }
    }

    /**
     * Copies the stored frequency of every prefix word into the sentences, then
     * lets a {@link Tokenizer} segment each sentence.
     */
    private void restructure(List<Sentence> sentences) {
        for (Sentence words : sentences) {
            List<WorldBody> listWord = allWorld;
            for (Word word : words.getWaitWords()) {
                WorldBody body = listWord == null ? null : getBody(word.getWord(), listWord);
                if (body == null) {
                    // Unknown prefix: there is no node to descend into, so this and
                    // all longer prefixes of the clause get frequency 0.
                    // NOTE(review): how Tokenizer.radiation behaves when every
                    // frequency is 0 depends on Frequency.dc with a zero mean - confirm.
                    word.setWordFrequency(0);
                    listWord = null;
                } else {
                    listWord = body.getWorldBodies();
                    word.setWordFrequency(body.getWordFrequency());
                }
            }
        }
        Tokenizer tokenizer = new Tokenizer();
        for (Sentence words : sentences) {
            tokenizer.radiation(words);
        }
    }

    /**
     * @return the first element of {@code worlds} whose name equals {@code word},
     *         or {@code null} when there is none
     */
    private WorldBody getBody(String word, List<WorldBody> worlds) {
        for (WorldBody body : worlds) {
            if (word.equals(body.getWordName())) {
                return body;
            }
        }
        return null;
    }
}

@ -0,0 +1,81 @@
package org.wlld.naturalLanguage;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.*;
/**
 * Reads a training template file byte by byte and hands the collected
 * sentences, grouped by category number, to the {@link Tokenizer}.
 *
 * <p>Within a line, the bytes before a '#' are the sentence text; decimal digits
 * after the '#' form the category number and any other byte after it is ignored.
 * A line without digits is stored under category 0.
 */
public class TemplateReader {
    private Map<Integer, List<String>> model = new HashMap<>();// training template: category -> sentences
    private String charsetName;

    /**
     * Parses the file at {@code url} into the model and then runs {@link #word()}.
     *
     * @param url         path of the template file
     * @param charsetName charset used to decode the sentence bytes
     * @param sys         {@link IOConst#WIN} to treat a pair of terminator bytes as
     *                    one line end; any other value ends a line on every terminator
     * @throws Exception if the file cannot be read or the charset is unsupported
     */
    public void read(String url, String charsetName, byte sys) throws Exception {
        this.charsetName = charsetName;
        LinkedList<Byte> span = new LinkedList<>();// bytes of the sentence being read
        int again = 0;// 1 after a terminator that ended a line in WIN mode
        int upNub = 0;// category number parsed so far on this line
        boolean isSymbol = false;// whether the separator has been seen on this line
        // try-with-resources closes the stream even when reading fails.
        try (InputStream is = new FileInputStream(new File(url))) {
            int i;
            while ((i = is.read()) > -1) {
                if (i == IOConst.TYPE_Symbol) {// separator reached
                    isSymbol = true;
                } else if (i == IOConst.STOP_END || i == IOConst.STOP_NEXT) {
                    isSymbol = false;
                    again = again << 1 | 1;
                    if (again == 1) {// first terminator: the line ends here
                        addLine(upNub, span);
                        upNub = 0;
                        if (sys != IOConst.WIN) {
                            again = 0;
                        }
                        // In WIN mode the flag stays set so the second byte of a
                        // CR LF pair is swallowed by the branch below.
                        // NOTE(review): ordinary bytes never clear the flag, so in
                        // WIN mode a file with single-byte line ends only ends a
                        // line on every other terminator - confirm this is intended.
                    } else {
                        again = 0;
                    }
                } else if (isSymbol) {
                    if (i >= '0' && i <= '9') {// accumulate the decimal category number
                        upNub = upNub * 10 + (i - '0');
                    }
                } else {
                    span.add((byte) i);
                }
            }
        }
        if (!span.isEmpty()) {// last line had no terminator; keep it
            addLine(upNub, span);
        }
        word();
    }

    /** Starts a {@link Tokenizer} on everything read so far. */
    public void word() {
        Tokenizer tokenizer = new Tokenizer();
        tokenizer.start(model);
    }

    /**
     * Decodes the bytes in {@code mod} with the charset given to
     * {@link #read(String, String, byte)} and empties the list while doing so.
     *
     * @param mod bytes to decode; drained by this call
     * @return the decoded text
     * @throws UnsupportedEncodingException if the charset name is not supported
     */
    public String LinkToString(LinkedList<Byte> mod) throws UnsupportedEncodingException {
        byte[] be = new byte[mod.size()];
        for (int i = 0; i < be.length; i++) {
            be[i] = mod.poll();
        }
        return new String(be, charsetName);
    }

    /** Decodes {@code span} (emptying it) and appends the text to the list of {@code category}. */
    private void addLine(int category, LinkedList<Byte> span) throws UnsupportedEncodingException {
        List<String> lines = model.get(category);
        if (lines == null) {
            lines = new ArrayList<>();
            model.put(category, lines);
        }
        lines.add(LinkToString(span));
    }
}

@ -0,0 +1,204 @@
package org.wlld.naturalLanguage;
import org.wlld.tools.ArithUtil;
import org.wlld.tools.Frequency;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Builds a tree of prefix frequencies from training sentences and uses those
 * frequencies to cut a sentence into key words.
 *
 * @author lidapeng
 * @date 7:42 2020/2/23
 */
public class Tokenizer extends Frequency {
    private List<Sentence> sentences = WordTemple.get().getSentences();// all sentences
    private List<WorldBody> allWorld = WordTemple.get().getAllWorld();// all words
    private Word nowWord;// the key word found most recently

    /**
     * Trains on {@code model} and segments every stored sentence.
     *
     * @param model key is the category, value is the sentences of that category;
     *              entries with key 0 are skipped
     */
    public void start(Map<Integer, List<String>> model) {
        for (Map.Entry<Integer, List<String>> mod : model.entrySet()) {
            if (mod.getKey() == 0) {
                continue;
            }
            int key = mod.getKey();// category
            for (String sentence : mod.getValue()) {// every sentence of this category
                Sentence sentenceWords = new Sentence(key);
                catchSentence(sentence, sentenceWords);
                Word head = sentenceWords.getFirstWord();
                if (head != null) {
                    // Build the hierarchy inside the sentence and count frequencies.
                    worldMuch(head, allWorld, key);
                }
            }
        }
        restructure();// frequency statistics; segmentation is finished after this
        //test();
    }

    /** Debug helper: prints the segmentation of every stored sentence. */
    private void test() {
        for (int i = 0; i < sentences.size(); i++) {
            System.out.println(sentences.get(i).getKeyWords());
        }
    }

    /** Copies the stored frequency into every word, then segments every sentence. */
    private void restructure() {
        for (Sentence words : sentences) {
            List<WorldBody> level = allWorld;
            for (Word word : words.getWaitWords()) {
                WorldBody body = getBody(word.getWord(), level);
                level = body.getWorldBodies();
                word.setWordFrequency(body.getWordFrequency());
            }
        }
        for (Sentence words : sentences) {
            radiation(words);
        }
    }

    /**
     * Segments one sentence and stores the pieces through
     * {@link Sentence#setKeyWords(List)}.
     *
     * <p>Each round starts at the word after the previous cut, lets
     * {@code keyWord} decide where to cut next, and takes as the piece the part
     * of the cut word's text that lies beyond the previous cut.
     *
     * @param sentenceWords sentence whose words already carry their frequencies
     */
    public void radiation(Sentence sentenceWords) {
        nowWord = null;
        Word firstWord = sentenceWords.getFirstWord();
        KeyWord word = new KeyWord();
        word.setWord(firstWord);
        word.setOk(false);
        List<String> keyWords = new ArrayList<>();
        while (word.getWord() != null) {
            // NOTE(review): every round seeds the series with the frequency of the
            // sentence's first word, not of the round's starting word - confirm.
            word = keyWord(-1, word, new double[]{firstWord.getWordFrequency()});
            Word myWord = word.getWord();
            String wordT = myWord.getWord();// text up to the current cut
            // Before the first cut the piece is the whole text; afterwards it is
            // what follows the previous cut.
            String piece = nowWord == null ? wordT : wordT.substring(nowWord.getWord().length());
            keyWords.add(piece);
            nowWord = myWord;
            word.setOk(false);
            word.setWord(myWord.getSon());
        }
        sentenceWords.setKeyWords(keyWords);
    }

    /** @return a copy of {@code diff} with the frequency of {@code word} appended */
    private double[] getDiff(double[] diff, Word word) {
        double[] extended = new double[diff.length + 1];
        System.arraycopy(diff, 0, extended, 0, diff.length);
        extended[diff.length] = word.getWordFrequency();
        return extended;
    }

    /**
     * Moves {@code words} forward along the chain until a cut point is reached and
     * marks it as finished.
     *
     * <p>The holder advances to the next word only while all of these hold:
     * the word has a successor; the frequency drop to the successor, scaled by
     * {@link WordConst#Word_Noise}, does not exceed the average drop from here to
     * the end of the chain; and, unless {@code dm} is not greater than -1, the
     * scaled dispersion coefficient of the frequency series does not exceed the
     * previous one.
     *
     * @param dm    dispersion coefficient of the previous step; a value not
     *              greater than -1 skips that comparison
     * @param words holder positioned at the word to examine; returned unchanged
     *              when it is already marked finished
     * @param diff  frequencies collected so far
     * @return {@code words}
     */
    private KeyWord keyWord(double dm, KeyWord words, double[] diff) {
        if (words.isOk()) {
            return words;
        }
        double previous = dm;
        double[] series = diff;
        while (true) {
            Word word = words.getWord();
            Word son = word.getSon();
            if (son == null) {// end of the chain: cut here
                break;
            }
            double db = wordEnd(word, new ArrayList<>(), 0);// average drop ahead
            double drop = ArithUtil.mul(word.getWordFrequency() - son.getWordFrequency(), WordConst.Word_Noise);
            if (!(drop <= db)) {// average check failed: cut here
                break;
            }
            series = getDiff(series, son);
            double right = dc(series);
            if (previous > -1 && !(ArithUtil.mul(right, WordConst.Word_Noise) <= previous)) {
                break;// dispersion grew too much: cut here
            }
            // Keep exploring from the successor.
            words.setOk(false);
            words.setWord(son);
            previous = right;
        }
        words.setOk(true);// this is a key word
        return words;
    }

    /**
     * Appends to {@code av} the frequency drop between each word and its successor
     * from {@code word} to the end of the chain and returns the average of
     * {@code av}.
     *
     * @param a unused; the result is always recomputed
     */
    private double wordEnd(Word word, List<Integer> av, double a) {
        Word current = word;
        for (Word son = current.getSon(); son != null; son = current.getSon()) {
            av.add(current.getWordFrequency() - son.getWordFrequency());
            current = son;
        }
        double[] allNub = new double[av.size()];
        for (int i = 0; i < allNub.length; i++) {
            allNub[i] = av.get(i);
        }
        return average(allNub);// average drop
    }

    /** @return the first body named {@code word}, or {@code null} when none matches */
    private WorldBody getBody(String word, List<WorldBody> worlds) {
        for (WorldBody body : worlds) {
            String name = body.getWordName();
            if (name.hashCode() == word.hashCode() && name.equals(word)) {
                return body;
            }
        }
        return null;
    }

    /**
     * Adds every prefix of {@code sentence} to {@code sentenceWords} as a word and
     * registers the sentence in the shared sentence list.
     */
    private void catchSentence(String sentence, Sentence sentenceWords) {
        for (int end = 1; end <= sentence.length(); end++) {
            sentenceWords.setWord(sentence.substring(0, end));
        }
        sentences.add(sentenceWords);
    }

    /**
     * Counts {@code word} and its successors in the tree rooted at
     * {@code worldBodies}: an existing node is incremented and the successor is
     * handled in that node's children; a missing node and everything after it is
     * created by {@code saveList}.
     */
    private void worldMuch(Word word, List<WorldBody> worldBodies, int type) {
        String check = word.getWord();
        for (WorldBody myWorld : worldBodies) {
            String waitCheck = myWorld.getWordName();
            if (waitCheck.hashCode() == check.hashCode() && waitCheck.equals(check)) {
                myWorld.addNub(type);
                Word son = word.getSon();
                if (son != null) {// not yet at the last level
                    worldMuch(son, myWorld.getWorldBodies(), type);
                }
                return;
            }
        }
        saveList(word, worldBodies, type);// not found: store as new
    }

    /**
     * Creates a node for {@code word} in {@code myWorld} and, nested below it, one
     * node for each successor of {@code word}.
     */
    private void saveList(Word word, List<WorldBody> myWorld, int type) {
        List<WorldBody> target = myWorld;
        Word current = word;
        do {
            WorldBody body = new WorldBody();
            List<WorldBody> children = new ArrayList<>();
            body.setWordName(current.getWord());
            body.addNub(type);
            body.setWorldBodies(children);
            body.setWord(current);
            target.add(body);
            target = children;
            current = current.getSon();
        } while (current != null);
    }
}

@ -0,0 +1,41 @@
package org.wlld.naturalLanguage;
/**
 * Node of a singly linked chain of words.
 */
public class Word {
    /** Text of the word. */
    private String word;
    /** Next node in the chain, or {@code null} at the end. */
    private Word son;
    /** Time-sequence position of the word. */
    private int lv;
    /** Frequency of the word. */
    private int wordFrequency;

    public String getWord() {
        return this.word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public Word getSon() {
        return this.son;
    }

    public void setSon(Word son) {
        this.son = son;
    }

    public int getLv() {
        return this.lv;
    }

    public void setLv(int lv) {
        this.lv = lv;
    }

    public int getWordFrequency() {
        return this.wordFrequency;
    }

    public void setWordFrequency(int wordFrequency) {
        this.wordFrequency = wordFrequency;
    }
}

@ -0,0 +1,5 @@
package org.wlld.naturalLanguage;
/**
 * Tunable settings for word segmentation.
 */
public class WordConst {
    /** Degree of contraction; used as a multiplier in the segmentation checks. */
    public static double Word_Noise = 0.7D;
}

@ -0,0 +1,38 @@
package org.wlld.naturalLanguage;
import java.util.ArrayList;
import java.util.List;
/**
 * Single shared holder of the sentence list and the word tree.
 *
 * @author lidapeng
 * @date 4:15 2020/2/23
 */
public class WordTemple {
    /** The only instance, created when the class is initialized. */
    private static final WordTemple INSTANCE = new WordTemple();

    /** All words. */
    private List<WorldBody> allWorld = new ArrayList<>();
    /** All sentences. */
    private List<Sentence> sentences = new ArrayList<>();

    private WordTemple() {
    }

    /** @return the shared instance */
    public static WordTemple get() {
        return INSTANCE;
    }

    public List<WorldBody> getAllWorld() {
        return this.allWorld;
    }

    public void setAllWorld(List<WorldBody> allWorld) {
        this.allWorld = allWorld;
    }

    public List<Sentence> getSentences() {
        return this.sentences;
    }

    public void setSentences(List<Sentence> sentences) {
        this.sentences = sentences;
    }
}

@ -0,0 +1,46 @@
package org.wlld.naturalLanguage;
import java.util.List;
/**
 * Node of the word tree: a word, how often it was counted, and its child nodes.
 */
public class WorldBody {
    /** Text of the word. */
    private String wordName;
    /** The word object this node was created for. */
    private Word word;
    /** Number of times {@link #addNub(int)} was called. */
    private int wordFrequency;
    /** Child nodes radiating from this word. */
    private List<WorldBody> worldBodies;
    private int type = 0;

    public String getWordName() {
        return this.wordName;
    }

    public void setWordName(String wordName) {
        this.wordName = wordName;
    }

    public Word getWord() {
        return this.word;
    }

    public void setWord(Word word) {
        this.word = word;
    }

    public int getWordFrequency() {
        return this.wordFrequency;
    }

    public List<WorldBody> getWorldBodies() {
        return this.worldBodies;
    }

    public void setWorldBodies(List<WorldBody> worldBodies) {
        this.worldBodies = worldBodies;
    }

    /**
     * Counts one more occurrence of this word.
     *
     * @param type category of the occurrence
     */
    public void addNub(int type) {
        this.wordFrequency += 1;
        // NOTE(review): type starts at 0 and is only assigned when it is already
        // non-zero, so this assignment never runs and the field is never read.
        // The condition may be inverted - confirm the intent before relying on it.
        if (0 != this.type) {
            this.type = type;
        }
    }
}

@ -79,14 +79,14 @@ public class RandomForest {
}
}
public void study() throws Exception {
public void study() throws Exception {//学习
for (int i = 0; i < forest.length; i++) {
Tree tree = forest[i];
tree.study();
}
}
public void insert(Object object) {
public void insert(Object object) {//添加学习参数
for (int i = 0; i < forest.length; i++) {
Tree tree = forest[i];
tree.getDataTable().insert(object);

@ -0,0 +1,53 @@
package org.wlld.tools;
/**
 * Basic descriptive statistics over {@code double} values.
 */
public abstract class Frequency {

    /**
     * @param m values to average
     * @return the sum of {@code m} divided by its length via {@code ArithUtil.div}
     */
    public double average(double... m) {
        double allNub = 0;
        for (double value : m) {
            allNub = allNub + value;
        }
        return ArithUtil.div(allNub, m.length);
    }

    /**
     * @return the Euclidean distance between the 2-D points {@code (x, y)} and
     *         {@code (i, j)}
     */
    public double getPointLength(double x, double y, double i, double j) {
        double dx2 = Math.pow(ArithUtil.sub(x, i), 2);
        double dy2 = Math.pow(ArithUtil.sub(y, j), 2);
        return Math.sqrt(ArithUtil.add(dx2, dy2));
    }

    /**
     * @param m sample values
     * @return the population variance: mean squared deviation from the average
     */
    public double variance(double... m) {
        double ave = average(m);
        return ArithUtil.div(squaredDeviations(ave, m), m.length);
    }

    /**
     * @param m sample values
     * @return the standard deviation, i.e. the square root of {@link #variance}
     */
    public double sd(double... m) {
        return Math.sqrt(variance(m));
    }

    /**
     * @param m sample values
     * @return the dispersion coefficient: standard deviation divided by the average.
     *         NOTE(review): an average of 0 makes the divisor 0; what
     *         {@code ArithUtil.div} does then is not visible here - confirm.
     */
    public double dc(double... m) {
        double ave = average(m);
        double spread = Math.sqrt(ArithUtil.div(squaredDeviations(ave, m), m.length));
        return ArithUtil.div(spread, ave);
    }

    /**
     * Softmax of one element: {@code exp(m[t])} divided by the sum of
     * {@code exp(m[i])} over all elements.
     *
     * <p>The largest element is subtracted from every exponent first. This leaves
     * the quotient mathematically unchanged but keeps {@code Math.exp} from
     * overflowing to infinity for large inputs.
     *
     * @param t index of the element of interest
     * @param m input values
     * @return the softmax value of {@code m[t]}
     * @throws ArrayIndexOutOfBoundsException if {@code t} is not a valid index of {@code m}
     */
    public double softMax(int t, double... m) {
        double target = m[t];
        double max = Double.NEGATIVE_INFINITY;
        for (double value : m) {
            if (value > max) {
                max = value;
            }
        }
        double all = 0.0;
        for (double value : m) {
            all = all + Math.exp(value - max);
        }
        return ArithUtil.div(Math.exp(target - max), all);
    }

    /** Sum of the squared differences between each value of {@code m} and {@code ave}. */
    private double squaredDeviations(double ave, double[] m) {
        double allNub = 0;
        for (double value : m) {
            allNub = allNub + Math.pow(value - ave, 2);
        }
        return allNub;
    }
}

@ -0,0 +1,23 @@
package org.wlld;
import org.wlld.naturalLanguage.IOConst;
import org.wlld.naturalLanguage.Talk;
import org.wlld.naturalLanguage.TemplateReader;
/**
 * Manual smoke test: trains on a template file and segments one sentence.
 *
 * @author lidapeng
 * @date 2:07 2020/2/23
 */
public class LangTest {
    /** Template file used when no path is given. */
    private static final String DEFAULT_TEMPLATE = "/Users/lidapeng/Desktop/myDocment/a.txt";
    /** Sentence segmented when none is given. */
    private static final String DEFAULT_SENTENCE = "我要吃面包";

    /**
     * @param args optional: {@code args[0]} is the template file path and
     *             {@code args[1]} the sentence to segment; missing values fall
     *             back to the built-in defaults
     */
    public static void main(String[] args) throws Exception {
        String templatePath = args != null && args.length > 0 ? args[0] : DEFAULT_TEMPLATE;
        String sentence = args != null && args.length > 1 ? args[1] : DEFAULT_SENTENCE;
        test(templatePath, sentence);
    }

    /** Runs the smoke test with the built-in template path and sentence. */
    public static void test() throws Exception {
        test(DEFAULT_TEMPLATE, DEFAULT_SENTENCE);
    }

    /**
     * Reads the template as UTF-8 in non-Windows mode, then segments
     * {@code sentence} and lets {@link Talk} print the result.
     *
     * @param templatePath path of the training template file
     * @param sentence     sentence to segment
     */
    public static void test(String templatePath, String sentence) throws Exception {
        TemplateReader templateReader = new TemplateReader();
        templateReader.read(templatePath, "UTF-8", IOConst.NOT_WIN);
        Talk talk = new Talk();
        talk.talk(sentence);
    }
}
Loading…
Cancel
Save