注释格式化;
This commit is contained in:
parent
5ab517079b
commit
56f23a9027
|
@ -34,14 +34,18 @@ import java.util.Arrays;
|
||||||
*/
|
*/
|
||||||
class LetterSegmenter implements ISegmenter {
|
class LetterSegmenter implements ISegmenter {
|
||||||
|
|
||||||
//子分词器标签
|
/**
|
||||||
|
* 子分词器标签
|
||||||
|
*/
|
||||||
private static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
|
private static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
|
||||||
//链接符号
|
/**
|
||||||
|
* 链接符号
|
||||||
|
*/
|
||||||
private static final char[] Letter_Connector = new char[]{'#', '&', '+', '-', '.', '@', '_'};
|
private static final char[] Letter_Connector = new char[]{'#', '&', '+', '-', '.', '@', '_'};
|
||||||
|
/**
|
||||||
//数字符号
|
* 数字符号
|
||||||
|
*/
|
||||||
private static final char[] Num_Connector = new char[]{',', '.'};
|
private static final char[] Num_Connector = new char[]{',', '.'};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 词元的开始位置,
|
* 词元的开始位置,
|
||||||
* 同时作为子分词器状态标识
|
* 同时作为子分词器状态标识
|
||||||
|
@ -53,22 +57,18 @@ class LetterSegmenter implements ISegmenter {
|
||||||
* end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
|
* end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
|
||||||
*/
|
*/
|
||||||
private int end;
|
private int end;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 字母起始位置
|
* 字母起始位置
|
||||||
*/
|
*/
|
||||||
private int englishStart;
|
private int englishStart;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 字母结束位置
|
* 字母结束位置
|
||||||
*/
|
*/
|
||||||
private int englishEnd;
|
private int englishEnd;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 阿拉伯数字起始位置
|
* 阿拉伯数字起始位置
|
||||||
*/
|
*/
|
||||||
private int arabicStart;
|
private int arabicStart;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 阿拉伯数字结束位置
|
* 阿拉伯数字结束位置
|
||||||
*/
|
*/
|
||||||
|
@ -91,18 +91,18 @@ class LetterSegmenter implements ISegmenter {
|
||||||
*/
|
*/
|
||||||
// Runs the English-letter, Arabic-digit and mixed passes on the context, then locks or unlocks its buffer under SEGMENTER_NAME.
// Each pass is the left operand of `||`, so it is always invoked regardless of the flag accumulated so far.
public void analyze(AnalyzeContext context) {
|
public void analyze(AnalyzeContext context) {
|
||||||
boolean bufferLockFlag;
|
boolean bufferLockFlag;
|
||||||
//Handle English letters
|
// Handle English letters
|
||||||
bufferLockFlag = this.processEnglishLetter(context);
|
bufferLockFlag = this.processEnglishLetter(context);
|
||||||
//Handle Arabic digits (the original comment said "Arabic letters")
|
// Handle Arabic digits (the original comment said "Arabic letters")
|
||||||
bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
|
bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
|
||||||
//Handle mixed letters (this must run last; duplicates can be excluded via QuickSortSet)
|
// Handle mixed letters (this must run last; duplicates can be excluded via QuickSortSet)
|
||||||
bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
|
bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
|
||||||
|
|
||||||
//Decide whether to lock the buffer
|
// Decide whether to lock the buffer
|
||||||
if (bufferLockFlag) {
|
if (bufferLockFlag) {
|
||||||
context.lockBuffer(SEGMENTER_NAME);
|
context.lockBuffer(SEGMENTER_NAME);
|
||||||
} else {
|
} else {
|
||||||
//Unlock the buffer
|
// Unlock the buffer
|
||||||
context.unlockBuffer(SEGMENTER_NAME);
|
context.unlockBuffer(SEGMENTER_NAME);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -128,26 +128,26 @@ class LetterSegmenter implements ISegmenter {
|
||||||
private boolean processMixLetter(AnalyzeContext context) {
|
private boolean processMixLetter(AnalyzeContext context) {
|
||||||
boolean needLock;
|
boolean needLock;
|
||||||
|
|
||||||
if (this.start == -1) {//当前的分词器尚未开始处理字符
|
if (this.start == -1) {// 当前的分词器尚未开始处理字符
|
||||||
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|
||||||
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
||||||
//记录起始指针的位置,标明分词器进入处理状态
|
// 记录起始指针的位置,标明分词器进入处理状态
|
||||||
this.start = context.getCursor();
|
this.start = context.getCursor();
|
||||||
this.end = start;
|
this.end = start;
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {//当前的分词器正在处理字符
|
} else {// 当前的分词器正在处理字符
|
||||||
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|
||||||
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
||||||
//记录下可能的结束位置
|
// 记录下可能的结束位置
|
||||||
this.end = context.getCursor();
|
this.end = context.getCursor();
|
||||||
|
|
||||||
} else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
|
} else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
|
||||||
&& this.isLetterConnector(context.getCurrentChar())) {
|
&& this.isLetterConnector(context.getCurrentChar())) {
|
||||||
//记录下可能的结束位置
|
// 记录下可能的结束位置
|
||||||
this.end = context.getCursor();
|
this.end = context.getCursor();
|
||||||
} else {
|
} else {
|
||||||
//遇到非Letter字符,输出词元
|
// 遇到非Letter字符,输出词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.start = -1;
|
this.start = -1;
|
||||||
|
@ -155,10 +155,10 @@ class LetterSegmenter implements ISegmenter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//判断缓冲区是否已经读完
|
// 判断缓冲区是否已经读完
|
||||||
if (context.isBufferConsumed()) {
|
if (context.isBufferConsumed()) {
|
||||||
if (this.start != -1 && this.end != -1) {
|
if (this.start != -1 && this.end != -1) {
|
||||||
//缓冲已读完,输出词元
|
// 缓冲已读完,输出词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.start = -1;
|
this.start = -1;
|
||||||
|
@ -166,7 +166,7 @@ class LetterSegmenter implements ISegmenter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//判断是否锁定缓冲区
|
// 判断是否锁定缓冲区
|
||||||
needLock = this.start != -1 || this.end != -1;
|
needLock = this.start != -1 || this.end != -1;
|
||||||
return needLock;
|
return needLock;
|
||||||
}
|
}
|
||||||
|
@ -179,18 +179,18 @@ class LetterSegmenter implements ISegmenter {
|
||||||
private boolean processEnglishLetter(AnalyzeContext context) {
|
private boolean processEnglishLetter(AnalyzeContext context) {
|
||||||
boolean needLock;
|
boolean needLock;
|
||||||
|
|
||||||
if (this.englishStart == -1) {//当前的分词器尚未开始处理英文字符
|
if (this.englishStart == -1) {// 当前的分词器尚未开始处理英文字符
|
||||||
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
||||||
//记录起始指针的位置,标明分词器进入处理状态
|
// 记录起始指针的位置,标明分词器进入处理状态
|
||||||
this.englishStart = context.getCursor();
|
this.englishStart = context.getCursor();
|
||||||
this.englishEnd = this.englishStart;
|
this.englishEnd = this.englishStart;
|
||||||
}
|
}
|
||||||
} else {//当前的分词器正在处理英文字符
|
} else {// 当前的分词器正在处理英文字符
|
||||||
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
||||||
//记录当前指针位置为结束位置
|
// 记录当前指针位置为结束位置
|
||||||
this.englishEnd = context.getCursor();
|
this.englishEnd = context.getCursor();
|
||||||
} else {
|
} else {
|
||||||
//遇到非English字符,输出词元
|
// 遇到非English字符,输出词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.englishStart = -1;
|
this.englishStart = -1;
|
||||||
|
@ -198,10 +198,10 @@ class LetterSegmenter implements ISegmenter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//判断缓冲区是否已经读完
|
// 判断缓冲区是否已经读完
|
||||||
if (context.isBufferConsumed()) {
|
if (context.isBufferConsumed()) {
|
||||||
if (this.englishStart != -1 && this.englishEnd != -1) {
|
if (this.englishStart != -1 && this.englishEnd != -1) {
|
||||||
//缓冲已读完,输出词元
|
// 缓冲已读完,输出词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.englishStart = -1;
|
this.englishStart = -1;
|
||||||
|
@ -209,7 +209,7 @@ class LetterSegmenter implements ISegmenter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//判断是否锁定缓冲区
|
// 判断是否锁定缓冲区
|
||||||
needLock = this.englishStart != -1 || this.englishEnd != -1;
|
needLock = this.englishStart != -1 || this.englishEnd != -1;
|
||||||
return needLock;
|
return needLock;
|
||||||
}
|
}
|
||||||
|
@ -222,21 +222,21 @@ class LetterSegmenter implements ISegmenter {
|
||||||
private boolean processArabicLetter(AnalyzeContext context) {
|
private boolean processArabicLetter(AnalyzeContext context) {
|
||||||
boolean needLock;
|
boolean needLock;
|
||||||
|
|
||||||
if (this.arabicStart == -1) {//当前的分词器尚未开始处理数字字符
|
if (this.arabicStart == -1) {// 当前的分词器尚未开始处理数字字符
|
||||||
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
|
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
|
||||||
//记录起始指针的位置,标明分词器进入处理状态
|
// 记录起始指针的位置,标明分词器进入处理状态
|
||||||
this.arabicStart = context.getCursor();
|
this.arabicStart = context.getCursor();
|
||||||
this.arabicEnd = this.arabicStart;
|
this.arabicEnd = this.arabicStart;
|
||||||
}
|
}
|
||||||
} else {//当前的分词器正在处理数字字符
|
} else {// 当前的分词器正在处理数字字符
|
||||||
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
|
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
|
||||||
//记录当前指针位置为结束位置
|
// 记录当前指针位置为结束位置
|
||||||
this.arabicEnd = context.getCursor();
|
this.arabicEnd = context.getCursor();
|
||||||
}/* else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
|
}/* else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
|
||||||
&& this.isNumConnector(context.getCurrentChar())) {
|
&& this.isNumConnector(context.getCurrentChar())) {
|
||||||
//不输出数字,但不标记结束
|
// 不输出数字,但不标记结束
|
||||||
}*/ else {
|
}*/ else {
|
||||||
////遇到非Arabic字符,输出词元
|
// //遇到非Arabic字符,输出词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.arabicStart = -1;
|
this.arabicStart = -1;
|
||||||
|
@ -244,10 +244,10 @@ class LetterSegmenter implements ISegmenter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//判断缓冲区是否已经读完
|
// 判断缓冲区是否已经读完
|
||||||
if (context.isBufferConsumed()) {
|
if (context.isBufferConsumed()) {
|
||||||
if (this.arabicStart != -1 && this.arabicEnd != -1) {
|
if (this.arabicStart != -1 && this.arabicEnd != -1) {
|
||||||
//生成已切分的词元
|
// 生成已切分的词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.arabicStart = -1;
|
this.arabicStart = -1;
|
||||||
|
@ -255,7 +255,7 @@ class LetterSegmenter implements ISegmenter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//判断是否锁定缓冲区
|
// 判断是否锁定缓冲区
|
||||||
needLock = this.arabicStart != -1 || this.arabicEnd != -1;
|
needLock = this.arabicStart != -1 || this.arabicEnd != -1;
|
||||||
return needLock;
|
return needLock;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue