日志格式优化;
This commit is contained in:
parent
7149c54de7
commit
3ec8076730
|
@ -39,39 +39,39 @@ import java.util.*;
|
||||||
*/
|
*/
|
||||||
class AnalyzeContext {
|
class AnalyzeContext {
|
||||||
|
|
||||||
//默认缓冲区大小
|
// 默认缓冲区大小
|
||||||
private static final int BUFF_SIZE = 4096;
|
private static final int BUFF_SIZE = 4096;
|
||||||
//缓冲区耗尽的临界值
|
// 缓冲区耗尽的临界值
|
||||||
private static final int BUFF_EXHAUST_CRITICAL = 100;
|
private static final int BUFF_EXHAUST_CRITICAL = 100;
|
||||||
|
|
||||||
|
|
||||||
//字符窜读取缓冲
|
// 字符串读取缓冲
|
||||||
private char[] segmentBuff;
|
private char[] segmentBuff;
|
||||||
//字符类型数组
|
// 字符类型数组
|
||||||
private int[] charTypes;
|
private int[] charTypes;
|
||||||
|
|
||||||
|
|
||||||
//记录Reader内已分析的字串总长度
|
// 记录Reader内已分析的字串总长度
|
||||||
//在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移
|
// 在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移
|
||||||
private int buffOffset;
|
private int buffOffset;
|
||||||
//当前缓冲区位置指针
|
// 当前缓冲区位置指针
|
||||||
private int cursor;
|
private int cursor;
|
||||||
//最近一次读入的,可处理的字串长度
|
// 最近一次读入的,可处理的字串长度
|
||||||
private int available;
|
private int available;
|
||||||
|
|
||||||
|
|
||||||
//子分词器锁
|
// 子分词器锁
|
||||||
//该集合非空,说明有子分词器在占用segmentBuff
|
// 该集合非空,说明有子分词器在占用segmentBuff
|
||||||
private final Set<String> buffLocker;
|
private final Set<String> buffLocker;
|
||||||
|
|
||||||
//原始分词结果集合,未经歧义处理
|
// 原始分词结果集合,未经歧义处理
|
||||||
private QuickSortSet orgLexemes;
|
private QuickSortSet orgLexemes;
|
||||||
//LexemePath位置索引表
|
// LexemePath位置索引表
|
||||||
private final Map<Integer, LexemePath> pathMap;
|
private final Map<Integer, LexemePath> pathMap;
|
||||||
//最终分词结果集
|
// 最终分词结果集
|
||||||
private final LinkedList<Lexeme> results;
|
private final LinkedList<Lexeme> results;
|
||||||
|
|
||||||
//分词器配置项
|
// 分词器配置项
|
||||||
private final Configuration cfg;
|
private final Configuration cfg;
|
||||||
|
|
||||||
AnalyzeContext(Configuration cfg) {
|
AnalyzeContext(Configuration cfg) {
|
||||||
|
@ -113,21 +113,21 @@ class AnalyzeContext {
|
||||||
int fillBuffer(Reader reader) throws IOException {
|
int fillBuffer(Reader reader) throws IOException {
|
||||||
int readCount = 0;
|
int readCount = 0;
|
||||||
if (this.buffOffset == 0) {
|
if (this.buffOffset == 0) {
|
||||||
//首次读取reader
|
// 首次读取reader
|
||||||
readCount = reader.read(segmentBuff);
|
readCount = reader.read(segmentBuff);
|
||||||
} else {
|
} else {
|
||||||
int offset = this.available - this.cursor;
|
int offset = this.available - this.cursor;
|
||||||
if (offset > 0) {
|
if (offset > 0) {
|
||||||
//最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部
|
// 最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部
|
||||||
System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset);
|
System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset);
|
||||||
readCount = offset;
|
readCount = offset;
|
||||||
}
|
}
|
||||||
//继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分
|
// 继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分
|
||||||
readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset);
|
readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset);
|
||||||
}
|
}
|
||||||
//记录最后一次从Reader中读入的可用字符长度
|
// 记录最后一次从Reader中读入的可用字符长度
|
||||||
this.available = readCount;
|
this.available = readCount;
|
||||||
//重置当前指针
|
// 重置当前指针
|
||||||
this.cursor = 0;
|
this.cursor = 0;
|
||||||
return readCount;
|
return readCount;
|
||||||
}
|
}
|
||||||
|
@ -251,35 +251,35 @@ class AnalyzeContext {
|
||||||
void outputToResult() {
|
void outputToResult() {
|
||||||
int index = 0;
|
int index = 0;
|
||||||
while (index <= this.cursor) {
|
while (index <= this.cursor) {
|
||||||
//跳过非CJK字符
|
// 跳过非CJK字符
|
||||||
if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
|
if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
|
||||||
index++;
|
index++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
//从pathMap找出对应index位置的LexemePath
|
// 从pathMap找出对应index位置的LexemePath
|
||||||
LexemePath path = this.pathMap.get(index);
|
LexemePath path = this.pathMap.get(index);
|
||||||
if (path != null) {
|
if (path != null) {
|
||||||
//输出LexemePath中的lexeme到results集合
|
// 输出LexemePath中的lexeme到results集合
|
||||||
Lexeme l = path.pollFirst();
|
Lexeme l = path.pollFirst();
|
||||||
while (l != null) {
|
while (l != null) {
|
||||||
this.results.add(l);
|
this.results.add(l);
|
||||||
//将index移至lexeme后
|
// 将index移至lexeme后
|
||||||
index = l.getBegin() + l.getLength();
|
index = l.getBegin() + l.getLength();
|
||||||
l = path.pollFirst();
|
l = path.pollFirst();
|
||||||
if (l != null) {
|
if (l != null) {
|
||||||
//输出path内部,词元间遗漏的单字
|
// 输出path内部,词元间遗漏的单字
|
||||||
for (; index < l.getBegin(); index++) {
|
for (; index < l.getBegin(); index++) {
|
||||||
this.outputSingleCJK(index);
|
this.outputSingleCJK(index);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {//pathMap中找不到index对应的LexemePath
|
} else {// pathMap中找不到index对应的LexemePath
|
||||||
//单字输出
|
// 单字输出
|
||||||
this.outputSingleCJK(index);
|
this.outputSingleCJK(index);
|
||||||
index++;
|
index++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//清空当前的Map
|
// 清空当前的Map
|
||||||
this.pathMap.clear();
|
this.pathMap.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -304,16 +304,16 @@ class AnalyzeContext {
|
||||||
* 同时处理合并
|
* 同时处理合并
|
||||||
*/
|
*/
|
||||||
Lexeme getNextLexeme() {
|
Lexeme getNextLexeme() {
|
||||||
//从结果集取出,并移除第一个Lexme
|
// 从结果集取出,并移除第一个Lexeme
|
||||||
Lexeme result = this.results.pollFirst();
|
Lexeme result = this.results.pollFirst();
|
||||||
while (result != null) {
|
while (result != null) {
|
||||||
//数量词合并
|
// 数量词合并
|
||||||
this.compound(result);
|
this.compound(result);
|
||||||
if (Dictionary.getSingleton().isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) {
|
if (Dictionary.getSingleton().isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) {
|
||||||
//是停止词继续取列表的下一个
|
// 是停止词继续取列表的下一个
|
||||||
result = this.results.pollFirst();
|
result = this.results.pollFirst();
|
||||||
} else {
|
} else {
|
||||||
//不是停止词, 生成lexeme的词元文本,输出
|
// 不是停止词, 生成lexeme的词元文本,输出
|
||||||
result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength()));
|
result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength()));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -343,7 +343,7 @@ class AnalyzeContext {
|
||||||
if (!this.cfg.useSmart()) {
|
if (!this.cfg.useSmart()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
//数量词合并处理
|
// 数量词合并处理
|
||||||
if (!this.results.isEmpty()) {
|
if (!this.results.isEmpty()) {
|
||||||
|
|
||||||
if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
|
if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
|
||||||
|
@ -351,29 +351,29 @@ class AnalyzeContext {
|
||||||
boolean appendOk = false;
|
boolean appendOk = false;
|
||||||
if (nextLexeme != null) {
|
if (nextLexeme != null) {
|
||||||
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
|
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
|
||||||
//合并英文数词+中文数词
|
// 合并英文数词+中文数词
|
||||||
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
|
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
|
||||||
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
|
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
|
||||||
//合并英文数词+中文量词
|
// 合并英文数词+中文量词
|
||||||
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
|
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (appendOk) {
|
if (appendOk) {
|
||||||
//弹出
|
// 弹出
|
||||||
this.results.pollFirst();
|
this.results.pollFirst();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//可能存在第二轮合并
|
// 可能存在第二轮合并
|
||||||
if (Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()) {
|
if (Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()) {
|
||||||
Lexeme nextLexeme = this.results.peekFirst();
|
Lexeme nextLexeme = this.results.peekFirst();
|
||||||
boolean appendOk = false;
|
boolean appendOk = false;
|
||||||
if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
|
if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
|
||||||
//合并中文数词+中文量词
|
// 合并中文数词+中文量词
|
||||||
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
|
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
|
||||||
}
|
}
|
||||||
if (appendOk) {
|
if (appendOk) {
|
||||||
//弹出
|
// 弹出
|
||||||
this.results.pollFirst();
|
this.results.pollFirst();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue