Compare commits
No commits in common. "master" and "v8.5.0" have entirely different histories.
@ -6,6 +6,7 @@ ik-analyzer for solr 7.x-8.x
|
|||||||
[](https://github.com/magese/ik-analyzer-solr/releases)
|
[](https://github.com/magese/ik-analyzer-solr/releases)
|
||||||
[](./LICENSE)
|
[](./LICENSE)
|
||||||
[](https://travis-ci.org/magese/ik-analyzer-solr)
|
[](https://travis-ci.org/magese/ik-analyzer-solr)
|
||||||
|
[](http://hits.dwyl.io/magese/ik-analyzer-solr)
|
||||||
|
|
||||||
[](https://github.com/magese/ik-analyzer-solr/network/members)
|
[](https://github.com/magese/ik-analyzer-solr/network/members)
|
||||||
[](https://github.com/magese/ik-analyzer-solr/stargazers)
|
[](https://github.com/magese/ik-analyzer-solr/stargazers)
|
||||||
|
@ -76,7 +76,7 @@ public interface Configuration {
|
|||||||
*
|
*
|
||||||
* @return String 量词词典路径
|
* @return String 量词词典路径
|
||||||
*/
|
*/
|
||||||
String getQuantifierDictionary();
|
String getQuantifierDicionary();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 获取扩展字典配置路径
|
* 获取扩展字典配置路径
|
||||||
|
@ -145,7 +145,7 @@ public class DefaultConfig implements Configuration {
|
|||||||
*
|
*
|
||||||
* @return String 量词词典路径
|
* @return String 量词词典路径
|
||||||
*/
|
*/
|
||||||
public String getQuantifierDictionary() {
|
public String getQuantifierDicionary() {
|
||||||
return PATH_DIC_QUANTIFIER;
|
return PATH_DIC_QUANTIFIER;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -39,39 +39,39 @@ import java.util.*;
|
|||||||
*/
|
*/
|
||||||
class AnalyzeContext {
|
class AnalyzeContext {
|
||||||
|
|
||||||
// 默认缓冲区大小
|
//默认缓冲区大小
|
||||||
private static final int BUFF_SIZE = 4096;
|
private static final int BUFF_SIZE = 4096;
|
||||||
// 缓冲区耗尽的临界值
|
//缓冲区耗尽的临界值
|
||||||
private static final int BUFF_EXHAUST_CRITICAL = 100;
|
private static final int BUFF_EXHAUST_CRITICAL = 100;
|
||||||
|
|
||||||
|
|
||||||
// 字符窜读取缓冲
|
//字符窜读取缓冲
|
||||||
private char[] segmentBuff;
|
private char[] segmentBuff;
|
||||||
// 字符类型数组
|
//字符类型数组
|
||||||
private int[] charTypes;
|
private int[] charTypes;
|
||||||
|
|
||||||
|
|
||||||
// 记录Reader内已分析的字串总长度
|
//记录Reader内已分析的字串总长度
|
||||||
// 在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移
|
//在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移
|
||||||
private int buffOffset;
|
private int buffOffset;
|
||||||
// 当前缓冲区位置指针
|
//当前缓冲区位置指针
|
||||||
private int cursor;
|
private int cursor;
|
||||||
// 最近一次读入的,可处理的字串长度
|
//最近一次读入的,可处理的字串长度
|
||||||
private int available;
|
private int available;
|
||||||
|
|
||||||
|
|
||||||
// 子分词器锁
|
//子分词器锁
|
||||||
// 该集合非空,说明有子分词器在占用segmentBuff
|
//该集合非空,说明有子分词器在占用segmentBuff
|
||||||
private final Set<String> buffLocker;
|
private final Set<String> buffLocker;
|
||||||
|
|
||||||
// 原始分词结果集合,未经歧义处理
|
//原始分词结果集合,未经歧义处理
|
||||||
private QuickSortSet orgLexemes;
|
private QuickSortSet orgLexemes;
|
||||||
// LexemePath位置索引表
|
//LexemePath位置索引表
|
||||||
private final Map<Integer, LexemePath> pathMap;
|
private final Map<Integer, LexemePath> pathMap;
|
||||||
// 最终分词结果集
|
//最终分词结果集
|
||||||
private final LinkedList<Lexeme> results;
|
private final LinkedList<Lexeme> results;
|
||||||
|
|
||||||
// 分词器配置项
|
//分词器配置项
|
||||||
private final Configuration cfg;
|
private final Configuration cfg;
|
||||||
|
|
||||||
AnalyzeContext(Configuration cfg) {
|
AnalyzeContext(Configuration cfg) {
|
||||||
@ -113,21 +113,21 @@ class AnalyzeContext {
|
|||||||
int fillBuffer(Reader reader) throws IOException {
|
int fillBuffer(Reader reader) throws IOException {
|
||||||
int readCount = 0;
|
int readCount = 0;
|
||||||
if (this.buffOffset == 0) {
|
if (this.buffOffset == 0) {
|
||||||
// 首次读取reader
|
//首次读取reader
|
||||||
readCount = reader.read(segmentBuff);
|
readCount = reader.read(segmentBuff);
|
||||||
} else {
|
} else {
|
||||||
int offset = this.available - this.cursor;
|
int offset = this.available - this.cursor;
|
||||||
if (offset > 0) {
|
if (offset > 0) {
|
||||||
// 最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部
|
//最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部
|
||||||
System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset);
|
System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset);
|
||||||
readCount = offset;
|
readCount = offset;
|
||||||
}
|
}
|
||||||
// 继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分
|
//继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分
|
||||||
readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset);
|
readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset);
|
||||||
}
|
}
|
||||||
// 记录最后一次从Reader中读入的可用字符长度
|
//记录最后一次从Reader中读入的可用字符长度
|
||||||
this.available = readCount;
|
this.available = readCount;
|
||||||
// 重置当前指针
|
//重置当前指针
|
||||||
this.cursor = 0;
|
this.cursor = 0;
|
||||||
return readCount;
|
return readCount;
|
||||||
}
|
}
|
||||||
@ -251,35 +251,35 @@ class AnalyzeContext {
|
|||||||
void outputToResult() {
|
void outputToResult() {
|
||||||
int index = 0;
|
int index = 0;
|
||||||
while (index <= this.cursor) {
|
while (index <= this.cursor) {
|
||||||
// 跳过非CJK字符
|
//跳过非CJK字符
|
||||||
if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
|
if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
|
||||||
index++;
|
index++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// 从pathMap找出对应index位置的LexemePath
|
//从pathMap找出对应index位置的LexemePath
|
||||||
LexemePath path = this.pathMap.get(index);
|
LexemePath path = this.pathMap.get(index);
|
||||||
if (path != null) {
|
if (path != null) {
|
||||||
// 输出LexemePath中的lexeme到results集合
|
//输出LexemePath中的lexeme到results集合
|
||||||
Lexeme l = path.pollFirst();
|
Lexeme l = path.pollFirst();
|
||||||
while (l != null) {
|
while (l != null) {
|
||||||
this.results.add(l);
|
this.results.add(l);
|
||||||
// 将index移至lexeme后
|
//将index移至lexeme后
|
||||||
index = l.getBegin() + l.getLength();
|
index = l.getBegin() + l.getLength();
|
||||||
l = path.pollFirst();
|
l = path.pollFirst();
|
||||||
if (l != null) {
|
if (l != null) {
|
||||||
// 输出path内部,词元间遗漏的单字
|
//输出path内部,词元间遗漏的单字
|
||||||
for (; index < l.getBegin(); index++) {
|
for (; index < l.getBegin(); index++) {
|
||||||
this.outputSingleCJK(index);
|
this.outputSingleCJK(index);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {// pathMap中找不到index对应的LexemePath
|
} else {//pathMap中找不到index对应的LexemePath
|
||||||
// 单字输出
|
//单字输出
|
||||||
this.outputSingleCJK(index);
|
this.outputSingleCJK(index);
|
||||||
index++;
|
index++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 清空当前的Map
|
//清空当前的Map
|
||||||
this.pathMap.clear();
|
this.pathMap.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -304,16 +304,16 @@ class AnalyzeContext {
|
|||||||
* 同时处理合并
|
* 同时处理合并
|
||||||
*/
|
*/
|
||||||
Lexeme getNextLexeme() {
|
Lexeme getNextLexeme() {
|
||||||
// 从结果集取出,并移除第一个Lexme
|
//从结果集取出,并移除第一个Lexme
|
||||||
Lexeme result = this.results.pollFirst();
|
Lexeme result = this.results.pollFirst();
|
||||||
while (result != null) {
|
while (result != null) {
|
||||||
// 数量词合并
|
//数量词合并
|
||||||
this.compound(result);
|
this.compound(result);
|
||||||
if (Dictionary.getSingleton().isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) {
|
if (Dictionary.getSingleton().isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) {
|
||||||
// 是停止词继续取列表的下一个
|
//是停止词继续取列表的下一个
|
||||||
result = this.results.pollFirst();
|
result = this.results.pollFirst();
|
||||||
} else {
|
} else {
|
||||||
// 不是停止词, 生成lexeme的词元文本,输出
|
//不是停止词, 生成lexeme的词元文本,输出
|
||||||
result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength()));
|
result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength()));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -343,7 +343,7 @@ class AnalyzeContext {
|
|||||||
if (!this.cfg.useSmart()) {
|
if (!this.cfg.useSmart()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// 数量词合并处理
|
//数量词合并处理
|
||||||
if (!this.results.isEmpty()) {
|
if (!this.results.isEmpty()) {
|
||||||
|
|
||||||
if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
|
if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
|
||||||
@ -351,29 +351,29 @@ class AnalyzeContext {
|
|||||||
boolean appendOk = false;
|
boolean appendOk = false;
|
||||||
if (nextLexeme != null) {
|
if (nextLexeme != null) {
|
||||||
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
|
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
|
||||||
// 合并英文数词+中文数词
|
//合并英文数词+中文数词
|
||||||
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
|
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
|
||||||
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
|
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
|
||||||
// 合并英文数词+中文量词
|
//合并英文数词+中文量词
|
||||||
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
|
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (appendOk) {
|
if (appendOk) {
|
||||||
// 弹出
|
//弹出
|
||||||
this.results.pollFirst();
|
this.results.pollFirst();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 可能存在第二轮合并
|
//可能存在第二轮合并
|
||||||
if (Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()) {
|
if (Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()) {
|
||||||
Lexeme nextLexeme = this.results.peekFirst();
|
Lexeme nextLexeme = this.results.peekFirst();
|
||||||
boolean appendOk = false;
|
boolean appendOk = false;
|
||||||
if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
|
if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
|
||||||
// 合并中文数词+中文量词
|
//合并中文数词+中文量词
|
||||||
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
|
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
|
||||||
}
|
}
|
||||||
if (appendOk) {
|
if (appendOk) {
|
||||||
// 弹出
|
//弹出
|
||||||
this.results.pollFirst();
|
this.results.pollFirst();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -27,25 +27,25 @@
|
|||||||
*/
|
*/
|
||||||
package org.wltea.analyzer.core;
|
package org.wltea.analyzer.core;
|
||||||
|
|
||||||
import org.wltea.analyzer.dic.Dictionary;
|
|
||||||
import org.wltea.analyzer.dic.Hit;
|
|
||||||
|
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.wltea.analyzer.dic.Dictionary;
|
||||||
|
import org.wltea.analyzer.dic.Hit;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 中文-日韩文子分词器
|
* 中文-日韩文子分词器
|
||||||
*/
|
*/
|
||||||
class CJKSegmenter implements ISegmenter {
|
class CJKSegmenter implements ISegmenter {
|
||||||
|
|
||||||
// 子分词器标签
|
//子分词器标签
|
||||||
private static final String SEGMENTER_NAME = "CJK_SEGMENTER";
|
private static final String SEGMENTER_NAME = "CJK_SEGMENTER";
|
||||||
// 待处理的分词hit队列
|
//待处理的分词hit队列
|
||||||
private final List<Hit> tmpHits;
|
private List<Hit> tmpHits;
|
||||||
|
|
||||||
|
|
||||||
CJKSegmenter() {
|
CJKSegmenter(){
|
||||||
this.tmpHits = new LinkedList<>();
|
this.tmpHits = new LinkedList<>();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -53,65 +53,66 @@ class CJKSegmenter implements ISegmenter {
|
|||||||
* @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
|
* @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
|
||||||
*/
|
*/
|
||||||
public void analyze(AnalyzeContext context) {
|
public void analyze(AnalyzeContext context) {
|
||||||
if (CharacterUtil.CHAR_USELESS != context.getCurrentCharType()) {
|
if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){
|
||||||
|
|
||||||
// 优先处理tmpHits中的hit
|
//优先处理tmpHits中的hit
|
||||||
if (!this.tmpHits.isEmpty()) {
|
if(!this.tmpHits.isEmpty()){
|
||||||
// 处理词段队列
|
//处理词段队列
|
||||||
Hit[] tmpArray = this.tmpHits.toArray(new Hit[0]);
|
Hit[] tmpArray = this.tmpHits.toArray(new Hit[0]);
|
||||||
for (Hit hit : tmpArray) {
|
for(Hit hit : tmpArray){
|
||||||
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
|
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
|
||||||
if (hit.isMatch()) {
|
if(hit.isMatch()){
|
||||||
// 输出当前的词
|
//输出当前的词
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(), context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_CNWORD);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
|
|
||||||
if (!hit.isPrefix()) {// 不是词前缀,hit不需要继续匹配,移除
|
if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
|
||||||
this.tmpHits.remove(hit);
|
this.tmpHits.remove(hit);
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (hit.isUnmatch()) {
|
}else if(hit.isUnmatch()){
|
||||||
// hit不是词,移除
|
//hit不是词,移除
|
||||||
this.tmpHits.remove(hit);
|
this.tmpHits.remove(hit);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// *********************************
|
//*********************************
|
||||||
// 再对当前指针位置的字符进行单字匹配
|
//再对当前指针位置的字符进行单字匹配
|
||||||
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
|
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
|
||||||
|
if(singleCharHit.isMatch()){//首字成词
|
||||||
// 首字为词前缀
|
//输出当前的词
|
||||||
if (singleCharHit.isMatch()) {
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
|
||||||
// 输出当前的词
|
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_CNWORD);
|
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
}
|
|
||||||
|
|
||||||
// 前缀匹配则放入hit列表
|
//同时也是词前缀
|
||||||
if (singleCharHit.isPrefix()) {
|
if(singleCharHit.isPrefix()){
|
||||||
// 前缀匹配则放入hit列表
|
//前缀匹配则放入hit列表
|
||||||
|
this.tmpHits.add(singleCharHit);
|
||||||
|
}
|
||||||
|
}else if(singleCharHit.isPrefix()){//首字为词前缀
|
||||||
|
//前缀匹配则放入hit列表
|
||||||
this.tmpHits.add(singleCharHit);
|
this.tmpHits.add(singleCharHit);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
} else {
|
}else{
|
||||||
// 遇到CHAR_USELESS字符
|
//遇到CHAR_USELESS字符
|
||||||
// 清空队列
|
//清空队列
|
||||||
this.tmpHits.clear();
|
this.tmpHits.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
// 判断缓冲区是否已经读完
|
//判断缓冲区是否已经读完
|
||||||
if (context.isBufferConsumed()) {
|
if(context.isBufferConsumed()){
|
||||||
// 清空队列
|
//清空队列
|
||||||
this.tmpHits.clear();
|
this.tmpHits.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
// 判断是否锁定缓冲区
|
//判断是否锁定缓冲区
|
||||||
if (this.tmpHits.size() == 0) {
|
if(this.tmpHits.size() == 0){
|
||||||
context.unlockBuffer(SEGMENTER_NAME);
|
context.unlockBuffer(SEGMENTER_NAME);
|
||||||
|
|
||||||
} else {
|
}else{
|
||||||
context.lockBuffer(SEGMENTER_NAME);
|
context.lockBuffer(SEGMENTER_NAME);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -120,7 +121,7 @@ class CJKSegmenter implements ISegmenter {
|
|||||||
* @see org.wltea.analyzer.core.ISegmenter#reset()
|
* @see org.wltea.analyzer.core.ISegmenter#reset()
|
||||||
*/
|
*/
|
||||||
public void reset() {
|
public void reset() {
|
||||||
// 清空队列
|
//清空队列
|
||||||
this.tmpHits.clear();
|
this.tmpHits.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -36,21 +36,22 @@ import java.util.List;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
*
|
||||||
* 中文数量词子分词器
|
* 中文数量词子分词器
|
||||||
*/
|
*/
|
||||||
class CN_QuantifierSegmenter implements ISegmenter {
|
class CN_QuantifierSegmenter implements ISegmenter{
|
||||||
|
|
||||||
// 子分词器标签
|
//子分词器标签
|
||||||
private static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
|
private static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
|
||||||
|
|
||||||
private static final Set<Character> CHN_NUMBER_CHARS = new HashSet<>();
|
private static Set<Character> ChnNumberChars = new HashSet<>();
|
||||||
|
static{
|
||||||
static {
|
//中文数词
|
||||||
// 中文数词
|
//Cnum
|
||||||
String chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
|
String chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
|
||||||
char[] ca = chn_Num.toCharArray();
|
char[] ca = chn_Num.toCharArray();
|
||||||
for (char nChar : ca) {
|
for(char nChar : ca){
|
||||||
CHN_NUMBER_CHARS.add(nChar);
|
ChnNumberChars.add(nChar);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -66,11 +67,11 @@ class CN_QuantifierSegmenter implements ISegmenter {
|
|||||||
*/
|
*/
|
||||||
private int nEnd;
|
private int nEnd;
|
||||||
|
|
||||||
// 待处理的量词hit队列
|
//待处理的量词hit队列
|
||||||
private final List<Hit> countHits;
|
private List<Hit> countHits;
|
||||||
|
|
||||||
|
|
||||||
CN_QuantifierSegmenter() {
|
CN_QuantifierSegmenter(){
|
||||||
nStart = -1;
|
nStart = -1;
|
||||||
nEnd = -1;
|
nEnd = -1;
|
||||||
this.countHits = new LinkedList<>();
|
this.countHits = new LinkedList<>();
|
||||||
@ -80,16 +81,16 @@ class CN_QuantifierSegmenter implements ISegmenter {
|
|||||||
* 分词
|
* 分词
|
||||||
*/
|
*/
|
||||||
public void analyze(AnalyzeContext context) {
|
public void analyze(AnalyzeContext context) {
|
||||||
// 处理中文数词
|
//处理中文数词
|
||||||
this.processCNumber(context);
|
this.processCNumber(context);
|
||||||
// 处理中文量词
|
//处理中文量词
|
||||||
this.processCount(context);
|
this.processCount(context);
|
||||||
|
|
||||||
// 判断是否锁定缓冲区
|
//判断是否锁定缓冲区
|
||||||
if (this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()) {
|
if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){
|
||||||
// 对缓冲区解锁
|
//对缓冲区解锁
|
||||||
context.unlockBuffer(SEGMENTER_NAME);
|
context.unlockBuffer(SEGMENTER_NAME);
|
||||||
} else {
|
}else{
|
||||||
context.lockBuffer(SEGMENTER_NAME);
|
context.lockBuffer(SEGMENTER_NAME);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -107,34 +108,34 @@ class CN_QuantifierSegmenter implements ISegmenter {
|
|||||||
/**
|
/**
|
||||||
* 处理数词
|
* 处理数词
|
||||||
*/
|
*/
|
||||||
private void processCNumber(AnalyzeContext context) {
|
private void processCNumber(AnalyzeContext context){
|
||||||
if (nStart == -1 && nEnd == -1) {// 初始状态
|
if(nStart == -1 && nEnd == -1){//初始状态
|
||||||
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
|
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
|
||||||
&& CHN_NUMBER_CHARS.contains(context.getCurrentChar())) {
|
&& ChnNumberChars.contains(context.getCurrentChar())){
|
||||||
// 记录数词的起始、结束位置
|
//记录数词的起始、结束位置
|
||||||
nStart = context.getCursor();
|
nStart = context.getCursor();
|
||||||
nEnd = context.getCursor();
|
nEnd = context.getCursor();
|
||||||
}
|
}
|
||||||
} else {// 正在处理状态
|
}else{//正在处理状态
|
||||||
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
|
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
|
||||||
&& CHN_NUMBER_CHARS.contains(context.getCurrentChar())) {
|
&& ChnNumberChars.contains(context.getCurrentChar())){
|
||||||
// 记录数词的结束位置
|
//记录数词的结束位置
|
||||||
nEnd = context.getCursor();
|
nEnd = context.getCursor();
|
||||||
} else {
|
}else{
|
||||||
// 输出数词
|
//输出数词
|
||||||
this.outputNumLexeme(context);
|
this.outputNumLexeme(context);
|
||||||
// 重置头尾指针
|
//重置头尾指针
|
||||||
nStart = -1;
|
nStart = -1;
|
||||||
nEnd = -1;
|
nEnd = -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 缓冲区已经用完,还有尚未输出的数词
|
//缓冲区已经用完,还有尚未输出的数词
|
||||||
if (context.isBufferConsumed()) {
|
if(context.isBufferConsumed()){
|
||||||
if (nStart != -1 && nEnd != -1) {
|
if(nStart != -1 && nEnd != -1){
|
||||||
// 输出数词
|
//输出数词
|
||||||
outputNumLexeme(context);
|
outputNumLexeme(context);
|
||||||
// 重置头尾指针
|
//重置头尾指针
|
||||||
nStart = -1;
|
nStart = -1;
|
||||||
nEnd = -1;
|
nEnd = -1;
|
||||||
}
|
}
|
||||||
@ -143,65 +144,66 @@ class CN_QuantifierSegmenter implements ISegmenter {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 处理中文量词
|
* 处理中文量词
|
||||||
*
|
|
||||||
* @param context 需要处理的内容
|
* @param context 需要处理的内容
|
||||||
*/
|
*/
|
||||||
private void processCount(AnalyzeContext context) {
|
private void processCount(AnalyzeContext context){
|
||||||
// 判断是否需要启动量词扫描
|
// 判断是否需要启动量词扫描
|
||||||
if (!this.needCountScan(context)) {
|
if(!this.needCountScan(context)){
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) {
|
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){
|
||||||
|
|
||||||
// 优先处理countHits中的hit
|
//优先处理countHits中的hit
|
||||||
if (!this.countHits.isEmpty()) {
|
if(!this.countHits.isEmpty()){
|
||||||
// 处理词段队列
|
//处理词段队列
|
||||||
Hit[] tmpArray = this.countHits.toArray(new Hit[0]);
|
Hit[] tmpArray = this.countHits.toArray(new Hit[0]);
|
||||||
for (Hit hit : tmpArray) {
|
for(Hit hit : tmpArray){
|
||||||
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
|
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
|
||||||
if (hit.isMatch()) {
|
if(hit.isMatch()){
|
||||||
// 输出当前的词
|
//输出当前的词
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(), context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_COUNT);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
|
|
||||||
if (!hit.isPrefix()) {// 不是词前缀,hit不需要继续匹配,移除
|
if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
|
||||||
this.countHits.remove(hit);
|
this.countHits.remove(hit);
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (hit.isUnmatch()) {
|
}else if(hit.isUnmatch()){
|
||||||
// hit不是词,移除
|
//hit不是词,移除
|
||||||
this.countHits.remove(hit);
|
this.countHits.remove(hit);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// *********************************
|
//*********************************
|
||||||
// 对当前指针位置的字符进行单字匹配
|
//对当前指针位置的字符进行单字匹配
|
||||||
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
|
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
|
||||||
|
if(singleCharHit.isMatch()){//首字成量词词
|
||||||
// 首字为量词前缀
|
//输出当前的词
|
||||||
if (singleCharHit.isMatch()) {
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
|
||||||
// 输出当前的词
|
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_COUNT);
|
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
}
|
|
||||||
|
|
||||||
// 前缀匹配则放入hit列表
|
//同时也是词前缀
|
||||||
if (singleCharHit.isPrefix()) {
|
if(singleCharHit.isPrefix()){
|
||||||
// 前缀匹配则放入hit列表
|
//前缀匹配则放入hit列表
|
||||||
|
this.countHits.add(singleCharHit);
|
||||||
|
}
|
||||||
|
}else if(singleCharHit.isPrefix()){//首字为量词前缀
|
||||||
|
//前缀匹配则放入hit列表
|
||||||
this.countHits.add(singleCharHit);
|
this.countHits.add(singleCharHit);
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
|
||||||
// 输入的不是中文字符
|
}else{
|
||||||
// 清空未成形的量词
|
//输入的不是中文字符
|
||||||
|
//清空未成形的量词
|
||||||
this.countHits.clear();
|
this.countHits.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
// 缓冲区数据已经读完,还有尚未输出的量词
|
//缓冲区数据已经读完,还有尚未输出的量词
|
||||||
if (context.isBufferConsumed()) {
|
if(context.isBufferConsumed()){
|
||||||
// 清空未成形的量词
|
//清空未成形的量词
|
||||||
this.countHits.clear();
|
this.countHits.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -209,15 +211,15 @@ class CN_QuantifierSegmenter implements ISegmenter {
|
|||||||
/**
|
/**
|
||||||
* 判断是否需要扫描量词
|
* 判断是否需要扫描量词
|
||||||
*/
|
*/
|
||||||
private boolean needCountScan(AnalyzeContext context) {
|
private boolean needCountScan(AnalyzeContext context){
|
||||||
if ((nStart != -1 && nEnd != -1) || !countHits.isEmpty()) {
|
if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){
|
||||||
// 正在处理中文数词,或者正在处理量词
|
//正在处理中文数词,或者正在处理量词
|
||||||
return true;
|
return true;
|
||||||
} else {
|
}else{
|
||||||
// 找到一个相邻的数词
|
//找到一个相邻的数词
|
||||||
if (!context.getOrgLexemes().isEmpty()) {
|
if(!context.getOrgLexemes().isEmpty()){
|
||||||
Lexeme l = context.getOrgLexemes().peekLast();
|
Lexeme l = context.getOrgLexemes().peekLast();
|
||||||
if (Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()) {
|
if(Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()){
|
||||||
return l.getBegin() + l.getLength() == context.getCursor();
|
return l.getBegin() + l.getLength() == context.getCursor();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -227,13 +229,12 @@ class CN_QuantifierSegmenter implements ISegmenter {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 添加数词词元到结果集
|
* 添加数词词元到结果集
|
||||||
*
|
|
||||||
* @param context 需要添加的词元
|
* @param context 需要添加的词元
|
||||||
*/
|
*/
|
||||||
private void outputNumLexeme(AnalyzeContext context) {
|
private void outputNumLexeme(AnalyzeContext context){
|
||||||
if (nStart > -1 && nEnd > -1) {
|
if(nStart > -1 && nEnd > -1){
|
||||||
// 输出数词
|
//输出数词
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1, Lexeme.TYPE_CNUM);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -28,6 +28,7 @@
|
|||||||
package org.wltea.analyzer.core;
|
package org.wltea.analyzer.core;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
*
|
||||||
* 字符集识别工具类
|
* 字符集识别工具类
|
||||||
*/
|
*/
|
||||||
class CharacterUtil {
|
class CharacterUtil {
|
||||||
@ -45,28 +46,27 @@ class CharacterUtil {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 识别字符类型
|
* 识别字符类型
|
||||||
*
|
|
||||||
* @param input 需要识别的字符
|
* @param input 需要识别的字符
|
||||||
* @return int CharacterUtil定义的字符类型常量
|
* @return int CharacterUtil定义的字符类型常量
|
||||||
*/
|
*/
|
||||||
static int identifyCharType(char input) {
|
static int identifyCharType(char input){
|
||||||
if (input >= '0' && input <= '9') {
|
if(input >= '0' && input <= '9'){
|
||||||
return CHAR_ARABIC;
|
return CHAR_ARABIC;
|
||||||
|
|
||||||
} else if ((input >= 'a' && input <= 'z')
|
}else if((input >= 'a' && input <= 'z')
|
||||||
|| (input >= 'A' && input <= 'Z')) {
|
|| (input >= 'A' && input <= 'Z')){
|
||||||
return CHAR_ENGLISH;
|
return CHAR_ENGLISH;
|
||||||
|
|
||||||
} else {
|
}else {
|
||||||
Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
|
Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
|
||||||
|
|
||||||
if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|
if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|
||||||
|| ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
|
|| ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
|
||||||
|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
|
|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){
|
||||||
//目前已知的中文字符UTF-8集合
|
//目前已知的中文字符UTF-8集合
|
||||||
return CHAR_CHINESE;
|
return CHAR_CHINESE;
|
||||||
|
|
||||||
} else if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符
|
}else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符
|
||||||
//韩文字符集
|
//韩文字符集
|
||||||
|| ub == Character.UnicodeBlock.HANGUL_SYLLABLES
|
|| ub == Character.UnicodeBlock.HANGUL_SYLLABLES
|
||||||
|| ub == Character.UnicodeBlock.HANGUL_JAMO
|
|| ub == Character.UnicodeBlock.HANGUL_JAMO
|
||||||
@ -74,7 +74,7 @@ class CharacterUtil {
|
|||||||
//日文字符集
|
//日文字符集
|
||||||
|| ub == Character.UnicodeBlock.HIRAGANA //平假名
|
|| ub == Character.UnicodeBlock.HIRAGANA //平假名
|
||||||
|| ub == Character.UnicodeBlock.KATAKANA //片假名
|
|| ub == Character.UnicodeBlock.KATAKANA //片假名
|
||||||
|| ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
|
|| ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){
|
||||||
return CHAR_OTHER_CJK;
|
return CHAR_OTHER_CJK;
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -85,18 +85,17 @@ class CharacterUtil {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 进行字符规格化(全角转半角,大写转小写处理)
|
* 进行字符规格化(全角转半角,大写转小写处理)
|
||||||
*
|
|
||||||
* @param input 需要转换的字符
|
* @param input 需要转换的字符
|
||||||
* @return char
|
* @return char
|
||||||
*/
|
*/
|
||||||
static char regularize(char input) {
|
static char regularize(char input){
|
||||||
if (input == 12288) {
|
if (input == 12288) {
|
||||||
input = (char) 32;
|
input = (char) 32;
|
||||||
|
|
||||||
} else if (input > 65280 && input < 65375) {
|
}else if (input > 65280 && input < 65375) {
|
||||||
input = (char) (input - 65248);
|
input = (char) (input - 65248);
|
||||||
|
|
||||||
} else if (input >= 'A' && input <= 'Z') {
|
}else if (input >= 'A' && input <= 'Z') {
|
||||||
input += 32;
|
input += 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -35,7 +35,9 @@ import java.util.TreeSet;
|
|||||||
*/
|
*/
|
||||||
class IKArbitrator {
|
class IKArbitrator {
|
||||||
|
|
||||||
IKArbitrator() {}
|
IKArbitrator() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 分词歧义处理
|
* 分词歧义处理
|
||||||
@ -50,20 +52,20 @@ class IKArbitrator {
|
|||||||
LexemePath crossPath = new LexemePath();
|
LexemePath crossPath = new LexemePath();
|
||||||
while (orgLexeme != null) {
|
while (orgLexeme != null) {
|
||||||
if (!crossPath.addCrossLexeme(orgLexeme)) {
|
if (!crossPath.addCrossLexeme(orgLexeme)) {
|
||||||
// 找到与crossPath不相交的下一个crossPath
|
//找到与crossPath不相交的下一个crossPath
|
||||||
if (crossPath.size() == 1 || !useSmart) {
|
if (crossPath.size() == 1 || !useSmart) {
|
||||||
// crossPath没有歧义 或者 不做歧义处理
|
//crossPath没有歧义 或者 不做歧义处理
|
||||||
// 直接输出当前crossPath
|
//直接输出当前crossPath
|
||||||
context.addLexemePath(crossPath);
|
context.addLexemePath(crossPath);
|
||||||
} else {
|
} else {
|
||||||
// 对当前的crossPath进行歧义处理
|
//对当前的crossPath进行歧义处理
|
||||||
QuickSortSet.Cell headCell = crossPath.getHead();
|
QuickSortSet.Cell headCell = crossPath.getHead();
|
||||||
LexemePath judgeResult = this.judge(headCell);
|
LexemePath judgeResult = this.judge(headCell);
|
||||||
// 输出歧义处理结果judgeResult
|
//输出歧义处理结果judgeResult
|
||||||
context.addLexemePath(judgeResult);
|
context.addLexemePath(judgeResult);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 把orgLexeme加入新的crossPath中
|
//把orgLexeme加入新的crossPath中
|
||||||
crossPath = new LexemePath();
|
crossPath = new LexemePath();
|
||||||
crossPath.addCrossLexeme(orgLexeme);
|
crossPath.addCrossLexeme(orgLexeme);
|
||||||
}
|
}
|
||||||
@ -71,16 +73,16 @@ class IKArbitrator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// 处理最后的path
|
//处理最后的path
|
||||||
if (crossPath.size() == 1 || !useSmart) {
|
if (crossPath.size() == 1 || !useSmart) {
|
||||||
// crossPath没有歧义 或者 不做歧义处理
|
//crossPath没有歧义 或者 不做歧义处理
|
||||||
// 直接输出当前crossPath
|
//直接输出当前crossPath
|
||||||
context.addLexemePath(crossPath);
|
context.addLexemePath(crossPath);
|
||||||
} else {
|
} else {
|
||||||
// 对当前的crossPath进行歧义处理
|
//对当前的crossPath进行歧义处理
|
||||||
QuickSortSet.Cell headCell = crossPath.getHead();
|
QuickSortSet.Cell headCell = crossPath.getHead();
|
||||||
LexemePath judgeResult = this.judge(headCell);
|
LexemePath judgeResult = this.judge(headCell);
|
||||||
// 输出歧义处理结果judgeResult
|
//输出歧义处理结果judgeResult
|
||||||
context.addLexemePath(judgeResult);
|
context.addLexemePath(judgeResult);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -91,29 +93,29 @@ class IKArbitrator {
|
|||||||
* @param lexemeCell 歧义路径链表头
|
* @param lexemeCell 歧义路径链表头
|
||||||
*/
|
*/
|
||||||
private LexemePath judge(QuickSortSet.Cell lexemeCell) {
|
private LexemePath judge(QuickSortSet.Cell lexemeCell) {
|
||||||
// 候选路径集合
|
//候选路径集合
|
||||||
TreeSet<LexemePath> pathOptions = new TreeSet<>();
|
TreeSet<LexemePath> pathOptions = new TreeSet<>();
|
||||||
// 候选结果路径
|
//候选结果路径
|
||||||
LexemePath option = new LexemePath();
|
LexemePath option = new LexemePath();
|
||||||
|
|
||||||
// 对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈
|
//对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈
|
||||||
Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell, option);
|
Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell, option);
|
||||||
|
|
||||||
// 当前词元链并非最理想的,加入候选路径集合
|
//当前词元链并非最理想的,加入候选路径集合
|
||||||
pathOptions.add(option.copy());
|
pathOptions.add(option.copy());
|
||||||
|
|
||||||
// 存在歧义词,处理
|
//存在歧义词,处理
|
||||||
QuickSortSet.Cell c;
|
QuickSortSet.Cell c;
|
||||||
while (!lexemeStack.isEmpty()) {
|
while (!lexemeStack.isEmpty()) {
|
||||||
c = lexemeStack.pop();
|
c = lexemeStack.pop();
|
||||||
// 回滚词元链
|
//回滚词元链
|
||||||
this.backPath(c.getLexeme(), option);
|
this.backPath(c.getLexeme(), option);
|
||||||
// 从歧义词位置开始,递归,生成可选方案
|
//从歧义词位置开始,递归,生成可选方案
|
||||||
this.forwardPath(c, option);
|
this.forwardPath(c, option);
|
||||||
pathOptions.add(option.copy());
|
pathOptions.add(option.copy());
|
||||||
}
|
}
|
||||||
|
|
||||||
// 返回集合中的最优方案
|
//返回集合中的最优方案
|
||||||
return pathOptions.first();
|
return pathOptions.first();
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -122,13 +124,13 @@ class IKArbitrator {
|
|||||||
* 向前遍历,添加词元,构造一个无歧义词元组合
|
* 向前遍历,添加词元,构造一个无歧义词元组合
|
||||||
*/
|
*/
|
||||||
private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell, LexemePath option) {
|
private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell, LexemePath option) {
|
||||||
// 发生冲突的Lexeme栈
|
//发生冲突的Lexeme栈
|
||||||
Stack<QuickSortSet.Cell> conflictStack = new Stack<>();
|
Stack<QuickSortSet.Cell> conflictStack = new Stack<>();
|
||||||
QuickSortSet.Cell c = lexemeCell;
|
QuickSortSet.Cell c = lexemeCell;
|
||||||
// 迭代遍历Lexeme链表
|
//迭代遍历Lexeme链表
|
||||||
while (c != null && c.getLexeme() != null) {
|
while (c != null && c.getLexeme() != null) {
|
||||||
if (!option.addNotCrossLexeme(c.getLexeme())) {
|
if (!option.addNotCrossLexeme(c.getLexeme())) {
|
||||||
// 词元交叉,添加失败则加入lexemeStack栈
|
//词元交叉,添加失败则加入lexemeStack栈
|
||||||
conflictStack.push(c);
|
conflictStack.push(c);
|
||||||
}
|
}
|
||||||
c = c.getNext();
|
c = c.getNext();
|
||||||
|
@ -41,25 +41,15 @@ import java.util.List;
|
|||||||
*/
|
*/
|
||||||
public final class IKSegmenter {
|
public final class IKSegmenter {
|
||||||
|
|
||||||
/**
|
//字符窜reader
|
||||||
* 字符窜reader
|
|
||||||
*/
|
|
||||||
private Reader input;
|
private Reader input;
|
||||||
/**
|
//分词器配置项
|
||||||
* 分词器配置项
|
private Configuration cfg;
|
||||||
*/
|
//分词器上下文
|
||||||
private final Configuration cfg;
|
|
||||||
/**
|
|
||||||
* 分词器上下文
|
|
||||||
*/
|
|
||||||
private AnalyzeContext context;
|
private AnalyzeContext context;
|
||||||
/**
|
//分词处理器列表
|
||||||
* 分词处理器列表
|
|
||||||
*/
|
|
||||||
private List<ISegmenter> segmenters;
|
private List<ISegmenter> segmenters;
|
||||||
/**
|
//分词歧义裁决器
|
||||||
* 分词歧义裁决器
|
|
||||||
*/
|
|
||||||
private IKArbitrator arbitrator;
|
private IKArbitrator arbitrator;
|
||||||
|
|
||||||
|
|
||||||
@ -95,13 +85,13 @@ public final class IKSegmenter {
|
|||||||
* 初始化
|
* 初始化
|
||||||
*/
|
*/
|
||||||
private void init() {
|
private void init() {
|
||||||
// 初始化词典单例
|
//初始化词典单例
|
||||||
Dictionary.initial(this.cfg);
|
Dictionary.initial(this.cfg);
|
||||||
// 初始化分词上下文
|
//初始化分词上下文
|
||||||
this.context = new AnalyzeContext(this.cfg);
|
this.context = new AnalyzeContext(this.cfg);
|
||||||
// 加载子分词器
|
//加载子分词器
|
||||||
this.segmenters = this.loadSegmenters();
|
this.segmenters = this.loadSegmenters();
|
||||||
// 加载歧义裁决器
|
//加载歧义裁决器
|
||||||
this.arbitrator = new IKArbitrator();
|
this.arbitrator = new IKArbitrator();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -112,11 +102,11 @@ public final class IKSegmenter {
|
|||||||
*/
|
*/
|
||||||
private List<ISegmenter> loadSegmenters() {
|
private List<ISegmenter> loadSegmenters() {
|
||||||
List<ISegmenter> segmenters = new ArrayList<>(4);
|
List<ISegmenter> segmenters = new ArrayList<>(4);
|
||||||
// 处理字母的子分词器
|
//处理字母的子分词器
|
||||||
segmenters.add(new LetterSegmenter());
|
segmenters.add(new LetterSegmenter());
|
||||||
// 处理中文数量词的子分词器
|
//处理中文数量词的子分词器
|
||||||
segmenters.add(new CN_QuantifierSegmenter());
|
segmenters.add(new CN_QuantifierSegmenter());
|
||||||
// 处理中文词的子分词器
|
//处理中文词的子分词器
|
||||||
segmenters.add(new CJKSegmenter());
|
segmenters.add(new CJKSegmenter());
|
||||||
return segmenters;
|
return segmenters;
|
||||||
}
|
}
|
||||||
@ -136,34 +126,34 @@ public final class IKSegmenter {
|
|||||||
*/
|
*/
|
||||||
int available = context.fillBuffer(this.input);
|
int available = context.fillBuffer(this.input);
|
||||||
if (available <= 0) {
|
if (available <= 0) {
|
||||||
// reader已经读完
|
//reader已经读完
|
||||||
context.reset();
|
context.reset();
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// 初始化指针
|
//初始化指针
|
||||||
context.initCursor();
|
context.initCursor();
|
||||||
do {
|
do {
|
||||||
// 遍历子分词器
|
//遍历子分词器
|
||||||
for (ISegmenter segmenter : segmenters) {
|
for (ISegmenter segmenter : segmenters) {
|
||||||
segmenter.analyze(context);
|
segmenter.analyze(context);
|
||||||
}
|
}
|
||||||
// 字符缓冲区接近读完,需要读入新的字符
|
//字符缓冲区接近读完,需要读入新的字符
|
||||||
if (context.needRefillBuffer()) {
|
if (context.needRefillBuffer()) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// 向前移动指针
|
//向前移动指针
|
||||||
} while (context.moveCursor());
|
} while (context.moveCursor());
|
||||||
// 重置子分词器,为下轮循环进行初始化
|
//重置子分词器,为下轮循环进行初始化
|
||||||
for (ISegmenter segmenter : segmenters) {
|
for (ISegmenter segmenter : segmenters) {
|
||||||
segmenter.reset();
|
segmenter.reset();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 对分词进行歧义处理
|
//对分词进行歧义处理
|
||||||
this.arbitrator.process(context, this.cfg.useSmart());
|
this.arbitrator.process(context, this.cfg.useSmart());
|
||||||
// 将分词结果输出到结果集,并处理未切分的单个CJK字符
|
//将分词结果输出到结果集,并处理未切分的单个CJK字符
|
||||||
context.outputToResult();
|
context.outputToResult();
|
||||||
// 记录本次分词的缓冲区位移
|
//记录本次分词的缓冲区位移
|
||||||
context.markBufferOffset();
|
context.markBufferOffset();
|
||||||
}
|
}
|
||||||
return l;
|
return l;
|
||||||
|
@ -29,13 +29,13 @@ package org.wltea.analyzer.core;
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
*
|
||||||
* 子分词器接口
|
* 子分词器接口
|
||||||
*/
|
*/
|
||||||
interface ISegmenter {
|
interface ISegmenter {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 从分析器读取下一个可能分解的词元对象
|
* 从分析器读取下一个可能分解的词元对象
|
||||||
*
|
|
||||||
* @param context 分词算法上下文
|
* @param context 分词算法上下文
|
||||||
*/
|
*/
|
||||||
void analyze(AnalyzeContext context);
|
void analyze(AnalyzeContext context);
|
||||||
|
@ -34,18 +34,14 @@ import java.util.Arrays;
|
|||||||
*/
|
*/
|
||||||
class LetterSegmenter implements ISegmenter {
|
class LetterSegmenter implements ISegmenter {
|
||||||
|
|
||||||
/**
|
//子分词器标签
|
||||||
* 子分词器标签
|
|
||||||
*/
|
|
||||||
private static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
|
private static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
|
||||||
/**
|
//链接符号
|
||||||
* 链接符号
|
|
||||||
*/
|
|
||||||
private static final char[] Letter_Connector = new char[]{'#', '&', '+', '-', '.', '@', '_'};
|
private static final char[] Letter_Connector = new char[]{'#', '&', '+', '-', '.', '@', '_'};
|
||||||
/**
|
|
||||||
* 数字符号
|
//数字符号
|
||||||
*/
|
|
||||||
private static final char[] Num_Connector = new char[]{',', '.'};
|
private static final char[] Num_Connector = new char[]{',', '.'};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 词元的开始位置,
|
* 词元的开始位置,
|
||||||
* 同时作为子分词器状态标识
|
* 同时作为子分词器状态标识
|
||||||
@ -57,18 +53,22 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
* end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
|
* end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
|
||||||
*/
|
*/
|
||||||
private int end;
|
private int end;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 字母起始位置
|
* 字母起始位置
|
||||||
*/
|
*/
|
||||||
private int englishStart;
|
private int englishStart;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 字母结束位置
|
* 字母结束位置
|
||||||
*/
|
*/
|
||||||
private int englishEnd;
|
private int englishEnd;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 阿拉伯数字起始位置
|
* 阿拉伯数字起始位置
|
||||||
*/
|
*/
|
||||||
private int arabicStart;
|
private int arabicStart;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 阿拉伯数字结束位置
|
* 阿拉伯数字结束位置
|
||||||
*/
|
*/
|
||||||
@ -91,18 +91,18 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
*/
|
*/
|
||||||
public void analyze(AnalyzeContext context) {
|
public void analyze(AnalyzeContext context) {
|
||||||
boolean bufferLockFlag;
|
boolean bufferLockFlag;
|
||||||
// 处理英文字母
|
//处理英文字母
|
||||||
bufferLockFlag = this.processEnglishLetter(context);
|
bufferLockFlag = this.processEnglishLetter(context);
|
||||||
// 处理阿拉伯字母
|
//处理阿拉伯字母
|
||||||
bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
|
bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
|
||||||
// 处理混合字母(这个要放最后处理,可以通过QuickSortSet排除重复)
|
//处理混合字母(这个要放最后处理,可以通过QuickSortSet排除重复)
|
||||||
bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
|
bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
|
||||||
|
|
||||||
// 判断是否锁定缓冲区
|
//判断是否锁定缓冲区
|
||||||
if (bufferLockFlag) {
|
if (bufferLockFlag) {
|
||||||
context.lockBuffer(SEGMENTER_NAME);
|
context.lockBuffer(SEGMENTER_NAME);
|
||||||
} else {
|
} else {
|
||||||
// 对缓冲区解锁
|
//对缓冲区解锁
|
||||||
context.unlockBuffer(SEGMENTER_NAME);
|
context.unlockBuffer(SEGMENTER_NAME);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -128,26 +128,26 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
private boolean processMixLetter(AnalyzeContext context) {
|
private boolean processMixLetter(AnalyzeContext context) {
|
||||||
boolean needLock;
|
boolean needLock;
|
||||||
|
|
||||||
if (this.start == -1) {// 当前的分词器尚未开始处理字符
|
if (this.start == -1) {//当前的分词器尚未开始处理字符
|
||||||
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|
||||||
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
||||||
// 记录起始指针的位置,标明分词器进入处理状态
|
//记录起始指针的位置,标明分词器进入处理状态
|
||||||
this.start = context.getCursor();
|
this.start = context.getCursor();
|
||||||
this.end = start;
|
this.end = start;
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {// 当前的分词器正在处理字符
|
} else {//当前的分词器正在处理字符
|
||||||
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|
||||||
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
||||||
// 记录下可能的结束位置
|
//记录下可能的结束位置
|
||||||
this.end = context.getCursor();
|
this.end = context.getCursor();
|
||||||
|
|
||||||
} else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
|
} else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
|
||||||
&& this.isLetterConnector(context.getCurrentChar())) {
|
&& this.isLetterConnector(context.getCurrentChar())) {
|
||||||
// 记录下可能的结束位置
|
//记录下可能的结束位置
|
||||||
this.end = context.getCursor();
|
this.end = context.getCursor();
|
||||||
} else {
|
} else {
|
||||||
// 遇到非Letter字符,输出词元
|
//遇到非Letter字符,输出词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.start = -1;
|
this.start = -1;
|
||||||
@ -155,10 +155,10 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 判断缓冲区是否已经读完
|
//判断缓冲区是否已经读完
|
||||||
if (context.isBufferConsumed()) {
|
if (context.isBufferConsumed()) {
|
||||||
if (this.start != -1 && this.end != -1) {
|
if (this.start != -1 && this.end != -1) {
|
||||||
// 缓冲以读完,输出词元
|
//缓冲以读完,输出词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.start = -1;
|
this.start = -1;
|
||||||
@ -166,7 +166,7 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 判断是否锁定缓冲区
|
//判断是否锁定缓冲区
|
||||||
needLock = this.start != -1 || this.end != -1;
|
needLock = this.start != -1 || this.end != -1;
|
||||||
return needLock;
|
return needLock;
|
||||||
}
|
}
|
||||||
@ -179,18 +179,18 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
private boolean processEnglishLetter(AnalyzeContext context) {
|
private boolean processEnglishLetter(AnalyzeContext context) {
|
||||||
boolean needLock;
|
boolean needLock;
|
||||||
|
|
||||||
if (this.englishStart == -1) {// 当前的分词器尚未开始处理英文字符
|
if (this.englishStart == -1) {//当前的分词器尚未开始处理英文字符
|
||||||
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
||||||
// 记录起始指针的位置,标明分词器进入处理状态
|
//记录起始指针的位置,标明分词器进入处理状态
|
||||||
this.englishStart = context.getCursor();
|
this.englishStart = context.getCursor();
|
||||||
this.englishEnd = this.englishStart;
|
this.englishEnd = this.englishStart;
|
||||||
}
|
}
|
||||||
} else {// 当前的分词器正在处理英文字符
|
} else {//当前的分词器正在处理英文字符
|
||||||
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
|
||||||
// 记录当前指针位置为结束位置
|
//记录当前指针位置为结束位置
|
||||||
this.englishEnd = context.getCursor();
|
this.englishEnd = context.getCursor();
|
||||||
} else {
|
} else {
|
||||||
// 遇到非English字符,输出词元
|
//遇到非English字符,输出词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.englishStart = -1;
|
this.englishStart = -1;
|
||||||
@ -198,10 +198,10 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 判断缓冲区是否已经读完
|
//判断缓冲区是否已经读完
|
||||||
if (context.isBufferConsumed()) {
|
if (context.isBufferConsumed()) {
|
||||||
if (this.englishStart != -1 && this.englishEnd != -1) {
|
if (this.englishStart != -1 && this.englishEnd != -1) {
|
||||||
// 缓冲以读完,输出词元
|
//缓冲以读完,输出词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.englishStart = -1;
|
this.englishStart = -1;
|
||||||
@ -209,7 +209,7 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 判断是否锁定缓冲区
|
//判断是否锁定缓冲区
|
||||||
needLock = this.englishStart != -1 || this.englishEnd != -1;
|
needLock = this.englishStart != -1 || this.englishEnd != -1;
|
||||||
return needLock;
|
return needLock;
|
||||||
}
|
}
|
||||||
@ -222,21 +222,21 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
private boolean processArabicLetter(AnalyzeContext context) {
|
private boolean processArabicLetter(AnalyzeContext context) {
|
||||||
boolean needLock;
|
boolean needLock;
|
||||||
|
|
||||||
if (this.arabicStart == -1) {// 当前的分词器尚未开始处理数字字符
|
if (this.arabicStart == -1) {//当前的分词器尚未开始处理数字字符
|
||||||
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
|
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
|
||||||
// 记录起始指针的位置,标明分词器进入处理状态
|
//记录起始指针的位置,标明分词器进入处理状态
|
||||||
this.arabicStart = context.getCursor();
|
this.arabicStart = context.getCursor();
|
||||||
this.arabicEnd = this.arabicStart;
|
this.arabicEnd = this.arabicStart;
|
||||||
}
|
}
|
||||||
} else {// 当前的分词器正在处理数字字符
|
} else {//当前的分词器正在处理数字字符
|
||||||
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
|
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
|
||||||
// 记录当前指针位置为结束位置
|
//记录当前指针位置为结束位置
|
||||||
this.arabicEnd = context.getCursor();
|
this.arabicEnd = context.getCursor();
|
||||||
}/* else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
|
}/* else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
|
||||||
&& this.isNumConnector(context.getCurrentChar())) {
|
&& this.isNumConnector(context.getCurrentChar())) {
|
||||||
// 不输出数字,但不标记结束
|
//不输出数字,但不标记结束
|
||||||
}*/ else {
|
}*/ else {
|
||||||
// //遇到非Arabic字符,输出词元
|
////遇到非Arabic字符,输出词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.arabicStart = -1;
|
this.arabicStart = -1;
|
||||||
@ -244,10 +244,10 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 判断缓冲区是否已经读完
|
//判断缓冲区是否已经读完
|
||||||
if (context.isBufferConsumed()) {
|
if (context.isBufferConsumed()) {
|
||||||
if (this.arabicStart != -1 && this.arabicEnd != -1) {
|
if (this.arabicStart != -1 && this.arabicEnd != -1) {
|
||||||
// 生成已切分的词元
|
//生成已切分的词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
|
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.arabicStart = -1;
|
this.arabicStart = -1;
|
||||||
@ -255,7 +255,7 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 判断是否锁定缓冲区
|
//判断是否锁定缓冲区
|
||||||
needLock = this.arabicStart != -1 || this.arabicEnd != -1;
|
needLock = this.arabicStart != -1 || this.arabicEnd != -1;
|
||||||
return needLock;
|
return needLock;
|
||||||
}
|
}
|
||||||
|
@ -31,69 +31,42 @@ package org.wltea.analyzer.core;
|
|||||||
* IK词元对象
|
* IK词元对象
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("unused")
|
@SuppressWarnings("unused")
|
||||||
public class Lexeme implements Comparable<Lexeme> {
|
public class Lexeme implements Comparable<Lexeme>{
|
||||||
/**
|
//英文
|
||||||
* 英文
|
|
||||||
*/
|
|
||||||
static final int TYPE_ENGLISH = 1;
|
static final int TYPE_ENGLISH = 1;
|
||||||
/**
|
//数字
|
||||||
* 数字
|
|
||||||
*/
|
|
||||||
static final int TYPE_ARABIC = 2;
|
static final int TYPE_ARABIC = 2;
|
||||||
/**
|
//英文数字混合
|
||||||
* 英文数字混合
|
|
||||||
*/
|
|
||||||
static final int TYPE_LETTER = 3;
|
static final int TYPE_LETTER = 3;
|
||||||
/**
|
//中文词元
|
||||||
* 中文词元
|
|
||||||
*/
|
|
||||||
static final int TYPE_CNWORD = 4;
|
static final int TYPE_CNWORD = 4;
|
||||||
/**
|
//中文单字
|
||||||
* 中文单字
|
|
||||||
*/
|
|
||||||
static final int TYPE_CNCHAR = 64;
|
static final int TYPE_CNCHAR = 64;
|
||||||
/**
|
//日韩文字
|
||||||
* 日韩文字
|
|
||||||
*/
|
|
||||||
static final int TYPE_OTHER_CJK = 8;
|
static final int TYPE_OTHER_CJK = 8;
|
||||||
/**
|
//中文数词
|
||||||
* 中文数词
|
|
||||||
*/
|
|
||||||
static final int TYPE_CNUM = 16;
|
static final int TYPE_CNUM = 16;
|
||||||
/**
|
//中文量词
|
||||||
* 中文量词
|
|
||||||
*/
|
|
||||||
static final int TYPE_COUNT = 32;
|
static final int TYPE_COUNT = 32;
|
||||||
/**
|
//中文数量词
|
||||||
* 中文数量词
|
|
||||||
*/
|
|
||||||
static final int TYPE_CQUAN = 48;
|
static final int TYPE_CQUAN = 48;
|
||||||
/**
|
|
||||||
* 词元的起始位移
|
//词元的起始位移
|
||||||
*/
|
|
||||||
private int offset;
|
private int offset;
|
||||||
/**
|
//词元的相对起始位置
|
||||||
* 词元的相对起始位置
|
|
||||||
*/
|
|
||||||
private int begin;
|
private int begin;
|
||||||
/**
|
//词元的长度
|
||||||
* 词元的长度
|
|
||||||
*/
|
|
||||||
private int length;
|
private int length;
|
||||||
/**
|
//词元文本
|
||||||
* 词元文本
|
|
||||||
*/
|
|
||||||
private String lexemeText;
|
private String lexemeText;
|
||||||
/**
|
//词元类型
|
||||||
* 词元类型
|
|
||||||
*/
|
|
||||||
private int lexemeType;
|
private int lexemeType;
|
||||||
|
|
||||||
|
|
||||||
public Lexeme(int offset, int begin, int length, int lexemeType) {
|
public Lexeme(int offset , int begin , int length , int lexemeType){
|
||||||
this.offset = offset;
|
this.offset = offset;
|
||||||
this.begin = begin;
|
this.begin = begin;
|
||||||
if (length < 0) {
|
if(length < 0){
|
||||||
throw new IllegalArgumentException("length < 0");
|
throw new IllegalArgumentException("length < 0");
|
||||||
}
|
}
|
||||||
this.length = length;
|
this.length = length;
|
||||||
@ -105,21 +78,21 @@ public class Lexeme implements Comparable<Lexeme> {
|
|||||||
* 起始位置偏移、起始位置、终止位置相同
|
* 起始位置偏移、起始位置、终止位置相同
|
||||||
* @see java.lang.Object#equals(Object o)
|
* @see java.lang.Object#equals(Object o)
|
||||||
*/
|
*/
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o){
|
||||||
if (o == null) {
|
if(o == null){
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this == o) {
|
if(this == o){
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (o instanceof Lexeme) {
|
if(o instanceof Lexeme){
|
||||||
Lexeme other = (Lexeme) o;
|
Lexeme other = (Lexeme)o;
|
||||||
return this.offset == other.getOffset()
|
return this.offset == other.getOffset()
|
||||||
&& this.begin == other.getBegin()
|
&& this.begin == other.getBegin()
|
||||||
&& this.length == other.getLength();
|
&& this.length == other.getLength();
|
||||||
} else {
|
}else{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -128,7 +101,7 @@ public class Lexeme implements Comparable<Lexeme> {
|
|||||||
* 词元哈希编码算法
|
* 词元哈希编码算法
|
||||||
* @see java.lang.Object#hashCode()
|
* @see java.lang.Object#hashCode()
|
||||||
*/
|
*/
|
||||||
public int hashCode() {
|
public int hashCode(){
|
||||||
int absBegin = getBeginPosition();
|
int absBegin = getBeginPosition();
|
||||||
int absEnd = getEndPosition();
|
int absEnd = getEndPosition();
|
||||||
return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
|
return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
|
||||||
@ -139,15 +112,15 @@ public class Lexeme implements Comparable<Lexeme> {
|
|||||||
* @see java.lang.Comparable#compareTo(java.lang.Object)
|
* @see java.lang.Comparable#compareTo(java.lang.Object)
|
||||||
*/
|
*/
|
||||||
public int compareTo(Lexeme other) {
|
public int compareTo(Lexeme other) {
|
||||||
// 起始位置优先
|
//起始位置优先
|
||||||
if (this.begin < other.getBegin()) {
|
if(this.begin < other.getBegin()){
|
||||||
return -1;
|
return -1;
|
||||||
} else if (this.begin == other.getBegin()) {
|
}else if(this.begin == other.getBegin()){
|
||||||
// 词元长度优先
|
//词元长度优先
|
||||||
// this.length < other.getLength()
|
//this.length < other.getLength()
|
||||||
return Integer.compare(other.getLength(), this.length);
|
return Integer.compare(other.getLength(), this.length);
|
||||||
|
|
||||||
} else {
|
}else{//this.begin > other.getBegin()
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -163,13 +136,11 @@ public class Lexeme implements Comparable<Lexeme> {
|
|||||||
int getBegin() {
|
int getBegin() {
|
||||||
return begin;
|
return begin;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 获取词元在文本中的起始位置
|
* 获取词元在文本中的起始位置
|
||||||
*
|
|
||||||
* @return int
|
* @return int
|
||||||
*/
|
*/
|
||||||
public int getBeginPosition() {
|
public int getBeginPosition(){
|
||||||
return offset + begin;
|
return offset + begin;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -179,24 +150,22 @@ public class Lexeme implements Comparable<Lexeme> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 获取词元在文本中的结束位置
|
* 获取词元在文本中的结束位置
|
||||||
*
|
|
||||||
* @return int
|
* @return int
|
||||||
*/
|
*/
|
||||||
public int getEndPosition() {
|
public int getEndPosition(){
|
||||||
return offset + begin + length;
|
return offset + begin + length;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 获取词元的字符长度
|
* 获取词元的字符长度
|
||||||
*
|
|
||||||
* @return int
|
* @return int
|
||||||
*/
|
*/
|
||||||
public int getLength() {
|
public int getLength(){
|
||||||
return this.length;
|
return this.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setLength(int length) {
|
public void setLength(int length) {
|
||||||
if (this.length < 0) {
|
if(this.length < 0){
|
||||||
throw new IllegalArgumentException("length < 0");
|
throw new IllegalArgumentException("length < 0");
|
||||||
}
|
}
|
||||||
this.length = length;
|
this.length = length;
|
||||||
@ -204,21 +173,20 @@ public class Lexeme implements Comparable<Lexeme> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 获取词元的文本内容
|
* 获取词元的文本内容
|
||||||
*
|
|
||||||
* @return String
|
* @return String
|
||||||
*/
|
*/
|
||||||
public String getLexemeText() {
|
public String getLexemeText() {
|
||||||
if (lexemeText == null) {
|
if(lexemeText == null){
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
return lexemeText;
|
return lexemeText;
|
||||||
}
|
}
|
||||||
|
|
||||||
void setLexemeText(String lexemeText) {
|
void setLexemeText(String lexemeText) {
|
||||||
if (lexemeText == null) {
|
if(lexemeText == null){
|
||||||
this.lexemeText = "";
|
this.lexemeText = "";
|
||||||
this.length = 0;
|
this.length = 0;
|
||||||
} else {
|
}else{
|
||||||
this.lexemeText = lexemeText;
|
this.lexemeText = lexemeText;
|
||||||
this.length = lexemeText.length();
|
this.length = lexemeText.length();
|
||||||
}
|
}
|
||||||
@ -226,7 +194,6 @@ public class Lexeme implements Comparable<Lexeme> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 获取词元类型
|
* 获取词元类型
|
||||||
*
|
|
||||||
* @return int
|
* @return int
|
||||||
*/
|
*/
|
||||||
int getLexemeType() {
|
int getLexemeType() {
|
||||||
@ -235,41 +202,40 @@ public class Lexeme implements Comparable<Lexeme> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 获取词元类型标示字符串
|
* 获取词元类型标示字符串
|
||||||
*
|
|
||||||
* @return String
|
* @return String
|
||||||
*/
|
*/
|
||||||
public String getLexemeTypeString() {
|
public String getLexemeTypeString(){
|
||||||
switch (lexemeType) {
|
switch(lexemeType) {
|
||||||
|
|
||||||
case TYPE_ENGLISH:
|
case TYPE_ENGLISH :
|
||||||
return "ENGLISH";
|
return "ENGLISH";
|
||||||
|
|
||||||
case TYPE_ARABIC:
|
case TYPE_ARABIC :
|
||||||
return "ARABIC";
|
return "ARABIC";
|
||||||
|
|
||||||
case TYPE_LETTER:
|
case TYPE_LETTER :
|
||||||
return "LETTER";
|
return "LETTER";
|
||||||
|
|
||||||
case TYPE_CNWORD:
|
case TYPE_CNWORD :
|
||||||
return "CN_WORD";
|
return "CN_WORD";
|
||||||
|
|
||||||
case TYPE_CNCHAR:
|
case TYPE_CNCHAR :
|
||||||
return "CN_CHAR";
|
return "CN_CHAR";
|
||||||
|
|
||||||
case TYPE_OTHER_CJK:
|
case TYPE_OTHER_CJK :
|
||||||
return "OTHER_CJK";
|
return "OTHER_CJK";
|
||||||
|
|
||||||
case TYPE_COUNT:
|
case TYPE_COUNT :
|
||||||
return "COUNT";
|
return "COUNT";
|
||||||
|
|
||||||
case TYPE_CNUM:
|
case TYPE_CNUM :
|
||||||
return "TYPE_CNUM";
|
return "TYPE_CNUM";
|
||||||
|
|
||||||
case TYPE_CQUAN:
|
case TYPE_CQUAN:
|
||||||
return "TYPE_CQUAN";
|
return "TYPE_CQUAN";
|
||||||
|
|
||||||
default:
|
default :
|
||||||
return "UNKNOWN";
|
return "UNKONW";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -280,25 +246,23 @@ public class Lexeme implements Comparable<Lexeme> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 合并两个相邻的词元
|
* 合并两个相邻的词元
|
||||||
*
|
|
||||||
* @return boolean 词元是否成功合并
|
* @return boolean 词元是否成功合并
|
||||||
*/
|
*/
|
||||||
boolean append(Lexeme l, int lexemeType) {
|
boolean append(Lexeme l, int lexemeType){
|
||||||
if (l != null && this.getEndPosition() == l.getBeginPosition()) {
|
if(l != null && this.getEndPosition() == l.getBeginPosition()){
|
||||||
this.length += l.getLength();
|
this.length += l.getLength();
|
||||||
this.lexemeType = lexemeType;
|
this.lexemeType = lexemeType;
|
||||||
return true;
|
return true;
|
||||||
} else {
|
}else {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* ToString 方法
|
|
||||||
*
|
*
|
||||||
* @return 字符串输出
|
|
||||||
*/
|
*/
|
||||||
public String toString() {
|
public String toString(){
|
||||||
return this.getBeginPosition() + "-" + this.getEndPosition() +
|
return this.getBeginPosition() + "-" + this.getEndPosition() +
|
||||||
" : " + this.lexemeText + " : \t" +
|
" : " + this.lexemeText + " : \t" +
|
||||||
this.getLexemeTypeString();
|
this.getLexemeTypeString();
|
||||||
|
@ -34,17 +34,11 @@ package org.wltea.analyzer.core;
|
|||||||
@SuppressWarnings("unused")
|
@SuppressWarnings("unused")
|
||||||
class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
||||||
|
|
||||||
/**
|
//起始位置
|
||||||
* 起始位置
|
|
||||||
*/
|
|
||||||
private int pathBegin;
|
private int pathBegin;
|
||||||
/**
|
//结束
|
||||||
* 结束
|
|
||||||
*/
|
|
||||||
private int pathEnd;
|
private int pathEnd;
|
||||||
/**
|
//词元链的有效字符长度
|
||||||
* 词元链的有效字符长度
|
|
||||||
*/
|
|
||||||
private int payloadLength;
|
private int payloadLength;
|
||||||
|
|
||||||
LexemePath() {
|
LexemePath() {
|
||||||
@ -106,6 +100,7 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 移除尾部的Lexeme
|
* 移除尾部的Lexeme
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
void removeTail() {
|
void removeTail() {
|
||||||
Lexeme tail = this.pollLast();
|
Lexeme tail = this.pollLast();
|
||||||
@ -122,6 +117,7 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 检测词元位置交叉(有歧义的切分)
|
* 检测词元位置交叉(有歧义的切分)
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
boolean checkCross(Lexeme lexeme) {
|
boolean checkCross(Lexeme lexeme) {
|
||||||
return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
|
return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
|
||||||
@ -145,6 +141,7 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 获取LexemePath的路径长度
|
* 获取LexemePath的路径长度
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
private int getPathLength() {
|
private int getPathLength() {
|
||||||
return this.pathEnd - this.pathBegin;
|
return this.pathEnd - this.pathBegin;
|
||||||
@ -153,6 +150,7 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* X权重(词元长度积)
|
* X权重(词元长度积)
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
private int getXWeight() {
|
private int getXWeight() {
|
||||||
int product = 1;
|
int product = 1;
|
||||||
@ -193,48 +191,48 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public int compareTo(LexemePath o) {
|
public int compareTo(LexemePath o) {
|
||||||
// 比较有效文本长度
|
//比较有效文本长度
|
||||||
if (this.payloadLength > o.payloadLength) {
|
if (this.payloadLength > o.payloadLength) {
|
||||||
return -1;
|
return -1;
|
||||||
} else if (this.payloadLength < o.payloadLength) {
|
} else if (this.payloadLength < o.payloadLength) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
} else {
|
||||||
|
//比较词元个数,越少越好
|
||||||
// 比较词元个数,越少越好
|
|
||||||
if (this.size() < o.size()) {
|
if (this.size() < o.size()) {
|
||||||
return -1;
|
return -1;
|
||||||
} else if (this.size() > o.size()) {
|
} else if (this.size() > o.size()) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
} else {
|
||||||
|
//路径跨度越大越好
|
||||||
// 路径跨度越大越好
|
|
||||||
if (this.getPathLength() > o.getPathLength()) {
|
if (this.getPathLength() > o.getPathLength()) {
|
||||||
return -1;
|
return -1;
|
||||||
} else if (this.getPathLength() < o.getPathLength()) {
|
} else if (this.getPathLength() < o.getPathLength()) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
} else {
|
||||||
|
//根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先
|
||||||
// 根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先
|
|
||||||
if (this.pathEnd > o.pathEnd) {
|
if (this.pathEnd > o.pathEnd) {
|
||||||
return -1;
|
return -1;
|
||||||
} else if (pathEnd < o.pathEnd) {
|
} else if (pathEnd < o.pathEnd) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
} else {
|
||||||
|
//词长越平均越好
|
||||||
// 词长越平均越好
|
|
||||||
if (this.getXWeight() > o.getXWeight()) {
|
if (this.getXWeight() > o.getXWeight()) {
|
||||||
return -1;
|
return -1;
|
||||||
} else if (this.getXWeight() < o.getXWeight()) {
|
} else if (this.getXWeight() < o.getXWeight()) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
} else {
|
||||||
|
//词元位置权重比较
|
||||||
// 词元位置权重比较
|
|
||||||
if (this.getPWeight() > o.getPWeight()) {
|
if (this.getPWeight() > o.getPWeight()) {
|
||||||
return -1;
|
return -1;
|
||||||
} else if (this.getPWeight() < o.getPWeight()) {
|
} else if (this.getPWeight() < o.getPWeight()) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -28,20 +28,14 @@
|
|||||||
package org.wltea.analyzer.core;
|
package org.wltea.analyzer.core;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* IK分词器专用的Lexeme快速排序集合
|
* IK分词器专用的Lexem快速排序集合
|
||||||
*/
|
*/
|
||||||
class QuickSortSet {
|
class QuickSortSet {
|
||||||
/**
|
//链表头
|
||||||
* 链表头
|
|
||||||
*/
|
|
||||||
private Cell head;
|
private Cell head;
|
||||||
/**
|
//链表尾
|
||||||
* 链表尾
|
|
||||||
*/
|
|
||||||
private Cell tail;
|
private Cell tail;
|
||||||
/**
|
//链表的实际大小
|
||||||
* 链表的实际大小
|
|
||||||
*/
|
|
||||||
private int size;
|
private int size;
|
||||||
|
|
||||||
QuickSortSet() {
|
QuickSortSet() {
|
||||||
@ -59,29 +53,31 @@ class QuickSortSet {
|
|||||||
this.size++;
|
this.size++;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
if (this.tail.compareTo(newCell) < 0) {
|
/*if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同,不放入集合
|
||||||
// 词元接入链表尾部
|
|
||||||
|
}else */
|
||||||
|
if (this.tail.compareTo(newCell) < 0) {//词元接入链表尾部
|
||||||
this.tail.next = newCell;
|
this.tail.next = newCell;
|
||||||
newCell.prev = this.tail;
|
newCell.prev = this.tail;
|
||||||
this.tail = newCell;
|
this.tail = newCell;
|
||||||
this.size++;
|
this.size++;
|
||||||
|
|
||||||
} else if (this.head.compareTo(newCell) > 0) {
|
} else if (this.head.compareTo(newCell) > 0) {//词元接入链表头部
|
||||||
// 词元接入链表头部
|
|
||||||
this.head.prev = newCell;
|
this.head.prev = newCell;
|
||||||
newCell.next = this.head;
|
newCell.next = this.head;
|
||||||
this.head = newCell;
|
this.head = newCell;
|
||||||
this.size++;
|
this.size++;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// 从尾部上逆
|
//从尾部上逆
|
||||||
Cell index = this.tail;
|
Cell index = this.tail;
|
||||||
while (index != null && index.compareTo(newCell) > 0) {
|
while (index != null && index.compareTo(newCell) > 0) {
|
||||||
index = index.prev;
|
index = index.prev;
|
||||||
}
|
}
|
||||||
|
/*if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合
|
||||||
|
|
||||||
// 词元插入链表中的某个位置
|
}else */
|
||||||
if ((index != null ? index.compareTo(newCell) : 1) < 0) {
|
if ((index != null ? index.compareTo(newCell) : 1) < 0) {//词元插入链表中的某个位置
|
||||||
newCell.prev = index;
|
newCell.prev = index;
|
||||||
newCell.next = index.next;
|
newCell.next = index.next;
|
||||||
index.next.prev = newCell;
|
index.next.prev = newCell;
|
||||||
|
@ -37,38 +37,24 @@ import java.util.Map;
|
|||||||
@SuppressWarnings("unused")
|
@SuppressWarnings("unused")
|
||||||
class DictSegment implements Comparable<DictSegment> {
|
class DictSegment implements Comparable<DictSegment> {
|
||||||
|
|
||||||
/**
|
//公用字典表,存储汉字
|
||||||
* 公用字典表,存储汉字
|
|
||||||
*/
|
|
||||||
private static final Map<Character, Character> charMap = new HashMap<>(16, 0.95f);
|
private static final Map<Character, Character> charMap = new HashMap<>(16, 0.95f);
|
||||||
/**
|
//数组大小上限
|
||||||
* 数组大小上限
|
|
||||||
*/
|
|
||||||
private static final int ARRAY_LENGTH_LIMIT = 3;
|
private static final int ARRAY_LENGTH_LIMIT = 3;
|
||||||
|
|
||||||
|
|
||||||
/**
|
//Map存储结构
|
||||||
* Map存储结构
|
private Map<Character, DictSegment> childrenMap;
|
||||||
*/
|
//数组方式存储结构
|
||||||
private volatile Map<Character, DictSegment> childrenMap;
|
private DictSegment[] childrenArray;
|
||||||
/**
|
|
||||||
* 数组方式存储结构
|
|
||||||
*/
|
|
||||||
private volatile DictSegment[] childrenArray;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
//当前节点上存储的字符
|
||||||
* 当前节点上存储的字符
|
private Character nodeChar;
|
||||||
*/
|
//当前节点存储的Segment数目
|
||||||
private final Character nodeChar;
|
//storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
|
||||||
/**
|
|
||||||
* 当前节点存储的Segment数目
|
|
||||||
* storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
|
|
||||||
*/
|
|
||||||
private int storeSize = 0;
|
private int storeSize = 0;
|
||||||
/**
|
//当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
|
||||||
* 当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
|
|
||||||
*/
|
|
||||||
private int nodeState = 0;
|
private int nodeState = 0;
|
||||||
|
|
||||||
|
|
||||||
|
@ -27,14 +27,14 @@
|
|||||||
*/
|
*/
|
||||||
package org.wltea.analyzer.dic;
|
package org.wltea.analyzer.dic;
|
||||||
|
|
||||||
import org.wltea.analyzer.cfg.Configuration;
|
|
||||||
import org.wltea.analyzer.cfg.DefaultConfig;
|
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.wltea.analyzer.cfg.Configuration;
|
||||||
|
import org.wltea.analyzer.cfg.DefaultConfig;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 词典管理类,单例模式
|
* 词典管理类,单例模式
|
||||||
*/
|
*/
|
||||||
@ -44,7 +44,7 @@ public class Dictionary {
|
|||||||
/*
|
/*
|
||||||
* 词典单子实例
|
* 词典单子实例
|
||||||
*/
|
*/
|
||||||
private static volatile Dictionary singleton;
|
private static Dictionary singleton;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 主词典对象
|
* 主词典对象
|
||||||
@ -63,7 +63,7 @@ public class Dictionary {
|
|||||||
/**
|
/**
|
||||||
* 配置对象
|
* 配置对象
|
||||||
*/
|
*/
|
||||||
private final Configuration cfg;
|
private Configuration cfg;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 私有构造方法,阻止外部直接实例化本类
|
* 私有构造方法,阻止外部直接实例化本类
|
||||||
@ -326,7 +326,7 @@ public class Dictionary {
|
|||||||
// 建立一个量词典实例
|
// 建立一个量词典实例
|
||||||
_QuantifierDict = new DictSegment((char) 0);
|
_QuantifierDict = new DictSegment((char) 0);
|
||||||
// 读取量词词典文件
|
// 读取量词词典文件
|
||||||
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDictionary());
|
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary());
|
||||||
if (is == null) {
|
if (is == null) {
|
||||||
throw new RuntimeException("Quantifier Dictionary not found!!!");
|
throw new RuntimeException("Quantifier Dictionary not found!!!");
|
||||||
}
|
}
|
||||||
|
@ -32,33 +32,24 @@ package org.wltea.analyzer.dic;
|
|||||||
*/
|
*/
|
||||||
@SuppressWarnings("unused")
|
@SuppressWarnings("unused")
|
||||||
public class Hit {
|
public class Hit {
|
||||||
/**
|
//Hit不匹配
|
||||||
* Hit不匹配
|
|
||||||
*/
|
|
||||||
private static final int UNMATCH = 0x00000000;
|
private static final int UNMATCH = 0x00000000;
|
||||||
/**
|
//Hit完全匹配
|
||||||
* Hit完全匹配
|
|
||||||
*/
|
|
||||||
private static final int MATCH = 0x00000001;
|
private static final int MATCH = 0x00000001;
|
||||||
/**
|
//Hit前缀匹配
|
||||||
* Hit前缀匹配
|
|
||||||
*/
|
|
||||||
private static final int PREFIX = 0x00000010;
|
private static final int PREFIX = 0x00000010;
|
||||||
|
|
||||||
|
|
||||||
/**
|
//该HIT当前状态,默认未匹配
|
||||||
* 该HIT当前状态,默认未匹配
|
|
||||||
*/
|
|
||||||
private int hitState = UNMATCH;
|
private int hitState = UNMATCH;
|
||||||
/**
|
|
||||||
* 记录词典匹配过程中,当前匹配到的词典分支节点
|
//记录词典匹配过程中,当前匹配到的词典分支节点
|
||||||
*/
|
|
||||||
private DictSegment matchedDictSegment;
|
private DictSegment matchedDictSegment;
|
||||||
/**
|
/*
|
||||||
* 词段开始位置
|
* 词段开始位置
|
||||||
*/
|
*/
|
||||||
private int begin;
|
private int begin;
|
||||||
/**
|
/*
|
||||||
* 词段的结束位置
|
* 词段的结束位置
|
||||||
*/
|
*/
|
||||||
private int end;
|
private int end;
|
||||||
@ -95,7 +86,9 @@ public class Hit {
|
|||||||
public boolean isUnmatch() {
|
public boolean isUnmatch() {
|
||||||
return this.hitState == UNMATCH ;
|
return this.hitState == UNMATCH ;
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
*/
|
||||||
void setUnmatch() {
|
void setUnmatch() {
|
||||||
this.hitState = UNMATCH;
|
this.hitState = UNMATCH;
|
||||||
}
|
}
|
||||||
|
@ -36,7 +36,7 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||||||
@SuppressWarnings("unused")
|
@SuppressWarnings("unused")
|
||||||
public final class IKAnalyzer extends Analyzer {
|
public final class IKAnalyzer extends Analyzer {
|
||||||
|
|
||||||
private final boolean useSmart;
|
private boolean useSmart;
|
||||||
|
|
||||||
private boolean useSmart() {
|
private boolean useSmart() {
|
||||||
return useSmart;
|
return useSmart;
|
||||||
|
@ -39,30 +39,21 @@ import java.io.IOException;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* IK分词器 Lucene Tokenizer适配器类
|
* IK分词器 Lucene Tokenizer适配器类
|
||||||
|
* 兼容Lucene 4.0版本
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings({"unused", "FinalMethodInFinalClass"})
|
@SuppressWarnings("unused")
|
||||||
public final class IKTokenizer extends Tokenizer {
|
public final class IKTokenizer extends Tokenizer {
|
||||||
|
|
||||||
/**
|
//IK分词器实现
|
||||||
* IK分词器实现
|
|
||||||
*/
|
|
||||||
private IKSegmenter _IKImplement;
|
private IKSegmenter _IKImplement;
|
||||||
|
|
||||||
/**
|
//词元文本属性
|
||||||
* 词元文本属性
|
|
||||||
*/
|
|
||||||
private CharTermAttribute termAtt;
|
private CharTermAttribute termAtt;
|
||||||
/**
|
//词元位移属性
|
||||||
* 词元位移属性
|
|
||||||
*/
|
|
||||||
private OffsetAttribute offsetAtt;
|
private OffsetAttribute offsetAtt;
|
||||||
/**
|
//词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
|
||||||
* 词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
|
|
||||||
*/
|
|
||||||
private TypeAttribute typeAtt;
|
private TypeAttribute typeAtt;
|
||||||
/**
|
//记录最后一个词元的结束位置
|
||||||
* 记录最后一个词元的结束位置
|
|
||||||
*/
|
|
||||||
private int endPosition;
|
private int endPosition;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -93,31 +84,30 @@ public final class IKTokenizer extends Tokenizer {
|
|||||||
_IKImplement = new IKSegmenter(input, useSmart);
|
_IKImplement = new IKSegmenter(input, useSmart);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/* (non-Javadoc)
|
||||||
* (non-Javadoc)
|
|
||||||
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
|
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean incrementToken() throws IOException {
|
public boolean incrementToken() throws IOException {
|
||||||
// 清除所有的词元属性
|
//清除所有的词元属性
|
||||||
clearAttributes();
|
clearAttributes();
|
||||||
Lexeme nextLexeme = _IKImplement.next();
|
Lexeme nextLexeme = _IKImplement.next();
|
||||||
if (nextLexeme != null) {
|
if (nextLexeme != null) {
|
||||||
// 将Lexeme转成Attributes
|
//将Lexeme转成Attributes
|
||||||
// 设置词元文本
|
//设置词元文本
|
||||||
termAtt.append(nextLexeme.getLexemeText());
|
termAtt.append(nextLexeme.getLexemeText());
|
||||||
// 设置词元长度
|
//设置词元长度
|
||||||
termAtt.setLength(nextLexeme.getLength());
|
termAtt.setLength(nextLexeme.getLength());
|
||||||
// 设置词元位移
|
//设置词元位移
|
||||||
offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
|
offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
|
||||||
// 记录分词的最后位置
|
//记录分词的最后位置
|
||||||
endPosition = nextLexeme.getEndPosition();
|
endPosition = nextLexeme.getEndPosition();
|
||||||
// 记录词元分类
|
//记录词元分类
|
||||||
typeAtt.setType(nextLexeme.getLexemeTypeString());
|
typeAtt.setType(nextLexeme.getLexemeTypeString());
|
||||||
// 返会true告知还有下个词元
|
//返会true告知还有下个词元
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// 返会false告知词元输出完毕
|
//返会false告知词元输出完毕
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -21,8 +21,8 @@
|
|||||||
* 版权声明 2012,乌龙茶工作室
|
* 版权声明 2012,乌龙茶工作室
|
||||||
* provided by Linliangyi and copyright 2012 by Oolong studio
|
* provided by Linliangyi and copyright 2012 by Oolong studio
|
||||||
*
|
*
|
||||||
* 8.5.0版本 由 Magese (magese@live.cn) 更新
|
* 8.3.1版本 由 Magese (magese@live.cn) 更新
|
||||||
* release 8.5.0 update by Magese(magese@live.cn)
|
* release 8.3.1 update by Magese(magese@live.cn)
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
package org.wltea.analyzer.lucene;
|
package org.wltea.analyzer.lucene;
|
||||||
@ -44,8 +44,6 @@ import java.nio.charset.StandardCharsets;
|
|||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 分词器工厂类
|
|
||||||
*
|
|
||||||
* @author <a href="magese@live.cn">Magese</a>
|
* @author <a href="magese@live.cn">Magese</a>
|
||||||
*/
|
*/
|
||||||
public class IKTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware, UpdateThread.UpdateJob {
|
public class IKTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware, UpdateThread.UpdateJob {
|
||||||
|
@ -46,11 +46,11 @@ import java.util.Stack;
|
|||||||
public class IKQueryExpressionParser {
|
public class IKQueryExpressionParser {
|
||||||
|
|
||||||
|
|
||||||
private final List<Element> elements = new ArrayList<>();
|
private List<Element> elements = new ArrayList<>();
|
||||||
|
|
||||||
private final Stack<Query> querys = new Stack<>();
|
private Stack<Query> querys = new Stack<>();
|
||||||
|
|
||||||
private final Stack<Element> operates = new Stack<>();
|
private Stack<Element> operates = new Stack<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 解析查询表达式,生成Lucene Query对象
|
* 解析查询表达式,生成Lucene Query对象
|
||||||
@ -61,9 +61,9 @@ public class IKQueryExpressionParser {
|
|||||||
Query lucenceQuery = null;
|
Query lucenceQuery = null;
|
||||||
if (expression != null && !"".equals(expression.trim())) {
|
if (expression != null && !"".equals(expression.trim())) {
|
||||||
try {
|
try {
|
||||||
// 文法解析
|
//文法解析
|
||||||
this.splitElements(expression);
|
this.splitElements(expression);
|
||||||
// 语法解析
|
//语法解析
|
||||||
this.parseSyntax();
|
this.parseSyntax();
|
||||||
if (this.querys.size() == 1) {
|
if (this.querys.size() == 1) {
|
||||||
lucenceQuery = this.querys.pop();
|
lucenceQuery = this.querys.pop();
|
||||||
@ -87,263 +87,263 @@ public class IKQueryExpressionParser {
|
|||||||
if (expression == null) {
|
if (expression == null) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
Element currentElement = null;
|
Element curretElement = null;
|
||||||
|
|
||||||
char[] expChars = expression.toCharArray();
|
char[] expChars = expression.toCharArray();
|
||||||
for (char expChar : expChars) {
|
for (char expChar : expChars) {
|
||||||
switch (expChar) {
|
switch (expChar) {
|
||||||
case '&':
|
case '&':
|
||||||
if (currentElement == null) {
|
if (curretElement == null) {
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = '&';
|
curretElement.type = '&';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
} else if (currentElement.type == '&') {
|
} else if (curretElement.type == '&') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
} else if (currentElement.type == '\'') {
|
} else if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = '&';
|
curretElement.type = '&';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '|':
|
case '|':
|
||||||
if (currentElement == null) {
|
if (curretElement == null) {
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = '|';
|
curretElement.type = '|';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
} else if (currentElement.type == '|') {
|
} else if (curretElement.type == '|') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
} else if (currentElement.type == '\'') {
|
} else if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = '|';
|
curretElement.type = '|';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '-':
|
case '-':
|
||||||
if (currentElement != null) {
|
if (curretElement != null) {
|
||||||
if (currentElement.type == '\'') {
|
if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = '-';
|
curretElement.type = '-';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '(':
|
case '(':
|
||||||
if (currentElement != null) {
|
if (curretElement != null) {
|
||||||
if (currentElement.type == '\'') {
|
if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = '(';
|
curretElement.type = '(';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ')':
|
case ')':
|
||||||
if (currentElement != null) {
|
if (curretElement != null) {
|
||||||
if (currentElement.type == '\'') {
|
if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = ')';
|
curretElement.type = ')';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ':':
|
case ':':
|
||||||
if (currentElement != null) {
|
if (curretElement != null) {
|
||||||
if (currentElement.type == '\'') {
|
if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = ':';
|
curretElement.type = ':';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '=':
|
case '=':
|
||||||
if (currentElement != null) {
|
if (curretElement != null) {
|
||||||
if (currentElement.type == '\'') {
|
if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = '=';
|
curretElement.type = '=';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ' ':
|
case ' ':
|
||||||
if (currentElement != null) {
|
if (curretElement != null) {
|
||||||
if (currentElement.type == '\'') {
|
if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\'':
|
case '\'':
|
||||||
if (currentElement == null) {
|
if (curretElement == null) {
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = '\'';
|
curretElement.type = '\'';
|
||||||
|
|
||||||
} else if (currentElement.type == '\'') {
|
} else if (curretElement.type == '\'') {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = '\'';
|
curretElement.type = '\'';
|
||||||
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '[':
|
case '[':
|
||||||
if (currentElement != null) {
|
if (curretElement != null) {
|
||||||
if (currentElement.type == '\'') {
|
if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = '[';
|
curretElement.type = '[';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ']':
|
case ']':
|
||||||
if (currentElement != null) {
|
if (curretElement != null) {
|
||||||
if (currentElement.type == '\'') {
|
if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = ']';
|
curretElement.type = ']';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '{':
|
case '{':
|
||||||
if (currentElement != null) {
|
if (curretElement != null) {
|
||||||
if (currentElement.type == '\'') {
|
if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = '{';
|
curretElement.type = '{';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '}':
|
case '}':
|
||||||
if (currentElement != null) {
|
if (curretElement != null) {
|
||||||
if (currentElement.type == '\'') {
|
if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = '}';
|
curretElement.type = '}';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case ',':
|
case ',':
|
||||||
if (currentElement != null) {
|
if (curretElement != null) {
|
||||||
if (currentElement.type == '\'') {
|
if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = ',';
|
curretElement.type = ',';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = null;
|
curretElement = null;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
if (currentElement == null) {
|
if (curretElement == null) {
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = 'F';
|
curretElement.type = 'F';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
|
|
||||||
} else if (currentElement.type == 'F') {
|
} else if (curretElement.type == 'F') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
|
|
||||||
} else if (currentElement.type == '\'') {
|
} else if (curretElement.type == '\'') {
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
currentElement = new Element();
|
curretElement = new Element();
|
||||||
currentElement.type = 'F';
|
curretElement.type = 'F';
|
||||||
currentElement.append(expChar);
|
curretElement.append(expChar);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (currentElement != null) {
|
if (curretElement != null) {
|
||||||
this.elements.add(currentElement);
|
this.elements.add(curretElement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -359,7 +359,7 @@ public class IKQueryExpressionParser {
|
|||||||
throw new IllegalStateException("表达式异常: = 或 : 号丢失");
|
throw new IllegalStateException("表达式异常: = 或 : 号丢失");
|
||||||
}
|
}
|
||||||
Element e3 = this.elements.get(i + 2);
|
Element e3 = this.elements.get(i + 2);
|
||||||
// 处理 = 和 : 运算
|
//处理 = 和 : 运算
|
||||||
if ('\'' == e3.type) {
|
if ('\'' == e3.type) {
|
||||||
i += 2;
|
i += 2;
|
||||||
if ('=' == e2.type) {
|
if ('=' == e2.type) {
|
||||||
@ -367,14 +367,14 @@ public class IKQueryExpressionParser {
|
|||||||
this.querys.push(tQuery);
|
this.querys.push(tQuery);
|
||||||
} else {
|
} else {
|
||||||
String keyword = e3.toString();
|
String keyword = e3.toString();
|
||||||
// SWMCQuery Here
|
//SWMCQuery Here
|
||||||
Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword);
|
Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword);
|
||||||
this.querys.push(_SWMCQuery);
|
this.querys.push(_SWMCQuery);
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if ('[' == e3.type || '{' == e3.type) {
|
} else if ('[' == e3.type || '{' == e3.type) {
|
||||||
i += 2;
|
i += 2;
|
||||||
// 处理 [] 和 {}
|
//处理 [] 和 {}
|
||||||
LinkedList<Element> eQueue = new LinkedList<>();
|
LinkedList<Element> eQueue = new LinkedList<>();
|
||||||
eQueue.add(e3);
|
eQueue.add(e3);
|
||||||
for (i++; i < this.elements.size(); i++) {
|
for (i++; i < this.elements.size(); i++) {
|
||||||
@ -384,7 +384,7 @@ public class IKQueryExpressionParser {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 翻译RangeQuery
|
//翻译RangeQuery
|
||||||
Query rangeQuery = this.toTermRangeQuery(e, eQueue);
|
Query rangeQuery = this.toTermRangeQuery(e, eQueue);
|
||||||
this.querys.push(rangeQuery);
|
this.querys.push(rangeQuery);
|
||||||
} else {
|
} else {
|
||||||
@ -475,10 +475,10 @@ public class IKQueryExpressionParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// q1 instanceof TermQuery
|
//q1 instanceof TermQuery
|
||||||
// q1 instanceof TermRangeQuery
|
//q1 instanceof TermRangeQuery
|
||||||
// q1 instanceof PhraseQuery
|
//q1 instanceof PhraseQuery
|
||||||
// others
|
//others
|
||||||
resultQuery.add(q1, Occur.MUST);
|
resultQuery.add(q1, Occur.MUST);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -496,10 +496,10 @@ public class IKQueryExpressionParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// q1 instanceof TermQuery
|
//q1 instanceof TermQuery
|
||||||
// q1 instanceof TermRangeQuery
|
//q1 instanceof TermRangeQuery
|
||||||
// q1 instanceof PhraseQuery
|
//q1 instanceof PhraseQuery
|
||||||
// others
|
//others
|
||||||
resultQuery.add(q2, Occur.MUST);
|
resultQuery.add(q2, Occur.MUST);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -518,10 +518,10 @@ public class IKQueryExpressionParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// q1 instanceof TermQuery
|
//q1 instanceof TermQuery
|
||||||
// q1 instanceof TermRangeQuery
|
//q1 instanceof TermRangeQuery
|
||||||
// q1 instanceof PhraseQuery
|
//q1 instanceof PhraseQuery
|
||||||
// others
|
//others
|
||||||
resultQuery.add(q1, Occur.SHOULD);
|
resultQuery.add(q1, Occur.SHOULD);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -538,10 +538,10 @@ public class IKQueryExpressionParser {
|
|||||||
resultQuery.add(q2, Occur.SHOULD);
|
resultQuery.add(q2, Occur.SHOULD);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// q2 instanceof TermQuery
|
//q2 instanceof TermQuery
|
||||||
// q2 instanceof TermRangeQuery
|
//q2 instanceof TermRangeQuery
|
||||||
// q2 instanceof PhraseQuery
|
//q2 instanceof PhraseQuery
|
||||||
// others
|
//others
|
||||||
resultQuery.add(q2, Occur.SHOULD);
|
resultQuery.add(q2, Occur.SHOULD);
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -563,10 +563,10 @@ public class IKQueryExpressionParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// q1 instanceof TermQuery
|
//q1 instanceof TermQuery
|
||||||
// q1 instanceof TermRangeQuery
|
//q1 instanceof TermRangeQuery
|
||||||
// q1 instanceof PhraseQuery
|
//q1 instanceof PhraseQuery
|
||||||
// others
|
//others
|
||||||
resultQuery.add(q1, Occur.MUST);
|
resultQuery.add(q1, Occur.MUST);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -584,7 +584,7 @@ public class IKQueryExpressionParser {
|
|||||||
boolean includeLast;
|
boolean includeLast;
|
||||||
String firstValue;
|
String firstValue;
|
||||||
String lastValue = null;
|
String lastValue = null;
|
||||||
// 检查第一个元素是否是[或者{
|
//检查第一个元素是否是[或者{
|
||||||
Element first = elements.getFirst();
|
Element first = elements.getFirst();
|
||||||
if ('[' == first.type) {
|
if ('[' == first.type) {
|
||||||
includeFirst = true;
|
includeFirst = true;
|
||||||
@ -593,7 +593,7 @@ public class IKQueryExpressionParser {
|
|||||||
} else {
|
} else {
|
||||||
throw new IllegalStateException("表达式异常");
|
throw new IllegalStateException("表达式异常");
|
||||||
}
|
}
|
||||||
// 检查最后一个元素是否是]或者}
|
//检查最后一个元素是否是]或者}
|
||||||
Element last = elements.getLast();
|
Element last = elements.getLast();
|
||||||
if (']' == last.type) {
|
if (']' == last.type) {
|
||||||
includeLast = true;
|
includeLast = true;
|
||||||
@ -605,7 +605,7 @@ public class IKQueryExpressionParser {
|
|||||||
if (elements.size() < 4 || elements.size() > 5) {
|
if (elements.size() < 4 || elements.size() > 5) {
|
||||||
throw new IllegalStateException("表达式异常, RangeQuery 错误");
|
throw new IllegalStateException("表达式异常, RangeQuery 错误");
|
||||||
}
|
}
|
||||||
// 读出中间部分
|
//读出中间部分
|
||||||
Element e2 = elements.get(1);
|
Element e2 = elements.get(1);
|
||||||
if ('\'' == e2.type) {
|
if ('\'' == e2.type) {
|
||||||
firstValue = e2.toString();
|
firstValue = e2.toString();
|
||||||
@ -673,7 +673,7 @@ public class IKQueryExpressionParser {
|
|||||||
* @author linliangyi
|
* @author linliangyi
|
||||||
* May 20, 2010
|
* May 20, 2010
|
||||||
*/
|
*/
|
||||||
private static class Element {
|
private class Element {
|
||||||
char type = 0;
|
char type = 0;
|
||||||
StringBuffer eleTextBuff;
|
StringBuffer eleTextBuff;
|
||||||
|
|
||||||
@ -692,9 +692,11 @@ public class IKQueryExpressionParser {
|
|||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
IKQueryExpressionParser parser = new IKQueryExpressionParser();
|
IKQueryExpressionParser parser = new IKQueryExpressionParser();
|
||||||
|
//String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";
|
||||||
String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
|
String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
|
||||||
Query result = parser.parseExp(ikQueryExp);
|
Query result = parser.parseExp(ikQueryExp);
|
||||||
System.out.println(result);
|
System.out.println(result);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -45,7 +45,6 @@ import java.util.List;
|
|||||||
*
|
*
|
||||||
* @author linliangyi
|
* @author linliangyi
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("unused")
|
|
||||||
class SWMCQueryBuilder {
|
class SWMCQueryBuilder {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -57,9 +56,9 @@ class SWMCQueryBuilder {
|
|||||||
if (fieldName == null || keywords == null) {
|
if (fieldName == null || keywords == null) {
|
||||||
throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
|
throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
|
||||||
}
|
}
|
||||||
// 1.对keywords进行分词处理
|
//1.对keywords进行分词处理
|
||||||
List<Lexeme> lexemes = doAnalyze(keywords);
|
List<Lexeme> lexemes = doAnalyze(keywords);
|
||||||
// 2.根据分词结果,生成SWMCQuery
|
//2.根据分词结果,生成SWMCQuery
|
||||||
return getSWMCQuery(fieldName, lexemes);
|
return getSWMCQuery(fieldName, lexemes);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -85,20 +84,20 @@ class SWMCQueryBuilder {
|
|||||||
* 根据分词结果生成SWMC搜索
|
* 根据分词结果生成SWMC搜索
|
||||||
*/
|
*/
|
||||||
private static Query getSWMCQuery(String fieldName, List<Lexeme> lexemes) {
|
private static Query getSWMCQuery(String fieldName, List<Lexeme> lexemes) {
|
||||||
// 构造SWMC的查询表达式
|
//构造SWMC的查询表达式
|
||||||
StringBuilder keywordBuffer = new StringBuilder();
|
StringBuilder keywordBuffer = new StringBuilder();
|
||||||
// 精简的SWMC的查询表达式
|
//精简的SWMC的查询表达式
|
||||||
StringBuilder keywordBuffer_Short = new StringBuilder();
|
StringBuilder keywordBuffer_Short = new StringBuilder();
|
||||||
// 记录最后词元长度
|
//记录最后词元长度
|
||||||
int lastLexemeLength = 0;
|
int lastLexemeLength = 0;
|
||||||
// 记录最后词元结束位置
|
//记录最后词元结束位置
|
||||||
int lastLexemeEnd = -1;
|
int lastLexemeEnd = -1;
|
||||||
|
|
||||||
int shortCount = 0;
|
int shortCount = 0;
|
||||||
int totalCount = 0;
|
int totalCount = 0;
|
||||||
for (Lexeme l : lexemes) {
|
for (Lexeme l : lexemes) {
|
||||||
totalCount += l.getLength();
|
totalCount += l.getLength();
|
||||||
// 精简表达式
|
//精简表达式
|
||||||
if (l.getLength() > 1) {
|
if (l.getLength() > 1) {
|
||||||
keywordBuffer_Short.append(' ').append(l.getLexemeText());
|
keywordBuffer_Short.append(' ').append(l.getLexemeText());
|
||||||
shortCount += l.getLength();
|
shortCount += l.getLength();
|
||||||
@ -107,7 +106,7 @@ class SWMCQueryBuilder {
|
|||||||
if (lastLexemeLength == 0) {
|
if (lastLexemeLength == 0) {
|
||||||
keywordBuffer.append(l.getLexemeText());
|
keywordBuffer.append(l.getLexemeText());
|
||||||
} else if (lastLexemeLength == 1 && l.getLength() == 1
|
} else if (lastLexemeLength == 1 && l.getLength() == 1
|
||||||
&& lastLexemeEnd == l.getBeginPosition()) {// 单字位置相邻,长度为一,合并)
|
&& lastLexemeEnd == l.getBeginPosition()) {//单字位置相邻,长度为一,合并)
|
||||||
keywordBuffer.append(l.getLexemeText());
|
keywordBuffer.append(l.getLexemeText());
|
||||||
} else {
|
} else {
|
||||||
keywordBuffer.append(' ').append(l.getLexemeText());
|
keywordBuffer.append(' ').append(l.getLexemeText());
|
||||||
@ -117,10 +116,10 @@ class SWMCQueryBuilder {
|
|||||||
lastLexemeEnd = l.getEndPosition();
|
lastLexemeEnd = l.getEndPosition();
|
||||||
}
|
}
|
||||||
|
|
||||||
// 借助lucene queryparser 生成SWMC Query
|
//借助lucene queryparser 生成SWMC Query
|
||||||
QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
|
QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
|
||||||
qp.setAutoGeneratePhraseQueries(false);
|
|
||||||
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
||||||
|
qp.setAutoGeneratePhraseQueries(true);
|
||||||
|
|
||||||
if ((shortCount * 1.0f / totalCount) > 0.5f) {
|
if ((shortCount * 1.0f / totalCount) > 0.5f) {
|
||||||
try {
|
try {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user