Compare commits

..

24 Commits

Author SHA1 Message Date
Magese
f0059e2c78 Resolve QueryParser exception. 2022-01-04 10:36:49 +08:00
Magese
fb4defedb7 Format comments. 2022-01-04 09:59:25 +08:00
Magese
9ed56a0f41 Format comments. 2022-01-04 09:56:41 +08:00
Magese
70d40bc3af Using final. 2022-01-04 09:51:21 +08:00
Magese
ef2dbeb979 Format comments. 2022-01-04 09:48:28 +08:00
Magese
872aac8298 Using final and volatile. 2022-01-04 09:44:12 +08:00
Magese
052e8f476e Package sorting. 2022-01-04 09:36:59 +08:00
Magese
6cc121e98d Remove HitCount Badges. 2021-12-31 17:56:33 +08:00
Magese
32a9369680 注释格式化; 2021-12-31 17:51:35 +08:00
Magese
dd7822b6be 优化排序算法逻辑; 2021-12-31 17:48:21 +08:00
Magese
6ef4798752 代码及注释格式化; 2021-12-31 17:43:40 +08:00
Magese
56f23a9027 注释格式化; 2021-12-31 17:38:43 +08:00
Magese
5ab517079b 注释格式化; 2021-12-31 17:36:34 +08:00
Magese
020f83e665 构造初始化成员变量声明为 final; 2021-12-31 17:35:32 +08:00
Magese
ab50f161e6 注释格式化; 2021-12-31 17:33:07 +08:00
Magese
47439fa94b 注释格式化; 2021-12-31 17:31:28 +08:00
Magese
c938bf1f2b 成员变量申明为 final,逻辑判断优化; 2021-12-31 17:29:59 +08:00
Magese
92cb2a28d6 格式化代码、注释; 2021-12-31 17:27:42 +08:00
Magese
f173925dc0 优化词前置判断逻辑; 2021-12-31 17:17:47 +08:00
Magese
f9bc7a12fa 代码、注释格式化; 2021-12-31 17:14:07 +08:00
Magese
df29bdc4df 代码格式化; 2021-12-31 17:11:52 +08:00
Magese
3ec8076730 日志格式优化; 2021-12-31 17:10:19 +08:00
Magese
7149c54de7 单例对象使用 volatile 关键字; 2021-12-31 17:02:02 +08:00
Magese
fff131a45a 修复单词拼写错误; 2021-12-31 16:59:38 +08:00
22 changed files with 999 additions and 919 deletions

View File

@ -6,7 +6,6 @@ ik-analyzer for solr 7.x-8.x
[![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.svg?v=103)](https://github.com/magese/ik-analyzer-solr/releases)
[![Crates.io](https://img.shields.io/crates/l/rustc-serialize.svg)](./LICENSE)
[![Build Status](https://travis-ci.org/magese/ik-analyzer-solr.svg?branch=master)](https://travis-ci.org/magese/ik-analyzer-solr)
[![HitCount](http://hits.dwyl.io/magese/ik-analyzer-solr.svg)](http://hits.dwyl.io/magese/ik-analyzer-solr)
[![GitHub forks](https://img.shields.io/github/forks/magese/ik-analyzer-solr.svg?style=social&label=Fork)](https://github.com/magese/ik-analyzer-solr/network/members)
[![GitHub stars](https://img.shields.io/github/stars/magese/ik-analyzer-solr.svg?style=social&label=Star)](https://github.com/magese/ik-analyzer-solr/stargazers)

View File

@ -76,7 +76,7 @@ public interface Configuration {
*
* @return String 量词词典路径
*/
String getQuantifierDicionary();
String getQuantifierDictionary();
/**
* 获取扩展字典配置路径

View File

@ -145,7 +145,7 @@ public class DefaultConfig implements Configuration {
*
* @return String 量词词典路径
*/
public String getQuantifierDicionary() {
public String getQuantifierDictionary() {
return PATH_DIC_QUANTIFIER;
}

View File

@ -39,39 +39,39 @@ import java.util.*;
*/
class AnalyzeContext {
//默认缓冲区大小
// 默认缓冲区大小
private static final int BUFF_SIZE = 4096;
//缓冲区耗尽的临界值
// 缓冲区耗尽的临界值
private static final int BUFF_EXHAUST_CRITICAL = 100;
//字符窜读取缓冲
// 字符串读取缓冲
private char[] segmentBuff;
//字符类型数组
// 字符类型数组
private int[] charTypes;
//记录Reader内已分析的字串总长度
//在分多段分析词元时该变量累计当前的segmentBuff相对于reader起始位置的位移
// 记录Reader内已分析的字串总长度
// 在分多段分析词元时该变量累计当前的segmentBuff相对于reader起始位置的位移
private int buffOffset;
//当前缓冲区位置指针
// 当前缓冲区位置指针
private int cursor;
//最近一次读入的,可处理的字串长度
// 最近一次读入的,可处理的字串长度
private int available;
//子分词器锁
//该集合非空说明有子分词器在占用segmentBuff
// 子分词器锁
// 该集合非空说明有子分词器在占用segmentBuff
private final Set<String> buffLocker;
//原始分词结果集合未经歧义处理
// 原始分词结果集合未经歧义处理
private QuickSortSet orgLexemes;
//LexemePath位置索引表
// LexemePath位置索引表
private final Map<Integer, LexemePath> pathMap;
//最终分词结果集
// 最终分词结果集
private final LinkedList<Lexeme> results;
//分词器配置项
// 分词器配置项
private final Configuration cfg;
AnalyzeContext(Configuration cfg) {
@ -113,21 +113,21 @@ class AnalyzeContext {
int fillBuffer(Reader reader) throws IOException {
int readCount = 0;
if (this.buffOffset == 0) {
//首次读取reader
// 首次读取reader
readCount = reader.read(segmentBuff);
} else {
int offset = this.available - this.cursor;
if (offset > 0) {
//最近一次读取的>最近一次处理的将未处理的字串拷贝到segmentBuff头部
// 最近一次读取的>最近一次处理的将未处理的字串拷贝到segmentBuff头部
System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset);
readCount = offset;
}
//继续读取reader 以onceReadIn - onceAnalyzed为起始位置继续填充segmentBuff剩余的部分
// 继续读取reader 以onceReadIn - onceAnalyzed为起始位置继续填充segmentBuff剩余的部分
readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset);
}
//记录最后一次从Reader中读入的可用字符长度
// 记录最后一次从Reader中读入的可用字符长度
this.available = readCount;
//重置当前指针
// 重置当前指针
this.cursor = 0;
return readCount;
}
@ -251,35 +251,35 @@ class AnalyzeContext {
void outputToResult() {
int index = 0;
while (index <= this.cursor) {
//跳过非CJK字符
// 跳过非CJK字符
if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
index++;
continue;
}
//从pathMap找出对应index位置的LexemePath
// 从pathMap找出对应index位置的LexemePath
LexemePath path = this.pathMap.get(index);
if (path != null) {
//输出LexemePath中的lexeme到results集合
// 输出LexemePath中的lexeme到results集合
Lexeme l = path.pollFirst();
while (l != null) {
this.results.add(l);
//将index移至lexeme后
// 将index移至lexeme后
index = l.getBegin() + l.getLength();
l = path.pollFirst();
if (l != null) {
//输出path内部词元间遗漏的单字
// 输出path内部词元间遗漏的单字
for (; index < l.getBegin(); index++) {
this.outputSingleCJK(index);
}
}
}
} else {//pathMap中找不到index对应的LexemePath
//单字输出
} else {// pathMap中找不到index对应的LexemePath
// 单字输出
this.outputSingleCJK(index);
index++;
}
}
//清空当前的Map
// 清空当前的Map
this.pathMap.clear();
}
@ -304,16 +304,16 @@ class AnalyzeContext {
* 同时处理合并
*/
Lexeme getNextLexeme() {
//从结果集取出并移除第一个Lexme
// 从结果集取出并移除第一个Lexeme
Lexeme result = this.results.pollFirst();
while (result != null) {
//数量词合并
// 数量词合并
this.compound(result);
if (Dictionary.getSingleton().isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) {
//是停止词继续取列表的下一个
// 是停止词继续取列表的下一个
result = this.results.pollFirst();
} else {
//不是停止词, 生成lexeme的词元文本,输出
// 不是停止词, 生成lexeme的词元文本,输出
result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength()));
break;
}
@ -343,7 +343,7 @@ class AnalyzeContext {
if (!this.cfg.useSmart()) {
return;
}
//数量词合并处理
// 数量词合并处理
if (!this.results.isEmpty()) {
if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
@ -351,29 +351,29 @@ class AnalyzeContext {
boolean appendOk = false;
if (nextLexeme != null) {
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
//合并英文数词+中文数词
// 合并英文数词+中文数词
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
//合并英文数词+中文量词
// 合并英文数词+中文量词
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
}
}
if (appendOk) {
//弹出
// 弹出
this.results.pollFirst();
}
}
//可能存在第二轮合并
// 可能存在第二轮合并
if (Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()) {
Lexeme nextLexeme = this.results.peekFirst();
boolean appendOk = false;
if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
//合并中文数词+中文量词
// 合并中文数词+中文量词
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
}
if (appendOk) {
//弹出
// 弹出
this.results.pollFirst();
}
}

View File

@ -27,102 +27,101 @@
*/
package org.wltea.analyzer.core;
import java.util.LinkedList;
import java.util.List;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
import java.util.LinkedList;
import java.util.List;
/**
* 中文-日韩文子分词器
* 中文-日韩文子分词器
*/
class CJKSegmenter implements ISegmenter {
//子分词器标签
private static final String SEGMENTER_NAME = "CJK_SEGMENTER";
//待处理的分词hit队列
private List<Hit> tmpHits;
// 子分词器标签
private static final String SEGMENTER_NAME = "CJK_SEGMENTER";
// 待处理的分词hit队列
private final List<Hit> tmpHits;
CJKSegmenter(){
this.tmpHits = new LinkedList<>();
}
CJKSegmenter() {
this.tmpHits = new LinkedList<>();
}
/* (non-Javadoc)
* @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
*/
public void analyze(AnalyzeContext context) {
if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){
/* (non-Javadoc)
* @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
*/
public void analyze(AnalyzeContext context) {
if (CharacterUtil.CHAR_USELESS != context.getCurrentCharType()) {
//优先处理tmpHits中的hit
if(!this.tmpHits.isEmpty()){
//处理词段队列
Hit[] tmpArray = this.tmpHits.toArray(new Hit[0]);
for(Hit hit : tmpArray){
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if(hit.isMatch()){
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
context.addLexeme(newLexeme);
// 优先处理tmpHits中的hit
if (!this.tmpHits.isEmpty()) {
// 处理词段队列
Hit[] tmpArray = this.tmpHits.toArray(new Hit[0]);
for (Hit hit : tmpArray) {
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
if (hit.isMatch()) {
// 输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(), context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_CNWORD);
context.addLexeme(newLexeme);
if(!hit.isPrefix()){//不是词前缀hit不需要继续匹配移除
this.tmpHits.remove(hit);
}
if (!hit.isPrefix()) {// 不是词前缀hit不需要继续匹配移除
this.tmpHits.remove(hit);
}
}else if(hit.isUnmatch()){
//hit不是词移除
this.tmpHits.remove(hit);
}
}
}
} else if (hit.isUnmatch()) {
// hit不是词移除
this.tmpHits.remove(hit);
}
}
}
//*********************************
//再对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//首字成词
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
context.addLexeme(newLexeme);
// *********************************
// 再对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
//同时也是词前缀
if(singleCharHit.isPrefix()){
//前缀匹配则放入hit列表
this.tmpHits.add(singleCharHit);
}
}else if(singleCharHit.isPrefix()){//首字为词前缀
//前缀匹配则放入hit列表
this.tmpHits.add(singleCharHit);
}
// 首字成词
if (singleCharHit.isMatch()) {
// 输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_CNWORD);
context.addLexeme(newLexeme);
}
// 前缀匹配则放入hit列表
if (singleCharHit.isPrefix()) {
// 前缀匹配则放入hit列表
this.tmpHits.add(singleCharHit);
}
}else{
//遇到CHAR_USELESS字符
//清空队列
this.tmpHits.clear();
}
} else {
// 遇到CHAR_USELESS字符
// 清空队列
this.tmpHits.clear();
}
//判断缓冲区是否已经读完
if(context.isBufferConsumed()){
//清空队列
this.tmpHits.clear();
}
// 判断缓冲区是否已经读完
if (context.isBufferConsumed()) {
// 清空队列
this.tmpHits.clear();
}
//判断是否锁定缓冲区
if(this.tmpHits.size() == 0){
context.unlockBuffer(SEGMENTER_NAME);
// 判断是否锁定缓冲区
if (this.tmpHits.size() == 0) {
context.unlockBuffer(SEGMENTER_NAME);
}else{
context.lockBuffer(SEGMENTER_NAME);
}
}
} else {
context.lockBuffer(SEGMENTER_NAME);
}
}
/* (non-Javadoc)
* @see org.wltea.analyzer.core.ISegmenter#reset()
*/
public void reset() {
//清空队列
this.tmpHits.clear();
}
/* (non-Javadoc)
* @see org.wltea.analyzer.core.ISegmenter#reset()
*/
public void reset() {
// 清空队列
this.tmpHits.clear();
}
}

View File

@ -36,206 +36,205 @@ import java.util.List;
import java.util.Set;
/**
*
* 中文数量词子分词器
*/
class CN_QuantifierSegmenter implements ISegmenter{
class CN_QuantifierSegmenter implements ISegmenter {
//子分词器标签
private static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
// 子分词器标签
private static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
private static Set<Character> ChnNumberChars = new HashSet<>();
static{
//中文数词
//Cnum
String chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
char[] ca = chn_Num.toCharArray();
for(char nChar : ca){
ChnNumberChars.add(nChar);
}
}
private static final Set<Character> CHN_NUMBER_CHARS = new HashSet<>();
/*
* 词元的开始位置
* 同时作为子分词器状态标识
* 当start > -1 标识当前的分词器正在处理字符
*/
private int nStart;
/*
* 记录词元结束位置
* end记录的是在词元中最后一个出现的合理的数词结束
*/
private int nEnd;
static {
// 中文数词
String chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
char[] ca = chn_Num.toCharArray();
for (char nChar : ca) {
CHN_NUMBER_CHARS.add(nChar);
}
}
//待处理的量词hit队列
private List<Hit> countHits;
/*
* 词元的开始位置
* 同时作为子分词器状态标识
* 当start > -1 标识当前的分词器正在处理字符
*/
private int nStart;
/*
* 记录词元结束位置
* end记录的是在词元中最后一个出现的合理的数词结束
*/
private int nEnd;
// 待处理的量词hit队列
private final List<Hit> countHits;
CN_QuantifierSegmenter(){
nStart = -1;
nEnd = -1;
this.countHits = new LinkedList<>();
}
CN_QuantifierSegmenter() {
nStart = -1;
nEnd = -1;
this.countHits = new LinkedList<>();
}
/**
* 分词
*/
public void analyze(AnalyzeContext context) {
//处理中文数词
this.processCNumber(context);
//处理中文量词
this.processCount(context);
/**
* 分词
*/
public void analyze(AnalyzeContext context) {
// 处理中文数词
this.processCNumber(context);
// 处理中文量词
this.processCount(context);
//判断是否锁定缓冲区
if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){
//对缓冲区解锁
context.unlockBuffer(SEGMENTER_NAME);
}else{
context.lockBuffer(SEGMENTER_NAME);
}
}
// 判断是否锁定缓冲区
if (this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()) {
// 对缓冲区解锁
context.unlockBuffer(SEGMENTER_NAME);
} else {
context.lockBuffer(SEGMENTER_NAME);
}
}
/**
* 重置子分词器状态
*/
public void reset() {
nStart = -1;
nEnd = -1;
countHits.clear();
}
/**
* 重置子分词器状态
*/
public void reset() {
nStart = -1;
nEnd = -1;
countHits.clear();
}
/**
* 处理数词
*/
private void processCNumber(AnalyzeContext context){
if(nStart == -1 && nEnd == -1){//初始状态
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& ChnNumberChars.contains(context.getCurrentChar())){
//记录数词的起始结束位置
nStart = context.getCursor();
nEnd = context.getCursor();
}
}else{//正在处理状态
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& ChnNumberChars.contains(context.getCurrentChar())){
//记录数词的结束位置
nEnd = context.getCursor();
}else{
//输出数词
this.outputNumLexeme(context);
//重置头尾指针
nStart = -1;
nEnd = -1;
}
}
/**
* 处理数词
*/
private void processCNumber(AnalyzeContext context) {
if (nStart == -1 && nEnd == -1) {// 初始状态
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& CHN_NUMBER_CHARS.contains(context.getCurrentChar())) {
// 记录数词的起始结束位置
nStart = context.getCursor();
nEnd = context.getCursor();
}
} else {// 正在处理状态
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& CHN_NUMBER_CHARS.contains(context.getCurrentChar())) {
// 记录数词的结束位置
nEnd = context.getCursor();
} else {
// 输出数词
this.outputNumLexeme(context);
// 重置头尾指针
nStart = -1;
nEnd = -1;
}
}
//缓冲区已经用完还有尚未输出的数词
if(context.isBufferConsumed()){
if(nStart != -1 && nEnd != -1){
//输出数词
outputNumLexeme(context);
//重置头尾指针
nStart = -1;
nEnd = -1;
}
}
}
// 缓冲区已经用完还有尚未输出的数词
if (context.isBufferConsumed()) {
if (nStart != -1 && nEnd != -1) {
// 输出数词
outputNumLexeme(context);
// 重置头尾指针
nStart = -1;
nEnd = -1;
}
}
}
/**
* 处理中文量词
* @param context 需要处理的内容
*/
private void processCount(AnalyzeContext context){
// 判断是否需要启动量词扫描
if(!this.needCountScan(context)){
return;
}
/**
* 处理中文量词
*
* @param context 需要处理的内容
*/
private void processCount(AnalyzeContext context) {
// 判断是否需要启动量词扫描
if (!this.needCountScan(context)) {
return;
}
if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) {
//优先处理countHits中的hit
if(!this.countHits.isEmpty()){
//处理词段队列
Hit[] tmpArray = this.countHits.toArray(new Hit[0]);
for(Hit hit : tmpArray){
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if(hit.isMatch()){
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
context.addLexeme(newLexeme);
// 优先处理countHits中的hit
if (!this.countHits.isEmpty()) {
// 处理词段队列
Hit[] tmpArray = this.countHits.toArray(new Hit[0]);
for (Hit hit : tmpArray) {
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
if (hit.isMatch()) {
// 输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(), context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_COUNT);
context.addLexeme(newLexeme);
if(!hit.isPrefix()){//不是词前缀hit不需要继续匹配移除
this.countHits.remove(hit);
}
if (!hit.isPrefix()) {// 不是词前缀hit不需要继续匹配移除
this.countHits.remove(hit);
}
}else if(hit.isUnmatch()){
//hit不是词移除
this.countHits.remove(hit);
}
}
}
} else if (hit.isUnmatch()) {
// hit不是词移除
this.countHits.remove(hit);
}
}
}
//*********************************
//对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//首字成量词词
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
context.addLexeme(newLexeme);
// *********************************
// 对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
//同时也是词前缀
if(singleCharHit.isPrefix()){
//前缀匹配则放入hit列表
this.countHits.add(singleCharHit);
}
}else if(singleCharHit.isPrefix()){//首字为量词前缀
//前缀匹配则放入hit列表
this.countHits.add(singleCharHit);
}
// 首字成量词
if (singleCharHit.isMatch()) {
// 输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_COUNT);
context.addLexeme(newLexeme);
}
// 前缀匹配则放入hit列表
if (singleCharHit.isPrefix()) {
// 前缀匹配则放入hit列表
this.countHits.add(singleCharHit);
}
}else{
//输入的不是中文字符
//清空未成形的量词
this.countHits.clear();
}
} else {
// 输入的不是中文字符
// 清空未成形的量词
this.countHits.clear();
}
//缓冲区数据已经读完还有尚未输出的量词
if(context.isBufferConsumed()){
//清空未成形的量词
this.countHits.clear();
}
}
// 缓冲区数据已经读完还有尚未输出的量词
if (context.isBufferConsumed()) {
// 清空未成形的量词
this.countHits.clear();
}
}
/**
* 判断是否需要扫描量词
*/
private boolean needCountScan(AnalyzeContext context){
if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){
//正在处理中文数词,或者正在处理量词
return true;
}else{
//找到一个相邻的数词
if(!context.getOrgLexemes().isEmpty()){
Lexeme l = context.getOrgLexemes().peekLast();
if(Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()){
return l.getBegin() + l.getLength() == context.getCursor();
}
}
}
return false;
}
/**
* 判断是否需要扫描量词
*/
private boolean needCountScan(AnalyzeContext context) {
if ((nStart != -1 && nEnd != -1) || !countHits.isEmpty()) {
// 正在处理中文数词,或者正在处理量词
return true;
} else {
// 找到一个相邻的数词
if (!context.getOrgLexemes().isEmpty()) {
Lexeme l = context.getOrgLexemes().peekLast();
if (Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()) {
return l.getBegin() + l.getLength() == context.getCursor();
}
}
}
return false;
}
/**
* 添加数词词元到结果集
* @param context 需要添加的词元
*/
private void outputNumLexeme(AnalyzeContext context){
if(nStart > -1 && nEnd > -1){
//输出数词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM);
context.addLexeme(newLexeme);
}
}
/**
* 添加数词词元到结果集
*
* @param context 需要添加的词元
*/
private void outputNumLexeme(AnalyzeContext context) {
if (nStart > -1 && nEnd > -1) {
// 输出数词
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1, Lexeme.TYPE_CNUM);
context.addLexeme(newLexeme);
}
}
}

View File

@ -28,77 +28,78 @@
package org.wltea.analyzer.core;
/**
*
* 字符集识别工具类
*/
class CharacterUtil {
static final int CHAR_USELESS = 0;
static final int CHAR_USELESS = 0;
static final int CHAR_ARABIC = 0X00000001;
static final int CHAR_ARABIC = 0X00000001;
static final int CHAR_ENGLISH = 0X00000002;
static final int CHAR_ENGLISH = 0X00000002;
static final int CHAR_CHINESE = 0X00000004;
static final int CHAR_CHINESE = 0X00000004;
static final int CHAR_OTHER_CJK = 0X00000008;
static final int CHAR_OTHER_CJK = 0X00000008;
/**
* 识别字符类型
* @param input 需要识别的字符
* @return int CharacterUtil定义的字符类型常量
*/
static int identifyCharType(char input){
if(input >= '0' && input <= '9'){
return CHAR_ARABIC;
/**
* 识别字符类型
*
* @param input 需要识别的字符
* @return int CharacterUtil定义的字符类型常量
*/
static int identifyCharType(char input) {
if (input >= '0' && input <= '9') {
return CHAR_ARABIC;
}else if((input >= 'a' && input <= 'z')
|| (input >= 'A' && input <= 'Z')){
return CHAR_ENGLISH;
} else if ((input >= 'a' && input <= 'z')
|| (input >= 'A' && input <= 'Z')) {
return CHAR_ENGLISH;
}else {
Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
} else {
Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){
//目前已知的中文字符UTF-8集合
return CHAR_CHINESE;
if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
//目前已知的中文字符Unicode区块
return CHAR_CHINESE;
}else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符
//韩文字符集
|| ub == Character.UnicodeBlock.HANGUL_SYLLABLES
|| ub == Character.UnicodeBlock.HANGUL_JAMO
|| ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
//日文字符集
|| ub == Character.UnicodeBlock.HIRAGANA //平假名
|| ub == Character.UnicodeBlock.KATAKANA //片假名
|| ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){
return CHAR_OTHER_CJK;
} else if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符
//韩文字符集
|| ub == Character.UnicodeBlock.HANGUL_SYLLABLES
|| ub == Character.UnicodeBlock.HANGUL_JAMO
|| ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
//日文字符集
|| ub == Character.UnicodeBlock.HIRAGANA //平假名
|| ub == Character.UnicodeBlock.KATAKANA //片假名
|| ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
return CHAR_OTHER_CJK;
}
}
//其他的不做处理的字符
return CHAR_USELESS;
}
}
}
//其他的不做处理的字符
return CHAR_USELESS;
}
/**
* 进行字符规格化全角转半角大写转小写处理
* @param input 需要转换的字符
* @return char
*/
static char regularize(char input){
/**
* 进行字符规格化全角转半角大写转小写处理
*
* @param input 需要转换的字符
* @return char
*/
static char regularize(char input) {
if (input == 12288) {
input = (char) 32;
}else if (input > 65280 && input < 65375) {
} else if (input > 65280 && input < 65375) {
input = (char) (input - 65248);
}else if (input >= 'A' && input <= 'Z') {
input += 32;
}
} else if (input >= 'A' && input <= 'Z') {
input += 32;
}
return input;
}
}
}

View File

@ -35,9 +35,7 @@ import java.util.TreeSet;
*/
class IKArbitrator {
IKArbitrator() {
}
IKArbitrator() {}
/**
* 分词歧义处理
@ -52,20 +50,20 @@ class IKArbitrator {
LexemePath crossPath = new LexemePath();
while (orgLexeme != null) {
if (!crossPath.addCrossLexeme(orgLexeme)) {
//找到与crossPath不相交的下一个crossPath
// 找到与crossPath不相交的下一个crossPath
if (crossPath.size() == 1 || !useSmart) {
//crossPath没有歧义 或者 不做歧义处理
//直接输出当前crossPath
// crossPath没有歧义 或者 不做歧义处理
// 直接输出当前crossPath
context.addLexemePath(crossPath);
} else {
//对当前的crossPath进行歧义处理
// 对当前的crossPath进行歧义处理
QuickSortSet.Cell headCell = crossPath.getHead();
LexemePath judgeResult = this.judge(headCell);
//输出歧义处理结果judgeResult
// 输出歧义处理结果judgeResult
context.addLexemePath(judgeResult);
}
//把orgLexeme加入新的crossPath中
// 把orgLexeme加入新的crossPath中
crossPath = new LexemePath();
crossPath.addCrossLexeme(orgLexeme);
}
@ -73,16 +71,16 @@ class IKArbitrator {
}
//处理最后的path
// 处理最后的path
if (crossPath.size() == 1 || !useSmart) {
//crossPath没有歧义 或者 不做歧义处理
//直接输出当前crossPath
// crossPath没有歧义 或者 不做歧义处理
// 直接输出当前crossPath
context.addLexemePath(crossPath);
} else {
//对当前的crossPath进行歧义处理
// 对当前的crossPath进行歧义处理
QuickSortSet.Cell headCell = crossPath.getHead();
LexemePath judgeResult = this.judge(headCell);
//输出歧义处理结果judgeResult
// 输出歧义处理结果judgeResult
context.addLexemePath(judgeResult);
}
}
@ -93,29 +91,29 @@ class IKArbitrator {
* @param lexemeCell 歧义路径链表头
*/
private LexemePath judge(QuickSortSet.Cell lexemeCell) {
//候选路径集合
// 候选路径集合
TreeSet<LexemePath> pathOptions = new TreeSet<>();
//候选结果路径
// 候选结果路径
LexemePath option = new LexemePath();
//对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈
// 对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈
Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell, option);
//当前词元链并非最理想的加入候选路径集合
// 当前词元链并非最理想的加入候选路径集合
pathOptions.add(option.copy());
//存在歧义词处理
// 存在歧义词处理
QuickSortSet.Cell c;
while (!lexemeStack.isEmpty()) {
c = lexemeStack.pop();
//回滚词元链
// 回滚词元链
this.backPath(c.getLexeme(), option);
//从歧义词位置开始递归生成可选方案
// 从歧义词位置开始递归生成可选方案
this.forwardPath(c, option);
pathOptions.add(option.copy());
}
//返回集合中的最优方案
// 返回集合中的最优方案
return pathOptions.first();
}
@ -124,13 +122,13 @@ class IKArbitrator {
* 向前遍历添加词元构造一个无歧义词元组合
*/
private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell, LexemePath option) {
//发生冲突的Lexeme栈
// 发生冲突的Lexeme栈
Stack<QuickSortSet.Cell> conflictStack = new Stack<>();
QuickSortSet.Cell c = lexemeCell;
//迭代遍历Lexeme链表
// 迭代遍历Lexeme链表
while (c != null && c.getLexeme() != null) {
if (!option.addNotCrossLexeme(c.getLexeme())) {
//词元交叉添加失败则加入lexemeStack栈
// 词元交叉添加失败则加入lexemeStack栈
conflictStack.push(c);
}
c = c.getNext();

View File

@ -41,15 +41,25 @@ import java.util.List;
*/
public final class IKSegmenter {
//字符窜reader
/**
* 字符串reader
*/
private Reader input;
//分词器配置项
private Configuration cfg;
//分词器上下文
/**
* 分词器配置项
*/
private final Configuration cfg;
/**
* 分词器上下文
*/
private AnalyzeContext context;
//分词处理器列表
/**
* 分词处理器列表
*/
private List<ISegmenter> segmenters;
//分词歧义裁决器
/**
* 分词歧义裁决器
*/
private IKArbitrator arbitrator;
@ -85,13 +95,13 @@ public final class IKSegmenter {
* 初始化
*/
private void init() {
//初始化词典单例
// 初始化词典单例
Dictionary.initial(this.cfg);
//初始化分词上下文
// 初始化分词上下文
this.context = new AnalyzeContext(this.cfg);
//加载子分词器
// 加载子分词器
this.segmenters = this.loadSegmenters();
//加载歧义裁决器
// 加载歧义裁决器
this.arbitrator = new IKArbitrator();
}
@ -102,11 +112,11 @@ public final class IKSegmenter {
*/
private List<ISegmenter> loadSegmenters() {
List<ISegmenter> segmenters = new ArrayList<>(4);
//处理字母的子分词器
// 处理字母的子分词器
segmenters.add(new LetterSegmenter());
//处理中文数量词的子分词器
// 处理中文数量词的子分词器
segmenters.add(new CN_QuantifierSegmenter());
//处理中文词的子分词器
// 处理中文词的子分词器
segmenters.add(new CJKSegmenter());
return segmenters;
}
@ -126,34 +136,34 @@ public final class IKSegmenter {
*/
int available = context.fillBuffer(this.input);
if (available <= 0) {
//reader已经读完
// reader已经读完
context.reset();
return null;
} else {
//初始化指针
// 初始化指针
context.initCursor();
do {
//遍历子分词器
// 遍历子分词器
for (ISegmenter segmenter : segmenters) {
segmenter.analyze(context);
}
//字符缓冲区接近读完需要读入新的字符
// 字符缓冲区接近读完需要读入新的字符
if (context.needRefillBuffer()) {
break;
}
//向前移动指针
// 向前移动指针
} while (context.moveCursor());
//重置子分词器为下轮循环进行初始化
// 重置子分词器为下轮循环进行初始化
for (ISegmenter segmenter : segmenters) {
segmenter.reset();
}
}
//对分词进行歧义处理
// 对分词进行歧义处理
this.arbitrator.process(context, this.cfg.useSmart());
//将分词结果输出到结果集并处理未切分的单个CJK字符
// 将分词结果输出到结果集并处理未切分的单个CJK字符
context.outputToResult();
//记录本次分词的缓冲区位移
// 记录本次分词的缓冲区位移
context.markBufferOffset();
}
return l;

View File

@ -29,21 +29,21 @@ package org.wltea.analyzer.core;
/**
*
* 子分词器接口
*/
interface ISegmenter {
/**
* 从分析器读取下一个可能分解的词元对象
* @param context 分词算法上下文
*/
void analyze(AnalyzeContext context);
/**
* 从分析器读取下一个可能分解的词元对象
*
* @param context 分词算法上下文
*/
void analyze(AnalyzeContext context);
/**
* 重置子分析器状态
*/
void reset();
/**
* 重置子分析器状态
*/
void reset();
}

View File

@ -34,14 +34,18 @@ import java.util.Arrays;
*/
class LetterSegmenter implements ISegmenter {
//子分词器标签
/**
* 子分词器标签
*/
private static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
//链接符号
/**
* 链接符号
*/
private static final char[] Letter_Connector = new char[]{'#', '&', '+', '-', '.', '@', '_'};
//数字符号
/**
* 数字符号
*/
private static final char[] Num_Connector = new char[]{',', '.'};
/*
* 词元的开始位置
* 同时作为子分词器状态标识
@ -53,22 +57,18 @@ class LetterSegmenter implements ISegmenter {
* end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
*/
private int end;
/*
* 字母起始位置
*/
private int englishStart;
/*
* 字母结束位置
*/
private int englishEnd;
/*
* 阿拉伯数字起始位置
*/
private int arabicStart;
/*
* 阿拉伯数字结束位置
*/
@ -91,18 +91,18 @@ class LetterSegmenter implements ISegmenter {
*/
public void analyze(AnalyzeContext context) {
boolean bufferLockFlag;
//处理英文字母
// 处理英文字母
bufferLockFlag = this.processEnglishLetter(context);
//处理阿拉伯字母
// 处理阿拉伯数字
bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
//处理混合字母(这个要放最后处理可以通过QuickSortSet排除重复)
// 处理混合字母(这个要放最后处理可以通过QuickSortSet排除重复)
bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
//判断是否锁定缓冲区
// 判断是否锁定缓冲区
if (bufferLockFlag) {
context.lockBuffer(SEGMENTER_NAME);
} else {
//对缓冲区解锁
// 对缓冲区解锁
context.unlockBuffer(SEGMENTER_NAME);
}
}
@ -128,26 +128,26 @@ class LetterSegmenter implements ISegmenter {
private boolean processMixLetter(AnalyzeContext context) {
boolean needLock;
if (this.start == -1) {//当前的分词器尚未开始处理字符
if (this.start == -1) {// 当前的分词器尚未开始处理字符
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
//记录起始指针的位置,标明分词器进入处理状态
// 记录起始指针的位置,标明分词器进入处理状态
this.start = context.getCursor();
this.end = start;
}
} else {//当前的分词器正在处理字符
} else {// 当前的分词器正在处理字符
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
//记录下可能的结束位置
// 记录下可能的结束位置
this.end = context.getCursor();
} else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
&& this.isLetterConnector(context.getCurrentChar())) {
//记录下可能的结束位置
// 记录下可能的结束位置
this.end = context.getCursor();
} else {
//遇到非Letter字符输出词元
// 遇到非Letter字符输出词元
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
context.addLexeme(newLexeme);
this.start = -1;
@ -155,10 +155,10 @@ class LetterSegmenter implements ISegmenter {
}
}
//判断缓冲区是否已经读完
// 判断缓冲区是否已经读完
if (context.isBufferConsumed()) {
if (this.start != -1 && this.end != -1) {
//缓冲以读完输出词元
// 缓冲已读完,输出词元
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
context.addLexeme(newLexeme);
this.start = -1;
@ -166,7 +166,7 @@ class LetterSegmenter implements ISegmenter {
}
}
//判断是否锁定缓冲区
// 判断是否锁定缓冲区
needLock = this.start != -1 || this.end != -1;
return needLock;
}
@ -179,18 +179,18 @@ class LetterSegmenter implements ISegmenter {
private boolean processEnglishLetter(AnalyzeContext context) {
boolean needLock;
if (this.englishStart == -1) {//当前的分词器尚未开始处理英文字符
if (this.englishStart == -1) {// 当前的分词器尚未开始处理英文字符
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
//记录起始指针的位置,标明分词器进入处理状态
// 记录起始指针的位置,标明分词器进入处理状态
this.englishStart = context.getCursor();
this.englishEnd = this.englishStart;
}
} else {//当前的分词器正在处理英文字符
} else {// 当前的分词器正在处理英文字符
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
//记录当前指针位置为结束位置
// 记录当前指针位置为结束位置
this.englishEnd = context.getCursor();
} else {
//遇到非English字符,输出词元
// 遇到非English字符,输出词元
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
context.addLexeme(newLexeme);
this.englishStart = -1;
@ -198,10 +198,10 @@ class LetterSegmenter implements ISegmenter {
}
}
//判断缓冲区是否已经读完
// 判断缓冲区是否已经读完
if (context.isBufferConsumed()) {
if (this.englishStart != -1 && this.englishEnd != -1) {
//缓冲以读完输出词元
// 缓冲已读完,输出词元
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
context.addLexeme(newLexeme);
this.englishStart = -1;
@ -209,7 +209,7 @@ class LetterSegmenter implements ISegmenter {
}
}
//判断是否锁定缓冲区
// 判断是否锁定缓冲区
needLock = this.englishStart != -1 || this.englishEnd != -1;
return needLock;
}
@ -222,21 +222,21 @@ class LetterSegmenter implements ISegmenter {
private boolean processArabicLetter(AnalyzeContext context) {
boolean needLock;
if (this.arabicStart == -1) {//当前的分词器尚未开始处理数字字符
if (this.arabicStart == -1) {// 当前的分词器尚未开始处理数字字符
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
//记录起始指针的位置,标明分词器进入处理状态
// 记录起始指针的位置,标明分词器进入处理状态
this.arabicStart = context.getCursor();
this.arabicEnd = this.arabicStart;
}
} else {//当前的分词器正在处理数字字符
} else {// 当前的分词器正在处理数字字符
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
//记录当前指针位置为结束位置
// 记录当前指针位置为结束位置
this.arabicEnd = context.getCursor();
}/* else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
&& this.isNumConnector(context.getCurrentChar())) {
//不输出数字但不标记结束
// 不输出数字但不标记结束
}*/ else {
////遇到非Arabic字符,输出词元
// Non-Arabic character encountered: emit the pending lexeme
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
context.addLexeme(newLexeme);
this.arabicStart = -1;
@ -244,10 +244,10 @@ class LetterSegmenter implements ISegmenter {
}
}
//判断缓冲区是否已经读完
// 判断缓冲区是否已经读完
if (context.isBufferConsumed()) {
if (this.arabicStart != -1 && this.arabicEnd != -1) {
//生成已切分的词元
// 生成已切分的词元
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
context.addLexeme(newLexeme);
this.arabicStart = -1;
@ -255,7 +255,7 @@ class LetterSegmenter implements ISegmenter {
}
}
//判断是否锁定缓冲区
// 判断是否锁定缓冲区
needLock = this.arabicStart != -1 || this.arabicEnd != -1;
return needLock;
}

View File

@ -31,242 +31,278 @@ package org.wltea.analyzer.core;
* IK词元对象
*/
@SuppressWarnings("unused")
public class Lexeme implements Comparable<Lexeme> {
    /**
     * Lexeme type: English letters.
     */
    static final int TYPE_ENGLISH = 1;
    /**
     * Lexeme type: digits.
     */
    static final int TYPE_ARABIC = 2;
    /**
     * Lexeme type: mixed English letters and digits.
     */
    static final int TYPE_LETTER = 3;
    /**
     * Lexeme type: Chinese word.
     */
    static final int TYPE_CNWORD = 4;
    /**
     * Lexeme type: single Chinese character.
     */
    static final int TYPE_CNCHAR = 64;
    /**
     * Lexeme type: Japanese / Korean characters.
     */
    static final int TYPE_OTHER_CJK = 8;
    /**
     * Lexeme type: Chinese numeral.
     */
    static final int TYPE_CNUM = 16;
    /**
     * Lexeme type: Chinese measure word.
     */
    static final int TYPE_COUNT = 32;
    /**
     * Lexeme type: Chinese numeral followed by a measure word.
     */
    static final int TYPE_CQUAN = 48;

    /**
     * Starting offset of the lexeme; added to {@link #begin} to obtain the position in the text.
     */
    private int offset;
    /**
     * Start position of the lexeme relative to {@link #offset}.
     */
    private int begin;
    /**
     * Length of the lexeme in characters; never negative.
     */
    private int length;
    /**
     * Text of the lexeme; may be {@code null} until {@link #setLexemeText(String)} is called.
     */
    private String lexemeText;
    /**
     * Type of the lexeme, normally one of the {@code TYPE_*} constants.
     */
    private int lexemeType;

    /**
     * Creates a lexeme.
     *
     * @param offset     starting offset of the lexeme
     * @param begin      start position relative to {@code offset}
     * @param length     length in characters
     * @param lexemeType lexeme type, normally one of the {@code TYPE_*} constants
     * @throws IllegalArgumentException if {@code length} is negative
     */
    public Lexeme(int offset, int begin, int length, int lexemeType) {
        if (length < 0) {
            throw new IllegalArgumentException("length < 0");
        }
        this.offset = offset;
        this.begin = begin;
        this.length = length;
        this.lexemeType = lexemeType;
    }

    /**
     * Two lexemes are equal when their offset, begin and length are all the same.
     * The lexeme text and type take no part in the comparison.
     *
     * @param o object to compare with
     * @return {@code true} if {@code o} is a {@code Lexeme} covering the same span
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        // instanceof is false for null, so no separate null check is needed.
        if (!(o instanceof Lexeme)) {
            return false;
        }
        Lexeme other = (Lexeme) o;
        return this.offset == other.getOffset()
                && this.begin == other.getBegin()
                && this.length == other.getLength();
    }

    /**
     * Hash code derived from the absolute begin position, the absolute end position and the length,
     * so lexemes that are {@link #equals(Object) equal} share the same hash.
     *
     * @return hash code of this lexeme
     */
    @Override
    public int hashCode() {
        int absBegin = getBeginPosition();
        int absEnd = getEndPosition();
        int hash = (absBegin * 37) + (absEnd * 31);
        // A zero-length lexeme (e.g. after setLexemeText(null)) would make the modulo throw
        // ArithmeticException, so the modulo term is only added for a non-zero length.
        if (this.length != 0) {
            hash += ((absBegin * absEnd) % this.length) * 11;
        }
        return hash;
    }

    /**
     * Ordering used in sorted collections: smaller {@code begin} first; for the same
     * {@code begin}, the longer lexeme first. The offset is not compared, so this method can
     * return 0 for two lexemes that are not {@link #equals(Object) equal}.
     *
     * @param other lexeme to compare with
     * @return negative, zero or positive as this lexeme sorts before, with or after {@code other}
     */
    @Override
    public int compareTo(Lexeme other) {
        if (this.begin != other.getBegin()) {
            return this.begin < other.getBegin() ? -1 : 1;
        }
        // Same begin: arguments are swapped on purpose so that the longer lexeme sorts first.
        return Integer.compare(other.getLength(), this.length);
    }

    private int getOffset() {
        return offset;
    }

    public void setOffset(int offset) {
        this.offset = offset;
    }

    int getBegin() {
        return begin;
    }

    /**
     * Returns the start position of the lexeme in the text.
     *
     * @return {@code offset + begin}
     */
    public int getBeginPosition() {
        return offset + begin;
    }

    public void setBegin(int begin) {
        this.begin = begin;
    }

    /**
     * Returns the end position of the lexeme in the text.
     *
     * @return {@code offset + begin + length}
     */
    public int getEndPosition() {
        return offset + begin + length;
    }

    /**
     * Returns the length of the lexeme in characters.
     *
     * @return the length
     */
    public int getLength() {
        return this.length;
    }

    /**
     * Sets the length of the lexeme.
     *
     * @param length new length in characters
     * @throws IllegalArgumentException if {@code length} is negative
     */
    public void setLength(int length) {
        // Validate the argument, not the field that is about to be overwritten.
        if (length < 0) {
            throw new IllegalArgumentException("length < 0");
        }
        this.length = length;
    }

    /**
     * Returns the text of the lexeme.
     *
     * @return the text, or an empty string when no text has been set
     */
    public String getLexemeText() {
        return lexemeText == null ? "" : lexemeText;
    }

    /**
     * Sets the lexeme text and updates the length to match it.
     * A {@code null} text is stored as an empty string with length 0.
     *
     * @param lexemeText new text, may be {@code null}
     */
    void setLexemeText(String lexemeText) {
        this.lexemeText = lexemeText == null ? "" : lexemeText;
        this.length = this.lexemeText.length();
    }

    /**
     * Returns the lexeme type.
     *
     * @return the numeric type
     */
    int getLexemeType() {
        return lexemeType;
    }

    /**
     * Returns the label for the lexeme type.
     *
     * @return the label of the matching {@code TYPE_*} constant, or {@code "UNKNOWN"}
     */
    public String getLexemeTypeString() {
        switch (lexemeType) {
            case TYPE_ENGLISH:
                return "ENGLISH";
            case TYPE_ARABIC:
                return "ARABIC";
            case TYPE_LETTER:
                return "LETTER";
            case TYPE_CNWORD:
                return "CN_WORD";
            case TYPE_CNCHAR:
                return "CN_CHAR";
            case TYPE_OTHER_CJK:
                return "OTHER_CJK";
            case TYPE_COUNT:
                return "COUNT";
            case TYPE_CNUM:
                return "TYPE_CNUM";
            case TYPE_CQUAN:
                return "TYPE_CQUAN";
            default:
                return "UNKNOWN";
        }
    }

    public void setLexemeType(int lexemeType) {
        this.lexemeType = lexemeType;
    }

    /**
     * Merges an adjacent lexeme into this one. The merge happens only when {@code l} starts
     * exactly where this lexeme ends; this lexeme then grows by the length of {@code l} and
     * takes the given type. The lexeme text is not modified.
     *
     * @param l          lexeme to append, may be {@code null}
     * @param lexemeType type assigned to this lexeme after a successful merge
     * @return {@code true} if the lexemes were merged
     */
    boolean append(Lexeme l, int lexemeType) {
        if (l == null || this.getEndPosition() != l.getBeginPosition()) {
            return false;
        }
        this.length += l.getLength();
        this.lexemeType = lexemeType;
        return true;
    }

    /**
     * Formats the lexeme as {@code begin-end : text : <TAB>type}.
     * A missing text is printed as an empty string, matching {@link #getLexemeText()}.
     *
     * @return string representation of this lexeme
     */
    @Override
    public String toString() {
        return this.getBeginPosition() + "-" + this.getEndPosition()
                + " : " + this.getLexemeText() + " : \t"
                + this.getLexemeTypeString();
    }
}

View File

@ -34,11 +34,17 @@ package org.wltea.analyzer.core;
@SuppressWarnings("unused")
class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
//起始位置
/**
* 起始位置
*/
private int pathBegin;
//结束
/**
* 结束
*/
private int pathEnd;
//词元链的有效字符长度
/**
* 词元链的有效字符长度
*/
private int payloadLength;
LexemePath() {
@ -100,7 +106,6 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
/**
* 移除尾部的Lexeme
*
*/
void removeTail() {
Lexeme tail = this.pollLast();
@ -117,7 +122,6 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
/**
* 检测词元位置交叉有歧义的切分
*
*/
boolean checkCross(Lexeme lexeme) {
return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
@ -141,7 +145,6 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
/**
* 获取LexemePath的路径长度
*
*/
private int getPathLength() {
return this.pathEnd - this.pathBegin;
@ -150,7 +153,6 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
/**
* X权重词元长度积
*
*/
private int getXWeight() {
int product = 1;
@ -191,48 +193,48 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
}
/**
 * Ranks two candidate lexeme paths; the preferred path sorts first (returns -1).
 * The criteria are applied in order, and the first one that differs decides:
 * <ol>
 *   <li>larger effective text length (payload);</li>
 *   <li>fewer lexemes;</li>
 *   <li>larger path span;</li>
 *   <li>later path end (reverse segmentation is statistically more likely than forward);</li>
 *   <li>larger X weight (product of lexeme lengths, favouring evenly sized words);</li>
 *   <li>larger position weight.</li>
 * </ol>
 *
 * @param o path to compare with
 * @return -1 if this path is preferred, 1 if {@code o} is preferred, 0 if all criteria tie
 */
public int compareTo(LexemePath o) {
    // 1. Effective text length: longer is better.
    if (this.payloadLength != o.payloadLength) {
        return this.payloadLength > o.payloadLength ? -1 : 1;
    }
    // 2. Number of lexemes: fewer is better.
    if (this.size() != o.size()) {
        return this.size() < o.size() ? -1 : 1;
    }
    // 3. Path span: wider is better.
    if (this.getPathLength() != o.getPathLength()) {
        return this.getPathLength() > o.getPathLength() ? -1 : 1;
    }
    // 4. Path end: the later end wins.
    if (this.pathEnd != o.pathEnd) {
        return this.pathEnd > o.pathEnd ? -1 : 1;
    }
    // 5. X weight: larger is better (more evenly sized lexemes).
    if (this.getXWeight() != o.getXWeight()) {
        return this.getXWeight() > o.getXWeight() ? -1 : 1;
    }
    // 6. Position weight: larger is better.
    if (this.getPWeight() != o.getPWeight()) {
        return this.getPWeight() > o.getPWeight() ? -1 : 1;
    }
    return 0;
}

View File

@ -28,14 +28,20 @@
package org.wltea.analyzer.core;
/**
* IK分词器专用的Lexem快速排序集合
* IK分词器专用的Lexeme快速排序集合
*/
class QuickSortSet {
//链表头
/**
* 链表头
*/
private Cell head;
//链表尾
/**
* 链表尾
*/
private Cell tail;
//链表的实际大小
/**
* 链表的实际大小
*/
private int size;
QuickSortSet() {
@ -53,31 +59,29 @@ class QuickSortSet {
this.size++;
} else {
/*if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同不放入集合
}else */
if (this.tail.compareTo(newCell) < 0) {//词元接入链表尾部
if (this.tail.compareTo(newCell) < 0) {
// 词元接入链表尾部
this.tail.next = newCell;
newCell.prev = this.tail;
this.tail = newCell;
this.size++;
} else if (this.head.compareTo(newCell) > 0) {//词元接入链表头部
} else if (this.head.compareTo(newCell) > 0) {
// 词元接入链表头部
this.head.prev = newCell;
newCell.next = this.head;
this.head = newCell;
this.size++;
} else {
//从尾部上逆
// 从尾部上逆
Cell index = this.tail;
while (index != null && index.compareTo(newCell) > 0) {
index = index.prev;
}
/*if(index.compareTo(newCell) == 0){//词元与集合中的词元重复不放入集合
}else */
if ((index != null ? index.compareTo(newCell) : 1) < 0) {//词元插入链表中的某个位置
// 词元插入链表中的某个位置
if ((index != null ? index.compareTo(newCell) : 1) < 0) {
newCell.prev = index;
newCell.next = index.next;
index.next.prev = newCell;

View File

@ -37,24 +37,38 @@ import java.util.Map;
@SuppressWarnings("unused")
class DictSegment implements Comparable<DictSegment> {
//公用字典表存储汉字
/**
* 公用字典表存储汉字
*/
private static final Map<Character, Character> charMap = new HashMap<>(16, 0.95f);
//数组大小上限
/**
* 数组大小上限
*/
private static final int ARRAY_LENGTH_LIMIT = 3;
//Map存储结构
private Map<Character, DictSegment> childrenMap;
//数组方式存储结构
private DictSegment[] childrenArray;
/**
* Map存储结构
*/
private volatile Map<Character, DictSegment> childrenMap;
/**
* 数组方式存储结构
*/
private volatile DictSegment[] childrenArray;
//当前节点上存储的字符
private Character nodeChar;
//当前节点存储的Segment数目
//storeSize <=ARRAY_LENGTH_LIMIT 使用数组存储 storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
/**
* 当前节点上存储的字符
*/
private final Character nodeChar;
/**
* 当前节点存储的Segment数目
* storeSize <=ARRAY_LENGTH_LIMIT 使用数组存储 storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
*/
private int storeSize = 0;
//当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
/**
* 当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
*/
private int nodeState = 0;

View File

@ -27,14 +27,14 @@
*/
package org.wltea.analyzer.dic;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.List;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
/**
* 词典管理类单例模式
*/
@ -44,7 +44,7 @@ public class Dictionary {
/*
* Dictionary singleton instance
*/
private static Dictionary singleton;
private static volatile Dictionary singleton;
/*
* 主词典对象
@ -63,7 +63,7 @@ public class Dictionary {
/**
* 配置对象
*/
private Configuration cfg;
private final Configuration cfg;
/**
* 私有构造方法阻止外部直接实例化本类
@ -326,7 +326,7 @@ public class Dictionary {
// 建立一个量词典实例
_QuantifierDict = new DictSegment((char) 0);
// 读取量词词典文件
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary());
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDictionary());
if (is == null) {
throw new RuntimeException("Quantifier Dictionary not found!!!");
}

View File

@ -32,24 +32,33 @@ package org.wltea.analyzer.dic;
*/
@SuppressWarnings("unused")
public class Hit {
//Hit不匹配
/**
* Hit不匹配
*/
private static final int UNMATCH = 0x00000000;
//Hit完全匹配
/**
* Hit完全匹配
*/
private static final int MATCH = 0x00000001;
//Hit前缀匹配
/**
* Hit前缀匹配
*/
private static final int PREFIX = 0x00000010;
//该HIT当前状态默认未匹配
/**
* 该HIT当前状态默认未匹配
*/
private int hitState = UNMATCH;
//记录词典匹配过程中当前匹配到的词典分支节点
/**
* 记录词典匹配过程中当前匹配到的词典分支节点
*/
private DictSegment matchedDictSegment;
/*
/**
* 词段开始位置
*/
private int begin;
/*
/**
* 词段的结束位置
*/
private int end;
@ -86,9 +95,7 @@ public class Hit {
/**
 * Tells whether this hit is currently in the unmatched state.
 *
 * @return {@code true} if the hit state equals {@code UNMATCH}
 */
public boolean isUnmatch() {
return this.hitState == UNMATCH ;
}
/**
 * Resets the hit state to {@code UNMATCH}, discarding any match or prefix flag
 * previously stored in it.
 */
void setUnmatch() {
this.hitState = UNMATCH;
}

View File

@ -36,7 +36,7 @@ import org.apache.lucene.analysis.Tokenizer;
@SuppressWarnings("unused")
public final class IKAnalyzer extends Analyzer {
private boolean useSmart;
private final boolean useSmart;
private boolean useSmart() {
return useSmart;

View File

@ -39,21 +39,30 @@ import java.io.IOException;
/**
* IK分词器 Lucene Tokenizer适配器类
* 兼容Lucene 4.0版本
*/
@SuppressWarnings("unused")
@SuppressWarnings({"unused", "FinalMethodInFinalClass"})
public final class IKTokenizer extends Tokenizer {
//IK分词器实现
/**
* IK分词器实现
*/
private IKSegmenter _IKImplement;
//词元文本属性
/**
* 词元文本属性
*/
private CharTermAttribute termAtt;
//词元位移属性
/**
* 词元位移属性
*/
private OffsetAttribute offsetAtt;
//词元分类属性该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量
/**
* 词元分类属性该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量
*/
private TypeAttribute typeAtt;
//记录最后一个词元的结束位置
/**
* 记录最后一个词元的结束位置
*/
private int endPosition;
/**
@ -84,30 +93,31 @@ public final class IKTokenizer extends Tokenizer {
_IKImplement = new IKSegmenter(input, useSmart);
}
/**
 * Advances to the next lexeme produced by the IK segmenter and copies it into the
 * token attributes (term text, term length, offsets and type).
 *
 * @return {@code true} if a token was produced, {@code false} when the segmenter has no
 *         more lexemes
 * @throws IOException declared by the overridden method; presumably raised while the
 *                     segmenter reads its input (the segmenter is not visible here)
 * @see org.apache.lucene.analysis.TokenStream#incrementToken()
 */
@Override
public boolean incrementToken() throws IOException {
    // Clear all token attributes before filling in the next token.
    clearAttributes();

    Lexeme lexeme = _IKImplement.next();
    if (lexeme == null) {
        // No lexeme left: tell the caller that token output is finished.
        return false;
    }

    // Convert the lexeme into attributes: term text first, then the term length.
    termAtt.append(lexeme.getLexemeText());
    termAtt.setLength(lexeme.getLength());
    // Start and end offsets of the token.
    offsetAtt.setOffset(lexeme.getBeginPosition(), lexeme.getEndPosition());
    // Remember where the most recent token ended.
    endPosition = lexeme.getEndPosition();
    // Token type label, taken from the lexeme type.
    typeAtt.setType(lexeme.getLexemeTypeString());
    // A token was produced; more may follow.
    return true;
}

View File

@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
* 8.5.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.lucene;
@ -44,6 +44,8 @@ import java.nio.charset.StandardCharsets;
import java.util.*;
/**
* 分词器工厂类
*
* @author <a href="magese@live.cn">Magese</a>
*/
public class IKTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware, UpdateThread.UpdateJob {

View File

@ -46,11 +46,11 @@ import java.util.Stack;
public class IKQueryExpressionParser {
private List<Element> elements = new ArrayList<>();
private final List<Element> elements = new ArrayList<>();
private Stack<Query> querys = new Stack<>();
private final Stack<Query> querys = new Stack<>();
private Stack<Element> operates = new Stack<>();
private final Stack<Element> operates = new Stack<>();
/**
* 解析查询表达式生成Lucene Query对象
@ -61,9 +61,9 @@ public class IKQueryExpressionParser {
Query lucenceQuery = null;
if (expression != null && !"".equals(expression.trim())) {
try {
//文法解析
// 文法解析
this.splitElements(expression);
//语法解析
// 语法解析
this.parseSyntax();
if (this.querys.size() == 1) {
lucenceQuery = this.querys.pop();
@ -87,263 +87,263 @@ public class IKQueryExpressionParser {
if (expression == null) {
return;
}
Element curretElement = null;
Element currentElement = null;
char[] expChars = expression.toCharArray();
for (char expChar : expChars) {
switch (expChar) {
case '&':
if (curretElement == null) {
curretElement = new Element();
curretElement.type = '&';
curretElement.append(expChar);
} else if (curretElement.type == '&') {
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
} else if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement == null) {
currentElement = new Element();
currentElement.type = '&';
currentElement.append(expChar);
} else if (currentElement.type == '&') {
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
} else if (currentElement.type == '\'') {
currentElement.append(expChar);
} else {
this.elements.add(curretElement);
curretElement = new Element();
curretElement.type = '&';
curretElement.append(expChar);
this.elements.add(currentElement);
currentElement = new Element();
currentElement.type = '&';
currentElement.append(expChar);
}
break;
case '|':
if (curretElement == null) {
curretElement = new Element();
curretElement.type = '|';
curretElement.append(expChar);
} else if (curretElement.type == '|') {
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
} else if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement == null) {
currentElement = new Element();
currentElement.type = '|';
currentElement.append(expChar);
} else if (currentElement.type == '|') {
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
} else if (currentElement.type == '\'') {
currentElement.append(expChar);
} else {
this.elements.add(curretElement);
curretElement = new Element();
curretElement.type = '|';
curretElement.append(expChar);
this.elements.add(currentElement);
currentElement = new Element();
currentElement.type = '|';
currentElement.append(expChar);
}
break;
case '-':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = '-';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = '-';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case '(':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = '(';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = '(';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case ')':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = ')';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = ')';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case ':':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = ':';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = ':';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case '=':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = '=';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = '=';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case ' ':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
} else {
this.elements.add(curretElement);
curretElement = null;
this.elements.add(currentElement);
currentElement = null;
}
}
break;
case '\'':
if (curretElement == null) {
curretElement = new Element();
curretElement.type = '\'';
if (currentElement == null) {
currentElement = new Element();
currentElement.type = '\'';
} else if (curretElement.type == '\'') {
this.elements.add(curretElement);
curretElement = null;
} else if (currentElement.type == '\'') {
this.elements.add(currentElement);
currentElement = null;
} else {
this.elements.add(curretElement);
curretElement = new Element();
curretElement.type = '\'';
this.elements.add(currentElement);
currentElement = new Element();
currentElement.type = '\'';
}
break;
case '[':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = '[';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = '[';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case ']':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = ']';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = ']';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case '{':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = '{';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = '{';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case '}':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = '}';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = '}';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case ',':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = ',';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = ',';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
default:
if (curretElement == null) {
curretElement = new Element();
curretElement.type = 'F';
curretElement.append(expChar);
if (currentElement == null) {
currentElement = new Element();
currentElement.type = 'F';
currentElement.append(expChar);
} else if (curretElement.type == 'F') {
curretElement.append(expChar);
} else if (currentElement.type == 'F') {
currentElement.append(expChar);
} else if (curretElement.type == '\'') {
curretElement.append(expChar);
} else if (currentElement.type == '\'') {
currentElement.append(expChar);
} else {
this.elements.add(curretElement);
curretElement = new Element();
curretElement.type = 'F';
curretElement.append(expChar);
this.elements.add(currentElement);
currentElement = new Element();
currentElement.type = 'F';
currentElement.append(expChar);
}
}
}
if (curretElement != null) {
this.elements.add(curretElement);
if (currentElement != null) {
this.elements.add(currentElement);
}
}
@ -359,7 +359,7 @@ public class IKQueryExpressionParser {
throw new IllegalStateException("表达式异常: = 或 号丢失");
}
Element e3 = this.elements.get(i + 2);
//处理 = 运算
// 处理 = 运算
if ('\'' == e3.type) {
i += 2;
if ('=' == e2.type) {
@ -367,14 +367,14 @@ public class IKQueryExpressionParser {
this.querys.push(tQuery);
} else {
String keyword = e3.toString();
//SWMCQuery Here
// SWMCQuery Here
Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword);
this.querys.push(_SWMCQuery);
}
} else if ('[' == e3.type || '{' == e3.type) {
i += 2;
//处理 [] {}
// 处理 [] {}
LinkedList<Element> eQueue = new LinkedList<>();
eQueue.add(e3);
for (i++; i < this.elements.size(); i++) {
@ -384,7 +384,7 @@ public class IKQueryExpressionParser {
break;
}
}
//翻译RangeQuery
// 翻译RangeQuery
Query rangeQuery = this.toTermRangeQuery(e, eQueue);
this.querys.push(rangeQuery);
} else {
@ -475,10 +475,10 @@ public class IKQueryExpressionParser {
}
} else {
//q1 instanceof TermQuery
//q1 instanceof TermRangeQuery
//q1 instanceof PhraseQuery
//others
// q1 instanceof TermQuery
// q1 instanceof TermRangeQuery
// q1 instanceof PhraseQuery
// others
resultQuery.add(q1, Occur.MUST);
}
}
@ -496,10 +496,10 @@ public class IKQueryExpressionParser {
}
} else {
//q1 instanceof TermQuery
//q1 instanceof TermRangeQuery
//q1 instanceof PhraseQuery
//others
// q1 instanceof TermQuery
// q1 instanceof TermRangeQuery
// q1 instanceof PhraseQuery
// others
resultQuery.add(q2, Occur.MUST);
}
}
@ -518,10 +518,10 @@ public class IKQueryExpressionParser {
}
} else {
//q1 instanceof TermQuery
//q1 instanceof TermRangeQuery
//q1 instanceof PhraseQuery
//others
// q1 instanceof TermQuery
// q1 instanceof TermRangeQuery
// q1 instanceof PhraseQuery
// others
resultQuery.add(q1, Occur.SHOULD);
}
}
@ -538,10 +538,10 @@ public class IKQueryExpressionParser {
resultQuery.add(q2, Occur.SHOULD);
}
} else {
//q2 instanceof TermQuery
//q2 instanceof TermRangeQuery
//q2 instanceof PhraseQuery
//others
// q2 instanceof TermQuery
// q2 instanceof TermRangeQuery
// q2 instanceof PhraseQuery
// others
resultQuery.add(q2, Occur.SHOULD);
}
@ -563,10 +563,10 @@ public class IKQueryExpressionParser {
}
} else {
//q1 instanceof TermQuery
//q1 instanceof TermRangeQuery
//q1 instanceof PhraseQuery
//others
// q1 instanceof TermQuery
// q1 instanceof TermRangeQuery
// q1 instanceof PhraseQuery
// others
resultQuery.add(q1, Occur.MUST);
}
@ -584,7 +584,7 @@ public class IKQueryExpressionParser {
boolean includeLast;
String firstValue;
String lastValue = null;
//检查第一个元素是否是[或者{
// 检查第一个元素是否是[或者{
Element first = elements.getFirst();
if ('[' == first.type) {
includeFirst = true;
@ -593,7 +593,7 @@ public class IKQueryExpressionParser {
} else {
throw new IllegalStateException("表达式异常");
}
//检查最后一个元素是否是]或者}
// 检查最后一个元素是否是]或者}
Element last = elements.getLast();
if (']' == last.type) {
includeLast = true;
@ -605,7 +605,7 @@ public class IKQueryExpressionParser {
if (elements.size() < 4 || elements.size() > 5) {
throw new IllegalStateException("表达式异常, RangeQuery 错误");
}
//读出中间部分
// 读出中间部分
Element e2 = elements.get(1);
if ('\'' == e2.type) {
firstValue = e2.toString();
@ -673,7 +673,7 @@ public class IKQueryExpressionParser {
* @author linliangyi
* May 20, 2010
*/
private class Element {
private static class Element {
char type = 0;
StringBuffer eleTextBuff;
@ -692,11 +692,9 @@ public class IKQueryExpressionParser {
/**
 * Demo entry point: parses a sample IK query expression and prints the resulting query
 * to standard output.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    String sampleExpression = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
    Query parsed = new IKQueryExpressionParser().parseExp(sampleExpression);
    System.out.println(parsed);
}
}

View File

@ -45,6 +45,7 @@ import java.util.List;
*
* @author linliangyi
*/
@SuppressWarnings("unused")
class SWMCQueryBuilder {
/**
@ -56,9 +57,9 @@ class SWMCQueryBuilder {
if (fieldName == null || keywords == null) {
throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
}
//1.对keywords进行分词处理
// 1.对keywords进行分词处理
List<Lexeme> lexemes = doAnalyze(keywords);
//2.根据分词结果生成SWMCQuery
// 2.根据分词结果生成SWMCQuery
return getSWMCQuery(fieldName, lexemes);
}
@ -84,20 +85,20 @@ class SWMCQueryBuilder {
* 根据分词结果生成SWMC搜索
*/
private static Query getSWMCQuery(String fieldName, List<Lexeme> lexemes) {
//构造SWMC的查询表达式
// 构造SWMC的查询表达式
StringBuilder keywordBuffer = new StringBuilder();
//精简的SWMC的查询表达式
// 精简的SWMC的查询表达式
StringBuilder keywordBuffer_Short = new StringBuilder();
//记录最后词元长度
// 记录最后词元长度
int lastLexemeLength = 0;
//记录最后词元结束位置
// 记录最后词元结束位置
int lastLexemeEnd = -1;
int shortCount = 0;
int totalCount = 0;
for (Lexeme l : lexemes) {
totalCount += l.getLength();
//精简表达式
// 精简表达式
if (l.getLength() > 1) {
keywordBuffer_Short.append(' ').append(l.getLexemeText());
shortCount += l.getLength();
@ -106,7 +107,7 @@ class SWMCQueryBuilder {
if (lastLexemeLength == 0) {
keywordBuffer.append(l.getLexemeText());
} else if (lastLexemeLength == 1 && l.getLength() == 1
&& lastLexemeEnd == l.getBeginPosition()) {//单字位置相邻长度为一合并)
&& lastLexemeEnd == l.getBeginPosition()) {// 单字位置相邻长度为一合并)
keywordBuffer.append(l.getLexemeText());
} else {
keywordBuffer.append(' ').append(l.getLexemeText());
@ -116,10 +117,10 @@ class SWMCQueryBuilder {
lastLexemeEnd = l.getEndPosition();
}
//借助lucene queryparser 生成SWMC Query
// 借助lucene queryparser 生成SWMC Query
QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
qp.setAutoGeneratePhraseQueries(false);
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
qp.setAutoGeneratePhraseQueries(true);
if ((shortCount * 1.0f / totalCount) > 0.5f) {
try {