Compare commits

...

24 Commits

Author SHA1 Message Date
Magese f0059e2c78 Resolve QueryParser exception. 2022-01-04 10:36:49 +08:00
Magese fb4defedb7 Format comments. 2022-01-04 09:59:25 +08:00
Magese 9ed56a0f41 Format comments. 2022-01-04 09:56:41 +08:00
Magese 70d40bc3af Using final. 2022-01-04 09:51:21 +08:00
Magese ef2dbeb979 Format comments. 2022-01-04 09:48:28 +08:00
Magese 872aac8298 Using final and volatile. 2022-01-04 09:44:12 +08:00
Magese 052e8f476e Package sorting. 2022-01-04 09:36:59 +08:00
Magese 6cc121e98d Remove HitCount Badges. 2021-12-31 17:56:33 +08:00
Magese 32a9369680 Format comments. 2021-12-31 17:51:35 +08:00
Magese dd7822b6be Optimize sorting algorithm logic. 2021-12-31 17:48:21 +08:00
Magese 6ef4798752 Format code and comments. 2021-12-31 17:43:40 +08:00
Magese 56f23a9027 Format comments. 2021-12-31 17:38:43 +08:00
Magese 5ab517079b Format comments. 2021-12-31 17:36:34 +08:00
Magese 020f83e665 Declare constructor-initialized member variables as final. 2021-12-31 17:35:32 +08:00
Magese ab50f161e6 Format comments. 2021-12-31 17:33:07 +08:00
Magese 47439fa94b Format comments. 2021-12-31 17:31:28 +08:00
Magese c938bf1f2b Declare member variables as final; optimize conditional logic. 2021-12-31 17:29:59 +08:00
Magese 92cb2a28d6 Format code and comments. 2021-12-31 17:27:42 +08:00
Magese f173925dc0 Optimize word-prefix check logic. 2021-12-31 17:17:47 +08:00
Magese f9bc7a12fa Format code and comments. 2021-12-31 17:14:07 +08:00
Magese df29bdc4df Format code. 2021-12-31 17:11:52 +08:00
Magese 3ec8076730 Improve log formatting. 2021-12-31 17:10:19 +08:00
Magese 7149c54de7 Use the volatile keyword for the singleton instance. 2021-12-31 17:02:02 +08:00
Magese fff131a45a Fix spelling errors. 2021-12-31 16:59:38 +08:00
22 changed files with 999 additions and 919 deletions

View File

@@ -6,7 +6,6 @@ ik-analyzer for solr 7.x-8.x
[![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.svg?v=103)](https://github.com/magese/ik-analyzer-solr/releases)
[![Crates.io](https://img.shields.io/crates/l/rustc-serialize.svg)](./LICENSE)
[![Build Status](https://travis-ci.org/magese/ik-analyzer-solr.svg?branch=master)](https://travis-ci.org/magese/ik-analyzer-solr)
[![HitCount](http://hits.dwyl.io/magese/ik-analyzer-solr.svg)](http://hits.dwyl.io/magese/ik-analyzer-solr)
[![GitHub forks](https://img.shields.io/github/forks/magese/ik-analyzer-solr.svg?style=social&label=Fork)](https://github.com/magese/ik-analyzer-solr/network/members)
[![GitHub stars](https://img.shields.io/github/stars/magese/ik-analyzer-solr.svg?style=social&label=Star)](https://github.com/magese/ik-analyzer-solr/stargazers)

View File

@@ -76,7 +76,7 @@ public interface Configuration {
*
* @return String 量词词典路径
*/
String getQuantifierDicionary();
String getQuantifierDictionary();
/**
* 获取扩展字典配置路径

View File

@@ -145,7 +145,7 @@ public class DefaultConfig implements Configuration {
*
* @return String 量词词典路径
*/
public String getQuantifierDicionary() {
public String getQuantifierDictionary() {
return PATH_DIC_QUANTIFIER;
}

View File

@@ -27,12 +27,12 @@
*/
package org.wltea.analyzer.core;
import java.util.LinkedList;
import java.util.List;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
import java.util.LinkedList;
import java.util.List;
/**
* 中文-日韩文子分词器
@@ -42,7 +42,7 @@ class CJKSegmenter implements ISegmenter {
// 子分词器标签
private static final String SEGMENTER_NAME = "CJK_SEGMENTER";
// 待处理的分词hit队列
private List<Hit> tmpHits;
private final List<Hit> tmpHits;
CJKSegmenter() {
@@ -80,17 +80,16 @@ class CJKSegmenter implements ISegmenter {
// *********************************
// 再对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//首字成词
// 首字为词前缀
if (singleCharHit.isMatch()) {
// 输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_CNWORD);
context.addLexeme(newLexeme);
//同时也是词前缀
if(singleCharHit.isPrefix()){
//前缀匹配则放入hit列表
this.tmpHits.add(singleCharHit);
}
}else if(singleCharHit.isPrefix()){//首字为词前缀
// 前缀匹配则放入hit列表
if (singleCharHit.isPrefix()) {
// 前缀匹配则放入hit列表
this.tmpHits.add(singleCharHit);
}

View File

@@ -36,7 +36,6 @@ import java.util.List;
import java.util.Set;
/**
*
* 中文数量词子分词器
*/
class CN_QuantifierSegmenter implements ISegmenter {
@@ -44,14 +43,14 @@ class CN_QuantifierSegmenter implements ISegmenter{
// 子分词器标签
private static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
private static Set<Character> ChnNumberChars = new HashSet<>();
private static final Set<Character> CHN_NUMBER_CHARS = new HashSet<>();
static {
// 中文数词
//Cnum
String chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
char[] ca = chn_Num.toCharArray();
for (char nChar : ca) {
ChnNumberChars.add(nChar);
CHN_NUMBER_CHARS.add(nChar);
}
}
@@ -68,7 +67,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
private int nEnd;
// 待处理的量词hit队列
private List<Hit> countHits;
private final List<Hit> countHits;
CN_QuantifierSegmenter() {
@@ -111,14 +110,14 @@ class CN_QuantifierSegmenter implements ISegmenter{
private void processCNumber(AnalyzeContext context) {
if (nStart == -1 && nEnd == -1) {// 初始状态
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& ChnNumberChars.contains(context.getCurrentChar())){
&& CHN_NUMBER_CHARS.contains(context.getCurrentChar())) {
// 记录数词的起始结束位置
nStart = context.getCursor();
nEnd = context.getCursor();
}
} else {// 正在处理状态
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& ChnNumberChars.contains(context.getCurrentChar())){
&& CHN_NUMBER_CHARS.contains(context.getCurrentChar())) {
// 记录数词的结束位置
nEnd = context.getCursor();
} else {
@@ -144,6 +143,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
/**
* 处理中文量词
*
* @param context 需要处理的内容
*/
private void processCount(AnalyzeContext context) {
@@ -179,21 +179,19 @@ class CN_QuantifierSegmenter implements ISegmenter{
// *********************************
// 对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//首字成量词词
// 首字为量词前缀
if (singleCharHit.isMatch()) {
// 输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_COUNT);
context.addLexeme(newLexeme);
}
//同时也是词前缀
// 前缀匹配则放入hit列表
if (singleCharHit.isPrefix()) {
// 前缀匹配则放入hit列表
this.countHits.add(singleCharHit);
}
}else if(singleCharHit.isPrefix()){//首字为量词前缀
//前缀匹配则放入hit列表
this.countHits.add(singleCharHit);
}
} else {
// 输入的不是中文字符
@@ -229,6 +227,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
/**
* 添加数词词元到结果集
*
* @param context 需要添加的词元
*/
private void outputNumLexeme(AnalyzeContext context) {

View File

@@ -28,7 +28,6 @@
package org.wltea.analyzer.core;
/**
*
* 字符集识别工具类
*/
class CharacterUtil {
@@ -46,6 +45,7 @@ class CharacterUtil {
/**
* 识别字符类型
*
* @param input 需要识别的字符
* @return int CharacterUtil定义的字符类型常量
*/
@@ -85,6 +85,7 @@ class CharacterUtil {
/**
* 进行字符规格化全角转半角大写转小写处理
*
* @param input 需要转换的字符
* @return char
*/

View File

@@ -35,9 +35,7 @@ import java.util.TreeSet;
*/
class IKArbitrator {
IKArbitrator() {
}
IKArbitrator() {}
/**
* 分词歧义处理

View File

@@ -41,15 +41,25 @@ import java.util.List;
*/
public final class IKSegmenter {
//字符窜reader
/**
* 字符窜reader
*/
private Reader input;
//分词器配置项
private Configuration cfg;
//分词器上下文
/**
* 分词器配置项
*/
private final Configuration cfg;
/**
* 分词器上下文
*/
private AnalyzeContext context;
//分词处理器列表
/**
* 分词处理器列表
*/
private List<ISegmenter> segmenters;
//分词歧义裁决器
/**
* 分词歧义裁决器
*/
private IKArbitrator arbitrator;

View File

@@ -29,13 +29,13 @@ package org.wltea.analyzer.core;
/**
*
* 子分词器接口
*/
interface ISegmenter {
/**
* 从分析器读取下一个可能分解的词元对象
*
* @param context 分词算法上下文
*/
void analyze(AnalyzeContext context);

View File

@@ -34,14 +34,18 @@ import java.util.Arrays;
*/
class LetterSegmenter implements ISegmenter {
//子分词器标签
/**
* 子分词器标签
*/
private static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
//链接符号
/**
* 链接符号
*/
private static final char[] Letter_Connector = new char[]{'#', '&', '+', '-', '.', '@', '_'};
//数字符号
/**
* 数字符号
*/
private static final char[] Num_Connector = new char[]{',', '.'};
/*
* 词元的开始位置
* 同时作为子分词器状态标识
@@ -53,22 +57,18 @@ class LetterSegmenter implements ISegmenter {
* end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
*/
private int end;
/*
* 字母起始位置
*/
private int englishStart;
/*
* 字母结束位置
*/
private int englishEnd;
/*
* 阿拉伯数字起始位置
*/
private int arabicStart;
/*
* 阿拉伯数字结束位置
*/

View File

@@ -32,34 +32,61 @@ package org.wltea.analyzer.core;
*/
@SuppressWarnings("unused")
public class Lexeme implements Comparable<Lexeme> {
//英文
/**
* 英文
*/
static final int TYPE_ENGLISH = 1;
//数字
/**
* 数字
*/
static final int TYPE_ARABIC = 2;
//英文数字混合
/**
* 英文数字混合
*/
static final int TYPE_LETTER = 3;
//中文词元
/**
* 中文词元
*/
static final int TYPE_CNWORD = 4;
//中文单字
/**
* 中文单字
*/
static final int TYPE_CNCHAR = 64;
//日韩文字
/**
* 日韩文字
*/
static final int TYPE_OTHER_CJK = 8;
//中文数词
/**
* 中文数词
*/
static final int TYPE_CNUM = 16;
//中文量词
/**
* 中文量词
*/
static final int TYPE_COUNT = 32;
//中文数量词
/**
* 中文数量词
*/
static final int TYPE_CQUAN = 48;
//词元的起始位移
/**
* 词元的起始位移
*/
private int offset;
//词元的相对起始位置
/**
* 词元的相对起始位置
*/
private int begin;
//词元的长度
/**
* 词元的长度
*/
private int length;
//词元文本
/**
* 词元文本
*/
private String lexemeText;
//词元类型
/**
* 词元类型
*/
private int lexemeType;
@@ -120,7 +147,7 @@ public class Lexeme implements Comparable<Lexeme>{
// this.length < other.getLength()
return Integer.compare(other.getLength(), this.length);
}else{//this.begin > other.getBegin()
} else {
return 1;
}
}
@@ -136,8 +163,10 @@ public class Lexeme implements Comparable<Lexeme>{
int getBegin() {
return begin;
}
/**
* 获取词元在文本中的起始位置
*
* @return int
*/
public int getBeginPosition() {
@@ -150,6 +179,7 @@ public class Lexeme implements Comparable<Lexeme>{
/**
* 获取词元在文本中的结束位置
*
* @return int
*/
public int getEndPosition() {
@@ -158,6 +188,7 @@ public class Lexeme implements Comparable<Lexeme>{
/**
* 获取词元的字符长度
*
* @return int
*/
public int getLength() {
@@ -173,6 +204,7 @@ public class Lexeme implements Comparable<Lexeme>{
/**
* 获取词元的文本内容
*
* @return String
*/
public String getLexemeText() {
@@ -194,6 +226,7 @@ public class Lexeme implements Comparable<Lexeme>{
/**
* 获取词元类型
*
* @return int
*/
int getLexemeType() {
@@ -202,6 +235,7 @@ public class Lexeme implements Comparable<Lexeme>{
/**
* 获取词元类型标示字符串
*
* @return String
*/
public String getLexemeTypeString() {
@@ -235,7 +269,7 @@ public class Lexeme implements Comparable<Lexeme>{
return "TYPE_CQUAN";
default:
return "UNKONW";
return "UNKNOWN";
}
}
@@ -246,6 +280,7 @@ public class Lexeme implements Comparable<Lexeme>{
/**
* 合并两个相邻的词元
*
* @return boolean 词元是否成功合并
*/
boolean append(Lexeme l, int lexemeType) {
@@ -258,9 +293,10 @@ public class Lexeme implements Comparable<Lexeme>{
}
}
/**
* ToString 方法
*
* @return 字符串输出
*/
public String toString() {
return this.getBeginPosition() + "-" + this.getEndPosition() +

View File

@@ -34,11 +34,17 @@ package org.wltea.analyzer.core;
@SuppressWarnings("unused")
class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
//起始位置
/**
* 起始位置
*/
private int pathBegin;
//结束
/**
* 结束
*/
private int pathEnd;
//词元链的有效字符长度
/**
* 词元链的有效字符长度
*/
private int payloadLength;
LexemePath() {
@@ -100,7 +106,6 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
/**
* 移除尾部的Lexeme
*
*/
void removeTail() {
Lexeme tail = this.pollLast();
@@ -117,7 +122,6 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
/**
* 检测词元位置交叉有歧义的切分
*
*/
boolean checkCross(Lexeme lexeme) {
return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
@@ -141,7 +145,6 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
/**
* 获取LexemePath的路径长度
*
*/
private int getPathLength() {
return this.pathEnd - this.pathBegin;
@@ -150,7 +153,6 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
/**
* X权重词元长度积
*
*/
private int getXWeight() {
int product = 1;
@@ -196,31 +198,36 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
return -1;
} else if (this.payloadLength < o.payloadLength) {
return 1;
} else {
}
// 比较词元个数越少越好
if (this.size() < o.size()) {
return -1;
} else if (this.size() > o.size()) {
return 1;
} else {
}
// 路径跨度越大越好
if (this.getPathLength() > o.getPathLength()) {
return -1;
} else if (this.getPathLength() < o.getPathLength()) {
return 1;
} else {
}
// 根据统计学结论逆向切分概率高于正向切分因此位置越靠后的优先
if (this.pathEnd > o.pathEnd) {
return -1;
} else if (pathEnd < o.pathEnd) {
return 1;
} else {
}
// 词长越平均越好
if (this.getXWeight() > o.getXWeight()) {
return -1;
} else if (this.getXWeight() < o.getXWeight()) {
return 1;
} else {
}
// 词元位置权重比较
if (this.getPWeight() > o.getPWeight()) {
return -1;
@@ -228,11 +235,6 @@ class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
return 1;
}
}
}
}
}
}
return 0;
}
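
(Editor's note, not part of the diff: the flattened compareTo above keeps the original priority order — longer payload first, then fewer lexemes, wider path span, later path end, larger X-weight, larger P-weight. For reference only, the same ordering can be read as a Comparator chain; the accessors used below, such as getPayloadLength and getPWeight, are hypothetical stand-ins for the class's private fields.)

// Illustrative sketch only: the LexemePath disambiguation priority as a Comparator chain.
// Accessor names are assumed; the real fields are private to LexemePath.
Comparator<LexemePath> byDisambiguationPriority =
        Comparator.<LexemePath>comparingInt(LexemePath::getPayloadLength).reversed()          // longer payload first
                .thenComparingInt(LexemePath::size)                                           // fewer lexemes first
                .thenComparing(Comparator.comparingInt(LexemePath::getPathLength).reversed()) // wider span first
                .thenComparing(Comparator.comparingInt(LexemePath::getPathEnd).reversed())    // later path end first
                .thenComparing(Comparator.comparingInt(LexemePath::getXWeight).reversed())    // more even word lengths first
                .thenComparing(Comparator.comparingInt(LexemePath::getPWeight).reversed());   // higher position weight first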

View File

@@ -28,14 +28,20 @@
package org.wltea.analyzer.core;
/**
* IK分词器专用的Lexem快速排序集合
* IK分词器专用的Lexeme快速排序集合
*/
class QuickSortSet {
//链表头
/**
* 链表头
*/
private Cell head;
//链表尾
/**
* 链表尾
*/
private Cell tail;
//链表的实际大小
/**
* 链表的实际大小
*/
private int size;
QuickSortSet() {
@@ -53,16 +59,15 @@ class QuickSortSet {
this.size++;
} else {
/*if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同不放入集合
}else */
if (this.tail.compareTo(newCell) < 0) {//词元接入链表尾部
if (this.tail.compareTo(newCell) < 0) {
// 词元接入链表尾部
this.tail.next = newCell;
newCell.prev = this.tail;
this.tail = newCell;
this.size++;
} else if (this.head.compareTo(newCell) > 0) {//词元接入链表头部
} else if (this.head.compareTo(newCell) > 0) {
// 词元接入链表头部
this.head.prev = newCell;
newCell.next = this.head;
this.head = newCell;
@@ -74,10 +79,9 @@ class QuickSortSet {
while (index != null && index.compareTo(newCell) > 0) {
index = index.prev;
}
/*if(index.compareTo(newCell) == 0){//词元与集合中的词元重复不放入集合
}else */
if ((index != null ? index.compareTo(newCell) : 1) < 0) {//词元插入链表中的某个位置
// 词元插入链表中的某个位置
if ((index != null ? index.compareTo(newCell) : 1) < 0) {
newCell.prev = index;
newCell.next = index.next;
index.next.prev = newCell;

View File

@@ -37,24 +37,38 @@ import java.util.Map;
@SuppressWarnings("unused")
class DictSegment implements Comparable<DictSegment> {
//公用字典表存储汉字
/**
* 公用字典表存储汉字
*/
private static final Map<Character, Character> charMap = new HashMap<>(16, 0.95f);
//数组大小上限
/**
* 数组大小上限
*/
private static final int ARRAY_LENGTH_LIMIT = 3;
//Map存储结构
private Map<Character, DictSegment> childrenMap;
//数组方式存储结构
private DictSegment[] childrenArray;
/**
* Map存储结构
*/
private volatile Map<Character, DictSegment> childrenMap;
/**
* 数组方式存储结构
*/
private volatile DictSegment[] childrenArray;
//当前节点上存储的字符
private Character nodeChar;
//当前节点存储的Segment数目
//storeSize <=ARRAY_LENGTH_LIMIT 使用数组存储 storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
/**
* 当前节点上存储的字符
*/
private final Character nodeChar;
/**
* 当前节点存储的Segment数目
* storeSize <=ARRAY_LENGTH_LIMIT 使用数组存储 storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
*/
private int storeSize = 0;
//当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
/**
* 当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
*/
private int nodeState = 0;
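
(Editor's note, not part of the diff: the comment above documents the storage strategy — up to ARRAY_LENGTH_LIMIT children are kept in a sorted array, beyond that a Map is used, and both references are now volatile. A rough sketch of a child lookup following that rule is shown below; the lookupChild helper itself is hypothetical, while the field names mirror the diff.)

// Hypothetical helper illustrating the array-vs-map storage rule described above.
// Field names (childrenArray, childrenMap, storeSize) come from the diff; this exact
// method is not part of the class.
private DictSegment lookupChild(Character keyChar) {
    DictSegment[] array = this.childrenArray;                      // volatile read
    if (array != null) {
        // small fan-out (storeSize <= ARRAY_LENGTH_LIMIT): binary search the sorted array
        DictSegment key = new DictSegment(keyChar);
        int pos = java.util.Arrays.binarySearch(array, 0, this.storeSize, key);
        return pos >= 0 ? array[pos] : null;
    }
    java.util.Map<Character, DictSegment> map = this.childrenMap;  // volatile read
    if (map != null) {
        // large fan-out: constant-time hash lookup
        return map.get(keyChar);
    }
    return null;
}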

View File

@@ -27,14 +27,14 @@
*/
package org.wltea.analyzer.dic;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.List;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
/**
* 词典管理类单例模式
*/
@@ -44,7 +44,7 @@ public class Dictionary {
/*
* 词典单子实例
*/
private static Dictionary singleton;
private static volatile Dictionary singleton;
/*
* 主词典对象
@@ -63,7 +63,7 @@ public class Dictionary {
/**
* 配置对象
*/
private Configuration cfg;
private final Configuration cfg;
/**
* 私有构造方法阻止外部直接实例化本类
@@ -326,7 +326,7 @@ public class Dictionary {
// 建立一个量词典实例
_QuantifierDict = new DictSegment((char) 0);
// 读取量词词典文件
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary());
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDictionary());
if (is == null) {
throw new RuntimeException("Quantifier Dictionary not found!!!");
}
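
(Editor's note, not part of the diff: the singleton field is now declared volatile, per commit 7149c54de7. A minimal double-checked-locking sketch, independent of Dictionary's actual initialize/getSingleton signatures, shows why the volatile write matters.)

// Illustrative only: generic double-checked locking with a volatile field.
// Without volatile, another thread could observe a non-null reference to a
// partially constructed instance because the field write may be reordered.
public final class LazySingleton {
    private static volatile LazySingleton instance;

    private LazySingleton() {
    }

    public static LazySingleton getInstance() {
        if (instance == null) {                          // first check, no lock
            synchronized (LazySingleton.class) {
                if (instance == null) {                  // second check, under the lock
                    instance = new LazySingleton();      // safe publication via volatile write
                }
            }
        }
        return instance;
    }
}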

View File

@@ -32,24 +32,33 @@ package org.wltea.analyzer.dic;
*/
@SuppressWarnings("unused")
public class Hit {
//Hit不匹配
/**
* Hit不匹配
*/
private static final int UNMATCH = 0x00000000;
//Hit完全匹配
/**
* Hit完全匹配
*/
private static final int MATCH = 0x00000001;
//Hit前缀匹配
/**
* Hit前缀匹配
*/
private static final int PREFIX = 0x00000010;
//该HIT当前状态默认未匹配
/**
* 该HIT当前状态默认未匹配
*/
private int hitState = UNMATCH;
//记录词典匹配过程中当前匹配到的词典分支节点
/**
* 记录词典匹配过程中当前匹配到的词典分支节点
*/
private DictSegment matchedDictSegment;
/*
/**
* 词段开始位置
*/
private int begin;
/*
/**
* 词段的结束位置
*/
private int end;
@@ -86,9 +95,7 @@ public class Hit {
public boolean isUnmatch() {
return this.hitState == UNMATCH ;
}
/**
*
*/
void setUnmatch() {
this.hitState = UNMATCH;
}

View File

@@ -36,7 +36,7 @@ import org.apache.lucene.analysis.Tokenizer;
@SuppressWarnings("unused")
public final class IKAnalyzer extends Analyzer {
private boolean useSmart;
private final boolean useSmart;
private boolean useSmart() {
return useSmart;

View File

@@ -39,21 +39,30 @@ import java.io.IOException;
/**
* IK分词器 Lucene Tokenizer适配器类
* 兼容Lucene 4.0版本
*/
@SuppressWarnings("unused")
@SuppressWarnings({"unused", "FinalMethodInFinalClass"})
public final class IKTokenizer extends Tokenizer {
//IK分词器实现
/**
* IK分词器实现
*/
private IKSegmenter _IKImplement;
//词元文本属性
/**
* 词元文本属性
*/
private CharTermAttribute termAtt;
//词元位移属性
/**
* 词元位移属性
*/
private OffsetAttribute offsetAtt;
//词元分类属性该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量
/**
* 词元分类属性该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量
*/
private TypeAttribute typeAtt;
//记录最后一个词元的结束位置
/**
* 记录最后一个词元的结束位置
*/
private int endPosition;
/**
@@ -84,7 +93,8 @@ public final class IKTokenizer extends Tokenizer {
_IKImplement = new IKSegmenter(input, useSmart);
}
/* (non-Javadoc)
/*
* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
*/
@Override
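
(Editor's note, not part of the diff: for context, a minimal usage sketch of the analyzer these token attributes feed is shown below. It assumes the IKAnalyzer(boolean useSmart) constructor; the field name and sample text are placeholders, not taken from the repository.)

// Hypothetical usage sketch (field name and sample text are placeholders).
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IKAnalyzerDemo {
    public static void main(String[] args) throws Exception {
        try (IKAnalyzer analyzer = new IKAnalyzer(true);             // assumed useSmart constructor
             TokenStream ts = analyzer.tokenStream("content", "IK Analyzer 是一个开源的中文分词工具包")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString());                 // print each lexeme text
            }
            ts.end();
        }
    }
}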

View File

@@ -21,8 +21,8 @@
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.3.1版本 Magese (magese@live.cn) 更新
* release 8.3.1 update by Magese(magese@live.cn)
* 8.5.0版本 Magese (magese@live.cn) 更新
* release 8.5.0 update by Magese(magese@live.cn)
*
*/
package org.wltea.analyzer.lucene;
@@ -44,6 +44,8 @@ import java.nio.charset.StandardCharsets;
import java.util.*;
/**
* 分词器工厂类
*
* @author <a href="magese@live.cn">Magese</a>
*/
public class IKTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware, UpdateThread.UpdateJob {

View File

@@ -46,11 +46,11 @@ import java.util.Stack;
public class IKQueryExpressionParser {
private List<Element> elements = new ArrayList<>();
private final List<Element> elements = new ArrayList<>();
private Stack<Query> querys = new Stack<>();
private final Stack<Query> querys = new Stack<>();
private Stack<Element> operates = new Stack<>();
private final Stack<Element> operates = new Stack<>();
/**
* 解析查询表达式生成Lucene Query对象
@@ -87,263 +87,263 @@ public class IKQueryExpressionParser {
if (expression == null) {
return;
}
Element curretElement = null;
Element currentElement = null;
char[] expChars = expression.toCharArray();
for (char expChar : expChars) {
switch (expChar) {
case '&':
if (curretElement == null) {
curretElement = new Element();
curretElement.type = '&';
curretElement.append(expChar);
} else if (curretElement.type == '&') {
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
} else if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement == null) {
currentElement = new Element();
currentElement.type = '&';
currentElement.append(expChar);
} else if (currentElement.type == '&') {
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
} else if (currentElement.type == '\'') {
currentElement.append(expChar);
} else {
this.elements.add(curretElement);
curretElement = new Element();
curretElement.type = '&';
curretElement.append(expChar);
this.elements.add(currentElement);
currentElement = new Element();
currentElement.type = '&';
currentElement.append(expChar);
}
break;
case '|':
if (curretElement == null) {
curretElement = new Element();
curretElement.type = '|';
curretElement.append(expChar);
} else if (curretElement.type == '|') {
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
} else if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement == null) {
currentElement = new Element();
currentElement.type = '|';
currentElement.append(expChar);
} else if (currentElement.type == '|') {
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
} else if (currentElement.type == '\'') {
currentElement.append(expChar);
} else {
this.elements.add(curretElement);
curretElement = new Element();
curretElement.type = '|';
curretElement.append(expChar);
this.elements.add(currentElement);
currentElement = new Element();
currentElement.type = '|';
currentElement.append(expChar);
}
break;
case '-':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = '-';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = '-';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case '(':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = '(';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = '(';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case ')':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = ')';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = ')';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case ':':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = ':';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = ':';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case '=':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = '=';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = '=';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case ' ':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
} else {
this.elements.add(curretElement);
curretElement = null;
this.elements.add(currentElement);
currentElement = null;
}
}
break;
case '\'':
if (curretElement == null) {
curretElement = new Element();
curretElement.type = '\'';
if (currentElement == null) {
currentElement = new Element();
currentElement.type = '\'';
} else if (curretElement.type == '\'') {
this.elements.add(curretElement);
curretElement = null;
} else if (currentElement.type == '\'') {
this.elements.add(currentElement);
currentElement = null;
} else {
this.elements.add(curretElement);
curretElement = new Element();
curretElement.type = '\'';
this.elements.add(currentElement);
currentElement = new Element();
currentElement.type = '\'';
}
break;
case '[':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = '[';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = '[';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case ']':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = ']';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = ']';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case '{':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = '{';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = '{';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case '}':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = '}';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = '}';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
case ',':
if (curretElement != null) {
if (curretElement.type == '\'') {
curretElement.append(expChar);
if (currentElement != null) {
if (currentElement.type == '\'') {
currentElement.append(expChar);
continue;
} else {
this.elements.add(curretElement);
this.elements.add(currentElement);
}
}
curretElement = new Element();
curretElement.type = ',';
curretElement.append(expChar);
this.elements.add(curretElement);
curretElement = null;
currentElement = new Element();
currentElement.type = ',';
currentElement.append(expChar);
this.elements.add(currentElement);
currentElement = null;
break;
default:
if (curretElement == null) {
curretElement = new Element();
curretElement.type = 'F';
curretElement.append(expChar);
if (currentElement == null) {
currentElement = new Element();
currentElement.type = 'F';
currentElement.append(expChar);
} else if (curretElement.type == 'F') {
curretElement.append(expChar);
} else if (currentElement.type == 'F') {
currentElement.append(expChar);
} else if (curretElement.type == '\'') {
curretElement.append(expChar);
} else if (currentElement.type == '\'') {
currentElement.append(expChar);
} else {
this.elements.add(curretElement);
curretElement = new Element();
curretElement.type = 'F';
curretElement.append(expChar);
this.elements.add(currentElement);
currentElement = new Element();
currentElement.type = 'F';
currentElement.append(expChar);
}
}
}
if (curretElement != null) {
this.elements.add(curretElement);
if (currentElement != null) {
this.elements.add(currentElement);
}
}
@@ -673,7 +673,7 @@ public class IKQueryExpressionParser {
* @author linliangyi
* May 20, 2010
*/
private class Element {
private static class Element {
char type = 0;
StringBuffer eleTextBuff;
@ -692,11 +692,9 @@ public class IKQueryExpressionParser {
public static void main(String[] args) {
IKQueryExpressionParser parser = new IKQueryExpressionParser();
//String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";
String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
Query result = parser.parseExp(ikQueryExp);
System.out.println(result);
}
}

View File

@@ -45,6 +45,7 @@ import java.util.List;
*
* @author linliangyi
*/
@SuppressWarnings("unused")
class SWMCQueryBuilder {
/**
@@ -118,8 +119,8 @@ class SWMCQueryBuilder {
// 借助lucene queryparser 生成SWMC Query
QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
qp.setAutoGeneratePhraseQueries(false);
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
qp.setAutoGeneratePhraseQueries(true);
if ((shortCount * 1.0f / totalCount) > 0.5f) {
try {