Resolve QueryParser exception.

This commit is contained in:
Magese 2022-01-04 10:36:49 +08:00
parent fb4defedb7
commit f0059e2c78
2 changed files with 191 additions and 192 deletions

View File

@ -46,11 +46,11 @@ import java.util.Stack;
public class IKQueryExpressionParser { public class IKQueryExpressionParser {
private List<Element> elements = new ArrayList<>(); private final List<Element> elements = new ArrayList<>();
private Stack<Query> querys = new Stack<>(); private final Stack<Query> querys = new Stack<>();
private Stack<Element> operates = new Stack<>(); private final Stack<Element> operates = new Stack<>();
/** /**
* 解析查询表达式生成Lucene Query对象 * 解析查询表达式生成Lucene Query对象
@ -61,9 +61,9 @@ public class IKQueryExpressionParser {
Query lucenceQuery = null; Query lucenceQuery = null;
if (expression != null && !"".equals(expression.trim())) { if (expression != null && !"".equals(expression.trim())) {
try { try {
//文法解析 // 文法解析
this.splitElements(expression); this.splitElements(expression);
//语法解析 // 语法解析
this.parseSyntax(); this.parseSyntax();
if (this.querys.size() == 1) { if (this.querys.size() == 1) {
lucenceQuery = this.querys.pop(); lucenceQuery = this.querys.pop();
@ -87,263 +87,263 @@ public class IKQueryExpressionParser {
if (expression == null) { if (expression == null) {
return; return;
} }
Element curretElement = null; Element currentElement = null;
char[] expChars = expression.toCharArray(); char[] expChars = expression.toCharArray();
for (char expChar : expChars) { for (char expChar : expChars) {
switch (expChar) { switch (expChar) {
case '&': case '&':
if (curretElement == null) { if (currentElement == null) {
curretElement = new Element(); currentElement = new Element();
curretElement.type = '&'; currentElement.type = '&';
curretElement.append(expChar); currentElement.append(expChar);
} else if (curretElement.type == '&') { } else if (currentElement.type == '&') {
curretElement.append(expChar); currentElement.append(expChar);
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
} else if (curretElement.type == '\'') { } else if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = new Element(); currentElement = new Element();
curretElement.type = '&'; currentElement.type = '&';
curretElement.append(expChar); currentElement.append(expChar);
} }
break; break;
case '|': case '|':
if (curretElement == null) { if (currentElement == null) {
curretElement = new Element(); currentElement = new Element();
curretElement.type = '|'; currentElement.type = '|';
curretElement.append(expChar); currentElement.append(expChar);
} else if (curretElement.type == '|') { } else if (currentElement.type == '|') {
curretElement.append(expChar); currentElement.append(expChar);
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
} else if (curretElement.type == '\'') { } else if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = new Element(); currentElement = new Element();
curretElement.type = '|'; currentElement.type = '|';
curretElement.append(expChar); currentElement.append(expChar);
} }
break; break;
case '-': case '-':
if (curretElement != null) { if (currentElement != null) {
if (curretElement.type == '\'') { if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
} }
} }
curretElement = new Element(); currentElement = new Element();
curretElement.type = '-'; currentElement.type = '-';
curretElement.append(expChar); currentElement.append(expChar);
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
break; break;
case '(': case '(':
if (curretElement != null) { if (currentElement != null) {
if (curretElement.type == '\'') { if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
} }
} }
curretElement = new Element(); currentElement = new Element();
curretElement.type = '('; currentElement.type = '(';
curretElement.append(expChar); currentElement.append(expChar);
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
break; break;
case ')': case ')':
if (curretElement != null) { if (currentElement != null) {
if (curretElement.type == '\'') { if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
} }
} }
curretElement = new Element(); currentElement = new Element();
curretElement.type = ')'; currentElement.type = ')';
curretElement.append(expChar); currentElement.append(expChar);
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
break; break;
case ':': case ':':
if (curretElement != null) { if (currentElement != null) {
if (curretElement.type == '\'') { if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
} }
} }
curretElement = new Element(); currentElement = new Element();
curretElement.type = ':'; currentElement.type = ':';
curretElement.append(expChar); currentElement.append(expChar);
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
break; break;
case '=': case '=':
if (curretElement != null) { if (currentElement != null) {
if (curretElement.type == '\'') { if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
} }
} }
curretElement = new Element(); currentElement = new Element();
curretElement.type = '='; currentElement.type = '=';
curretElement.append(expChar); currentElement.append(expChar);
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
break; break;
case ' ': case ' ':
if (curretElement != null) { if (currentElement != null) {
if (curretElement.type == '\'') { if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
} }
} }
break; break;
case '\'': case '\'':
if (curretElement == null) { if (currentElement == null) {
curretElement = new Element(); currentElement = new Element();
curretElement.type = '\''; currentElement.type = '\'';
} else if (curretElement.type == '\'') { } else if (currentElement.type == '\'') {
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = new Element(); currentElement = new Element();
curretElement.type = '\''; currentElement.type = '\'';
} }
break; break;
case '[': case '[':
if (curretElement != null) { if (currentElement != null) {
if (curretElement.type == '\'') { if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
} }
} }
curretElement = new Element(); currentElement = new Element();
curretElement.type = '['; currentElement.type = '[';
curretElement.append(expChar); currentElement.append(expChar);
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
break; break;
case ']': case ']':
if (curretElement != null) { if (currentElement != null) {
if (curretElement.type == '\'') { if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
} }
} }
curretElement = new Element(); currentElement = new Element();
curretElement.type = ']'; currentElement.type = ']';
curretElement.append(expChar); currentElement.append(expChar);
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
break; break;
case '{': case '{':
if (curretElement != null) { if (currentElement != null) {
if (curretElement.type == '\'') { if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
} }
} }
curretElement = new Element(); currentElement = new Element();
curretElement.type = '{'; currentElement.type = '{';
curretElement.append(expChar); currentElement.append(expChar);
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
break; break;
case '}': case '}':
if (curretElement != null) { if (currentElement != null) {
if (curretElement.type == '\'') { if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
} }
} }
curretElement = new Element(); currentElement = new Element();
curretElement.type = '}'; currentElement.type = '}';
curretElement.append(expChar); currentElement.append(expChar);
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
break; break;
case ',': case ',':
if (curretElement != null) { if (currentElement != null) {
if (curretElement.type == '\'') { if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
continue; continue;
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
} }
} }
curretElement = new Element(); currentElement = new Element();
curretElement.type = ','; currentElement.type = ',';
curretElement.append(expChar); currentElement.append(expChar);
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = null; currentElement = null;
break; break;
default: default:
if (curretElement == null) { if (currentElement == null) {
curretElement = new Element(); currentElement = new Element();
curretElement.type = 'F'; currentElement.type = 'F';
curretElement.append(expChar); currentElement.append(expChar);
} else if (curretElement.type == 'F') { } else if (currentElement.type == 'F') {
curretElement.append(expChar); currentElement.append(expChar);
} else if (curretElement.type == '\'') { } else if (currentElement.type == '\'') {
curretElement.append(expChar); currentElement.append(expChar);
} else { } else {
this.elements.add(curretElement); this.elements.add(currentElement);
curretElement = new Element(); currentElement = new Element();
curretElement.type = 'F'; currentElement.type = 'F';
curretElement.append(expChar); currentElement.append(expChar);
} }
} }
} }
if (curretElement != null) { if (currentElement != null) {
this.elements.add(curretElement); this.elements.add(currentElement);
} }
} }
@ -359,7 +359,7 @@ public class IKQueryExpressionParser {
throw new IllegalStateException("表达式异常: = 或 号丢失"); throw new IllegalStateException("表达式异常: = 或 号丢失");
} }
Element e3 = this.elements.get(i + 2); Element e3 = this.elements.get(i + 2);
//处理 = 运算 // 处理 = 运算
if ('\'' == e3.type) { if ('\'' == e3.type) {
i += 2; i += 2;
if ('=' == e2.type) { if ('=' == e2.type) {
@ -367,14 +367,14 @@ public class IKQueryExpressionParser {
this.querys.push(tQuery); this.querys.push(tQuery);
} else { } else {
String keyword = e3.toString(); String keyword = e3.toString();
//SWMCQuery Here // SWMCQuery Here
Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword); Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword);
this.querys.push(_SWMCQuery); this.querys.push(_SWMCQuery);
} }
} else if ('[' == e3.type || '{' == e3.type) { } else if ('[' == e3.type || '{' == e3.type) {
i += 2; i += 2;
//处理 [] {} // 处理 [] {}
LinkedList<Element> eQueue = new LinkedList<>(); LinkedList<Element> eQueue = new LinkedList<>();
eQueue.add(e3); eQueue.add(e3);
for (i++; i < this.elements.size(); i++) { for (i++; i < this.elements.size(); i++) {
@ -384,7 +384,7 @@ public class IKQueryExpressionParser {
break; break;
} }
} }
//翻译RangeQuery // 翻译RangeQuery
Query rangeQuery = this.toTermRangeQuery(e, eQueue); Query rangeQuery = this.toTermRangeQuery(e, eQueue);
this.querys.push(rangeQuery); this.querys.push(rangeQuery);
} else { } else {
@ -475,10 +475,10 @@ public class IKQueryExpressionParser {
} }
} else { } else {
//q1 instanceof TermQuery // q1 instanceof TermQuery
//q1 instanceof TermRangeQuery // q1 instanceof TermRangeQuery
//q1 instanceof PhraseQuery // q1 instanceof PhraseQuery
//others // others
resultQuery.add(q1, Occur.MUST); resultQuery.add(q1, Occur.MUST);
} }
} }
@ -496,10 +496,10 @@ public class IKQueryExpressionParser {
} }
} else { } else {
//q1 instanceof TermQuery // q1 instanceof TermQuery
//q1 instanceof TermRangeQuery // q1 instanceof TermRangeQuery
//q1 instanceof PhraseQuery // q1 instanceof PhraseQuery
//others // others
resultQuery.add(q2, Occur.MUST); resultQuery.add(q2, Occur.MUST);
} }
} }
@ -518,10 +518,10 @@ public class IKQueryExpressionParser {
} }
} else { } else {
//q1 instanceof TermQuery // q1 instanceof TermQuery
//q1 instanceof TermRangeQuery // q1 instanceof TermRangeQuery
//q1 instanceof PhraseQuery // q1 instanceof PhraseQuery
//others // others
resultQuery.add(q1, Occur.SHOULD); resultQuery.add(q1, Occur.SHOULD);
} }
} }
@ -538,10 +538,10 @@ public class IKQueryExpressionParser {
resultQuery.add(q2, Occur.SHOULD); resultQuery.add(q2, Occur.SHOULD);
} }
} else { } else {
//q2 instanceof TermQuery // q2 instanceof TermQuery
//q2 instanceof TermRangeQuery // q2 instanceof TermRangeQuery
//q2 instanceof PhraseQuery // q2 instanceof PhraseQuery
//others // others
resultQuery.add(q2, Occur.SHOULD); resultQuery.add(q2, Occur.SHOULD);
} }
@ -563,10 +563,10 @@ public class IKQueryExpressionParser {
} }
} else { } else {
//q1 instanceof TermQuery // q1 instanceof TermQuery
//q1 instanceof TermRangeQuery // q1 instanceof TermRangeQuery
//q1 instanceof PhraseQuery // q1 instanceof PhraseQuery
//others // others
resultQuery.add(q1, Occur.MUST); resultQuery.add(q1, Occur.MUST);
} }
@ -584,7 +584,7 @@ public class IKQueryExpressionParser {
boolean includeLast; boolean includeLast;
String firstValue; String firstValue;
String lastValue = null; String lastValue = null;
//检查第一个元素是否是[或者{ // 检查第一个元素是否是[或者{
Element first = elements.getFirst(); Element first = elements.getFirst();
if ('[' == first.type) { if ('[' == first.type) {
includeFirst = true; includeFirst = true;
@ -593,7 +593,7 @@ public class IKQueryExpressionParser {
} else { } else {
throw new IllegalStateException("表达式异常"); throw new IllegalStateException("表达式异常");
} }
//检查最后一个元素是否是]或者} // 检查最后一个元素是否是]或者}
Element last = elements.getLast(); Element last = elements.getLast();
if (']' == last.type) { if (']' == last.type) {
includeLast = true; includeLast = true;
@ -605,7 +605,7 @@ public class IKQueryExpressionParser {
if (elements.size() < 4 || elements.size() > 5) { if (elements.size() < 4 || elements.size() > 5) {
throw new IllegalStateException("表达式异常, RangeQuery 错误"); throw new IllegalStateException("表达式异常, RangeQuery 错误");
} }
//读出中间部分 // 读出中间部分
Element e2 = elements.get(1); Element e2 = elements.get(1);
if ('\'' == e2.type) { if ('\'' == e2.type) {
firstValue = e2.toString(); firstValue = e2.toString();
@ -673,7 +673,7 @@ public class IKQueryExpressionParser {
* @author linliangyi * @author linliangyi
* May 20, 2010 * May 20, 2010
*/ */
private class Element { private static class Element {
char type = 0; char type = 0;
StringBuffer eleTextBuff; StringBuffer eleTextBuff;
@ -692,11 +692,9 @@ public class IKQueryExpressionParser {
public static void main(String[] args) { public static void main(String[] args) {
IKQueryExpressionParser parser = new IKQueryExpressionParser(); IKQueryExpressionParser parser = new IKQueryExpressionParser();
//String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";
String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'"; String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
Query result = parser.parseExp(ikQueryExp); Query result = parser.parseExp(ikQueryExp);
System.out.println(result); System.out.println(result);
} }
} }

View File

@ -45,6 +45,7 @@ import java.util.List;
* *
* @author linliangyi * @author linliangyi
*/ */
@SuppressWarnings("unused")
class SWMCQueryBuilder { class SWMCQueryBuilder {
/** /**
@ -56,9 +57,9 @@ class SWMCQueryBuilder {
if (fieldName == null || keywords == null) { if (fieldName == null || keywords == null) {
throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null."); throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
} }
//1.对keywords进行分词处理 // 1.对keywords进行分词处理
List<Lexeme> lexemes = doAnalyze(keywords); List<Lexeme> lexemes = doAnalyze(keywords);
//2.根据分词结果生成SWMCQuery // 2.根据分词结果生成SWMCQuery
return getSWMCQuery(fieldName, lexemes); return getSWMCQuery(fieldName, lexemes);
} }
@ -84,20 +85,20 @@ class SWMCQueryBuilder {
* 根据分词结果生成SWMC搜索 * 根据分词结果生成SWMC搜索
*/ */
private static Query getSWMCQuery(String fieldName, List<Lexeme> lexemes) { private static Query getSWMCQuery(String fieldName, List<Lexeme> lexemes) {
//构造SWMC的查询表达式 // 构造SWMC的查询表达式
StringBuilder keywordBuffer = new StringBuilder(); StringBuilder keywordBuffer = new StringBuilder();
//精简的SWMC的查询表达式 // 精简的SWMC的查询表达式
StringBuilder keywordBuffer_Short = new StringBuilder(); StringBuilder keywordBuffer_Short = new StringBuilder();
//记录最后词元长度 // 记录最后词元长度
int lastLexemeLength = 0; int lastLexemeLength = 0;
//记录最后词元结束位置 // 记录最后词元结束位置
int lastLexemeEnd = -1; int lastLexemeEnd = -1;
int shortCount = 0; int shortCount = 0;
int totalCount = 0; int totalCount = 0;
for (Lexeme l : lexemes) { for (Lexeme l : lexemes) {
totalCount += l.getLength(); totalCount += l.getLength();
//精简表达式 // 精简表达式
if (l.getLength() > 1) { if (l.getLength() > 1) {
keywordBuffer_Short.append(' ').append(l.getLexemeText()); keywordBuffer_Short.append(' ').append(l.getLexemeText());
shortCount += l.getLength(); shortCount += l.getLength();
@ -106,7 +107,7 @@ class SWMCQueryBuilder {
if (lastLexemeLength == 0) { if (lastLexemeLength == 0) {
keywordBuffer.append(l.getLexemeText()); keywordBuffer.append(l.getLexemeText());
} else if (lastLexemeLength == 1 && l.getLength() == 1 } else if (lastLexemeLength == 1 && l.getLength() == 1
&& lastLexemeEnd == l.getBeginPosition()) {//单字位置相邻长度为一合并) && lastLexemeEnd == l.getBeginPosition()) {// 单字位置相邻长度为一合并)
keywordBuffer.append(l.getLexemeText()); keywordBuffer.append(l.getLexemeText());
} else { } else {
keywordBuffer.append(' ').append(l.getLexemeText()); keywordBuffer.append(' ').append(l.getLexemeText());
@ -116,10 +117,10 @@ class SWMCQueryBuilder {
lastLexemeEnd = l.getEndPosition(); lastLexemeEnd = l.getEndPosition();
} }
//借助lucene queryparser 生成SWMC Query // 借助lucene queryparser 生成SWMC Query
QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer()); QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
qp.setAutoGeneratePhraseQueries(false);
qp.setDefaultOperator(QueryParser.AND_OPERATOR); qp.setDefaultOperator(QueryParser.AND_OPERATOR);
qp.setAutoGeneratePhraseQueries(true);
if ((shortCount * 1.0f / totalCount) > 0.5f) { if ((shortCount * 1.0f / totalCount) > 0.5f) {
try { try {