LUCENE-3981: fix regex queryparsing issues (improperly recognized as wildcard, etc)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1326893 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-04-17 02:51:43 +00:00
parent a4cbd8a605
commit 167b296353
14 changed files with 128 additions and 83 deletions

View File

@ -137,7 +137,7 @@ PARSER_END(QueryParser)
// every character that follows a backslash is considered as an escaped character
| <#_ESCAPED_CHAR: "\\" ~[] >
| <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "\u3000", "+", "-", "!", "(", ")", ":", "^",
"[", "]", "\"", "{", "}", "~", "*", "?", "\\" ]
"[", "]", "\"", "{", "}", "~", "*", "?", "\\", "/" ]
| <_ESCAPED_CHAR> ) >
| <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> | "-" | "+" ) >
| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") >

View File

@ -103,7 +103,7 @@ private int jjMoveNfa_2(int startState, int curPos)
switch(jjstateSet[--i])
{
case 0:
if ((0xfbffd4f8ffffd9ffL & l) != 0L)
if ((0xfbff54f8ffffd9ffL & l) != 0L)
{
if (kind > 23)
kind = 23;
@ -116,13 +116,15 @@ private int jjMoveNfa_2(int startState, int curPos)
}
else if ((0x280200000000L & l) != 0L)
jjstateSet[jjnewStateCnt++] = 15;
else if (curChar == 34)
else if (curChar == 47)
jjCheckNAddStates(0, 2);
if ((0x7bffd0f8ffffd9ffL & l) != 0L)
else if (curChar == 34)
jjCheckNAddStates(3, 5);
if ((0x7bff50f8ffffd9ffL & l) != 0L)
{
if (kind > 20)
kind = 20;
jjCheckNAddStates(3, 7);
jjCheckNAddStates(6, 10);
}
else if (curChar == 42)
{
@ -134,14 +136,12 @@ private int jjMoveNfa_2(int startState, int curPos)
if (kind > 10)
kind = 10;
}
if (curChar == 47)
jjCheckNAddStates(8, 10);
else if (curChar == 38)
if (curChar == 38)
jjstateSet[jjnewStateCnt++] = 4;
break;
case 43:
case 27:
if ((0xfbfffcf8ffffd9ffL & l) == 0L)
if ((0xfbff7cf8ffffd9ffL & l) == 0L)
break;
if (kind > 23)
kind = 23;
@ -169,14 +169,14 @@ private int jjMoveNfa_2(int startState, int curPos)
break;
case 16:
if (curChar == 34)
jjCheckNAddStates(0, 2);
jjCheckNAddStates(3, 5);
break;
case 17:
if ((0xfffffffbffffffffL & l) != 0L)
jjCheckNAddStates(0, 2);
jjCheckNAddStates(3, 5);
break;
case 19:
jjCheckNAddStates(0, 2);
jjCheckNAddStates(3, 5);
break;
case 20:
if (curChar == 34 && kind > 19)
@ -205,7 +205,7 @@ private int jjMoveNfa_2(int startState, int curPos)
kind = 22;
break;
case 26:
if ((0xfbffd4f8ffffd9ffL & l) == 0L)
if ((0xfbff54f8ffffd9ffL & l) == 0L)
break;
if (kind > 23)
kind = 23;
@ -219,25 +219,25 @@ private int jjMoveNfa_2(int startState, int curPos)
case 30:
case 32:
if (curChar == 47)
jjCheckNAddStates(8, 10);
jjCheckNAddStates(0, 2);
break;
case 31:
if ((0xffff7fffffffffffL & l) != 0L)
jjCheckNAddStates(8, 10);
jjCheckNAddStates(0, 2);
break;
case 34:
if (curChar == 47 && kind > 24)
kind = 24;
break;
case 35:
if ((0x7bffd0f8ffffd9ffL & l) == 0L)
if ((0x7bff50f8ffffd9ffL & l) == 0L)
break;
if (kind > 20)
kind = 20;
jjCheckNAddStates(3, 7);
jjCheckNAddStates(6, 10);
break;
case 36:
if ((0x7bfff8f8ffffd9ffL & l) == 0L)
if ((0x7bff78f8ffffd9ffL & l) == 0L)
break;
if (kind > 20)
kind = 20;
@ -249,7 +249,7 @@ private int jjMoveNfa_2(int startState, int curPos)
jjCheckNAddTwoStates(36, 37);
break;
case 39:
if ((0x7bfff8f8ffffd9ffL & l) != 0L)
if ((0x7bff78f8ffffd9ffL & l) != 0L)
jjCheckNAddStates(13, 15);
break;
case 41:
@ -271,7 +271,7 @@ private int jjMoveNfa_2(int startState, int curPos)
{
if (kind > 20)
kind = 20;
jjCheckNAddStates(3, 7);
jjCheckNAddStates(6, 10);
}
else if (curChar == 92)
jjCheckNAddStates(16, 18);
@ -348,14 +348,14 @@ private int jjMoveNfa_2(int startState, int curPos)
break;
case 17:
if ((0xffffffffefffffffL & l) != 0L)
jjCheckNAddStates(0, 2);
jjCheckNAddStates(3, 5);
break;
case 18:
if (curChar == 92)
jjstateSet[jjnewStateCnt++] = 19;
break;
case 19:
jjCheckNAddStates(0, 2);
jjCheckNAddStates(3, 5);
break;
case 21:
if (curChar != 126)
@ -388,7 +388,7 @@ private int jjMoveNfa_2(int startState, int curPos)
jjCheckNAddTwoStates(27, 28);
break;
case 31:
jjAddStates(8, 10);
jjAddStates(0, 2);
break;
case 33:
if (curChar == 92)
@ -399,7 +399,7 @@ private int jjMoveNfa_2(int startState, int curPos)
break;
if (kind > 20)
kind = 20;
jjCheckNAddStates(3, 7);
jjCheckNAddStates(6, 10);
break;
case 36:
if ((0x97ffffff87ffffffL & l) == 0L)
@ -438,7 +438,7 @@ private int jjMoveNfa_2(int startState, int curPos)
}
else
{
int hiByte = (curChar >> 8);
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
@ -463,7 +463,7 @@ private int jjMoveNfa_2(int startState, int curPos)
{
if (kind > 20)
kind = 20;
jjCheckNAddStates(3, 7);
jjCheckNAddStates(6, 10);
}
break;
case 43:
@ -481,7 +481,7 @@ private int jjMoveNfa_2(int startState, int curPos)
case 17:
case 19:
if (jjCanMove_1(hiByte, i1, i2, l1, l2))
jjCheckNAddStates(0, 2);
jjCheckNAddStates(3, 5);
break;
case 26:
if (!jjCanMove_2(hiByte, i1, i2, l1, l2))
@ -499,14 +499,14 @@ private int jjMoveNfa_2(int startState, int curPos)
break;
case 31:
if (jjCanMove_1(hiByte, i1, i2, l1, l2))
jjAddStates(8, 10);
jjAddStates(0, 2);
break;
case 35:
if (!jjCanMove_2(hiByte, i1, i2, l1, l2))
break;
if (kind > 20)
kind = 20;
jjCheckNAddStates(3, 7);
jjCheckNAddStates(6, 10);
break;
case 36:
if (!jjCanMove_2(hiByte, i1, i2, l1, l2))
@ -604,7 +604,7 @@ private int jjMoveNfa_0(int startState, int curPos)
}
else
{
int hiByte = (curChar >> 8);
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
@ -777,7 +777,7 @@ private int jjMoveNfa_1(int startState, int curPos)
}
else
{
int hiByte = (curChar >> 8);
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
@ -828,7 +828,7 @@ private int jjMoveNfa_1(int startState, int curPos)
}
}
static final int[] jjnextStates = {
17, 18, 20, 36, 39, 25, 40, 37, 31, 33, 34, 22, 23, 39, 25, 40,
31, 33, 34, 17, 18, 20, 36, 39, 25, 40, 37, 22, 23, 39, 25, 40,
38, 41, 29, 0, 1, 2, 4, 5,
};
private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2)

View File

@ -33,7 +33,7 @@ public class EscapeQuerySyntaxImpl implements EscapeQuerySyntax {
private static final String[] escapableTermExtraFirstChars = { "+", "-", "@" };
private static final String[] escapableTermChars = { "\"", "<", ">", "=",
"!", "(", ")", "^", "[", "{", ":", "]", "}", "~" };
"!", "(", ")", "^", "[", "{", ":", "]", "}", "~", "/" };
// TODO: check what to do with these "*", "?", "\\"
private static final String[] escapableQuotedChars = { "\"" };

View File

@ -613,4 +613,4 @@ public class JavaCharStream
}
}
/* JavaCC - OriginalChecksum=a050c1d21b27b6d9eed401dd428aa609 (do not edit this line) */
/* JavaCC - OriginalChecksum=7eecaeeaea1254b3e35fe8890a0127ce (do not edit this line) */

View File

@ -193,4 +193,4 @@ public class ParseException extends QueryNodeParseException {
}
}
/* JavaCC - OriginalChecksum=7601d49d11bc059457ae5850628ebc8a (do not edit this line) */
/* JavaCC - OriginalChecksum=0f25f4245374bbf9920c9a82efecadd2 (do not edit this line) */

View File

@ -572,7 +572,8 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
}
q = new FuzzyQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), fms, term.beginColumn, term.endColumn);
} else if (regexp) {
q = new RegexpQueryNode(field, term.image, term.beginColumn, term.endColumn-1);
String re = term.image.substring(1, term.image.length()-1);
q = new RegexpQueryNode(field, re, 0, re.length());
}
break;
case RANGEIN_START:
@ -763,21 +764,31 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
return false;
}
private boolean jj_3R_10() {
if (jj_scan_token(TERM)) return true;
return false;
}
private boolean jj_3R_12() {
if (jj_scan_token(RANGEIN_START)) return true;
return false;
}
private boolean jj_3R_10() {
if (jj_scan_token(TERM)) return true;
return false;
}
private boolean jj_3R_11() {
if (jj_scan_token(REGEXPTERM)) return true;
return false;
}
private boolean jj_3R_8() {
Token xsp;
xsp = jj_scanpos;
if (jj_3R_12()) {
jj_scanpos = xsp;
if (jj_scan_token(27)) return true;
}
return false;
}
private boolean jj_3_1() {
if (jj_scan_token(TERM)) return true;
Token xsp;
@ -789,13 +800,8 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
return false;
}
private boolean jj_3R_8() {
Token xsp;
xsp = jj_scanpos;
if (jj_3R_12()) {
jj_scanpos = xsp;
if (jj_scan_token(27)) return true;
}
private boolean jj_3R_9() {
if (jj_scan_token(QUOTED)) return true;
return false;
}
@ -836,11 +842,6 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
return false;
}
private boolean jj_3R_9() {
if (jj_scan_token(QUOTED)) return true;
return false;
}
/** Generated Token Manager. */
public StandardSyntaxParserTokenManager token_source;
JavaCharStream jj_input_stream;

View File

@ -100,7 +100,7 @@ PARSER_END(StandardSyntaxParser)
// every character that follows a backslash is considered as an escaped character
| <#_ESCAPED_CHAR: "\\" ~[] >
| <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "\u3000", "+", "-", "!", "(", ")", ":", "^",
"<", ">", "=", "[", "]", "\"", "{", "}", "~", "\\" ]
"<", ">", "=", "[", "]", "\"", "{", "}", "~", "\\", "/" ]
| <_ESCAPED_CHAR> ) >
| <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> | "-" | "+" ) >
| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") >
@ -449,7 +449,8 @@ QueryNode Term(CharSequence field) : {
}
q = new FuzzyQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), fms, term.beginColumn, term.endColumn);
} else if (regexp) {
q = new RegexpQueryNode(field, term.image, term.beginColumn, term.endColumn-1);
String re = term.image.substring(1, term.image.length()-1);
q = new RegexpQueryNode(field, re, 0, re.length());
}
}
| ( ( <RANGEIN_START> {startInc=true;} | <RANGEEX_START> )

View File

@ -147,7 +147,7 @@ private int jjMoveNfa_2(int startState, int curPos)
switch(jjstateSet[--i])
{
case 0:
if ((0x8bffd4f8ffffd9ffL & l) != 0L)
if ((0x8bff54f8ffffd9ffL & l) != 0L)
{
if (kind > 23)
kind = 23;
@ -158,16 +158,16 @@ private int jjMoveNfa_2(int startState, int curPos)
if (kind > 7)
kind = 7;
}
else if (curChar == 34)
else if (curChar == 47)
jjCheckNAddStates(0, 2);
else if (curChar == 34)
jjCheckNAddStates(3, 5);
else if (curChar == 33)
{
if (kind > 10)
kind = 10;
}
if (curChar == 47)
jjCheckNAddStates(3, 5);
else if (curChar == 38)
if (curChar == 38)
jjstateSet[jjnewStateCnt++] = 4;
break;
case 4:
@ -184,28 +184,28 @@ private int jjMoveNfa_2(int startState, int curPos)
break;
case 14:
if (curChar == 34)
jjCheckNAddStates(0, 2);
jjCheckNAddStates(3, 5);
break;
case 15:
if ((0xfffffffbffffffffL & l) != 0L)
jjCheckNAddStates(0, 2);
jjCheckNAddStates(3, 5);
break;
case 17:
jjCheckNAddStates(0, 2);
jjCheckNAddStates(3, 5);
break;
case 18:
if (curChar == 34 && kind > 22)
kind = 22;
break;
case 19:
if ((0x8bffd4f8ffffd9ffL & l) == 0L)
if ((0x8bff54f8ffffd9ffL & l) == 0L)
break;
if (kind > 23)
kind = 23;
jjCheckNAddTwoStates(20, 21);
break;
case 20:
if ((0x8bfffcf8ffffd9ffL & l) == 0L)
if ((0x8bff7cf8ffffd9ffL & l) == 0L)
break;
if (kind > 23)
kind = 23;
@ -237,11 +237,11 @@ private int jjMoveNfa_2(int startState, int curPos)
case 28:
case 30:
if (curChar == 47)
jjCheckNAddStates(3, 5);
jjCheckNAddStates(0, 2);
break;
case 29:
if ((0xffff7fffffffffffL & l) != 0L)
jjCheckNAddStates(3, 5);
jjCheckNAddStates(0, 2);
break;
case 32:
if (curChar == 47 && kind > 25)
@ -324,14 +324,14 @@ private int jjMoveNfa_2(int startState, int curPos)
break;
case 15:
if ((0xffffffffefffffffL & l) != 0L)
jjCheckNAddStates(0, 2);
jjCheckNAddStates(3, 5);
break;
case 16:
if (curChar == 92)
jjstateSet[jjnewStateCnt++] = 17;
break;
case 17:
jjCheckNAddStates(0, 2);
jjCheckNAddStates(3, 5);
break;
case 19:
case 20:
@ -362,7 +362,7 @@ private int jjMoveNfa_2(int startState, int curPos)
jjstateSet[jjnewStateCnt++] = 25;
break;
case 29:
jjAddStates(3, 5);
jjAddStates(0, 2);
break;
case 31:
if (curChar == 92)
@ -374,7 +374,7 @@ private int jjMoveNfa_2(int startState, int curPos)
}
else
{
int hiByte = (curChar >> 8);
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
@ -399,7 +399,7 @@ private int jjMoveNfa_2(int startState, int curPos)
case 15:
case 17:
if (jjCanMove_1(hiByte, i1, i2, l1, l2))
jjCheckNAddStates(0, 2);
jjCheckNAddStates(3, 5);
break;
case 19:
case 20:
@ -418,7 +418,7 @@ private int jjMoveNfa_2(int startState, int curPos)
break;
case 29:
if (jjCanMove_1(hiByte, i1, i2, l1, l2))
jjAddStates(3, 5);
jjAddStates(0, 2);
break;
default : break;
}
@ -494,7 +494,7 @@ private int jjMoveNfa_0(int startState, int curPos)
}
else
{
int hiByte = (curChar >> 8);
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
@ -667,7 +667,7 @@ private int jjMoveNfa_1(int startState, int curPos)
}
else
{
int hiByte = (curChar >> 8);
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
@ -718,7 +718,7 @@ private int jjMoveNfa_1(int startState, int curPos)
}
}
static final int[] jjnextStates = {
15, 16, 18, 29, 31, 32, 25, 26, 0, 1, 2, 4, 5,
29, 31, 32, 15, 16, 18, 25, 26, 0, 1, 2, 4, 5,
};
private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2)
{

View File

@ -121,4 +121,4 @@ public class Token {
}
}
/* JavaCC - OriginalChecksum=3b4fe6dcfcfa24a81f1c6ceffae5f73a (do not edit this line) */
/* JavaCC - OriginalChecksum=e9c55091ec11152bcd3a300ddff5c73a (do not edit this line) */

View File

@ -138,4 +138,4 @@ public class TokenMgrError extends Error
this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason);
}
}
/* JavaCC - OriginalChecksum=1efb3d906925f2478637c66473b79bae (do not edit this line) */
/* JavaCC - OriginalChecksum=76b513fd9c50f65248056bbeeff49277 (do not edit this line) */

View File

@ -348,7 +348,7 @@ private int jjMoveNfa_1(int startState, int curPos)
}
else
{
int hiByte = (curChar >> 8);
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
@ -468,7 +468,7 @@ private int jjMoveNfa_0(int startState, int curPos)
}
else
{
int hiByte = (curChar >> 8);
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;

View File

@ -77,6 +77,7 @@ import org.junit.Ignore;
*
* Tests QueryParser.
*/
// TODO: really this should extend QueryParserTestBase too!
public class TestQPHelper extends LuceneTestCase {
public static Analyzer qpAnalyzer = new QPTestAnalyzer();
@ -1139,7 +1140,28 @@ public class TestQPHelper extends LuceneTestCase {
complex.add(new RegexpQuery(new Term("field", "[a-z]\\/[123]")), Occur.MUST);
complex.add(new TermQuery(new Term("path", "/etc/init.d/")), Occur.MUST);
complex.add(new TermQuery(new Term("field", "/etc/init[.]d/lucene/")), Occur.SHOULD);
assertEquals(complex, qp.parse("/[a-z]\\/[123]/ AND path:/etc/init.d/ OR /etc\\/init\\[.\\]d/lucene/ ", df));
assertEquals(complex, qp.parse("/[a-z]\\/[123]/ AND path:\"/etc/init.d/\" OR \"/etc\\/init\\[.\\]d/lucene/\" ", df));
Query re = new RegexpQuery(new Term("field", "http.*"));
assertEquals(re, qp.parse("field:/http.*/", df));
assertEquals(re, qp.parse("/http.*/", df));
re = new RegexpQuery(new Term("field", "http~0.5"));
assertEquals(re, qp.parse("field:/http~0.5/", df));
assertEquals(re, qp.parse("/http~0.5/", df));
re = new RegexpQuery(new Term("field", "boo"));
assertEquals(re, qp.parse("field:/boo/", df));
assertEquals(re, qp.parse("/boo/", df));
assertEquals(new TermQuery(new Term("field", "/boo/")), qp.parse("\"/boo/\"", df));
assertEquals(new TermQuery(new Term("field", "/boo/")), qp.parse("\\/boo\\/", df));
BooleanQuery two = new BooleanQuery();
two.add(new RegexpQuery(new Term("field", "foo")), Occur.SHOULD);
two.add(new RegexpQuery(new Term("field", "bar")), Occur.SHOULD);
assertEquals(two, qp.parse("field:/foo/ field:/bar/", df));
assertEquals(two, qp.parse("/foo/ /bar/", df));
}
public void testStopwords() throws Exception {

View File

@ -1020,7 +1020,28 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
complex.add(new RegexpQuery(new Term("field", "[a-z]\\/[123]")), Occur.MUST);
complex.add(new TermQuery(new Term("path", "/etc/init.d/")), Occur.MUST);
complex.add(new TermQuery(new Term("field", "/etc/init[.]d/lucene/")), Occur.SHOULD);
assertEquals(complex, qp.parse("/[a-z]\\/[123]/ AND path:/etc/init.d/ OR /etc\\/init\\[.\\]d/lucene/ "));
assertEquals(complex, qp.parse("/[a-z]\\/[123]/ AND path:\"/etc/init.d/\" OR \"/etc\\/init\\[.\\]d/lucene/\" "));
Query re = new RegexpQuery(new Term("field", "http.*"));
assertEquals(re, qp.parse("field:/http.*/"));
assertEquals(re, qp.parse("/http.*/"));
re = new RegexpQuery(new Term("field", "http~0.5"));
assertEquals(re, qp.parse("field:/http~0.5/"));
assertEquals(re, qp.parse("/http~0.5/"));
re = new RegexpQuery(new Term("field", "boo"));
assertEquals(re, qp.parse("field:/boo/"));
assertEquals(re, qp.parse("/boo/"));
assertEquals(new TermQuery(new Term("field", "/boo/")), qp.parse("\"/boo/\""));
assertEquals(new TermQuery(new Term("field", "/boo/")), qp.parse("\\/boo\\/"));
BooleanQuery two = new BooleanQuery();
two.add(new RegexpQuery(new Term("field", "foo")), Occur.SHOULD);
two.add(new RegexpQuery(new Term("field", "bar")), Occur.SHOULD);
assertEquals(two, qp.parse("field:/foo/ field:/bar/"));
assertEquals(two, qp.parse("/foo/ /bar/"));
}
public void testStopwords() throws Exception {

View File

@ -180,7 +180,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
"commit", "true" // test immediate commit
);
assertQ(req("id:simple2"), "//*[@numFound='1']");
assertQ(req("defaultExtr:http\\://www.apache.org"), "//*[@numFound='1']");
assertQ(req("defaultExtr:http\\:\\/\\/www.apache.org"), "//*[@numFound='1']");
//Test when both uprefix and default are specified.
loadLocal("extraction/simple.html",