LUCENE-588: Escaped wildcard character in wildcard term not handled correctly

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1031765 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-11-05 20:07:20 +00:00
parent 2bbfc85aad
commit a25dc8b9d5
11 changed files with 255 additions and 66 deletions

View File

@ -113,6 +113,12 @@ Changes in backwards compatibility policy
* LUCENE-2674: MultiTermQuery.TermCollector.collect now accepts the
TermsEnum as well. (Robert Muir, Mike McCandless)
* LUCENE-588: WildcardQuery and QueryParser now allows escaping with
the '\' character. Previously this was impossible (you could not escape */?,
for example). If your code somehow depends on the old behavior, you will
need to change it (e.g. using "\\" to escape '\' itself).
(Sunil Kamath, Terry Yang via Robert Muir)
Changes in Runtime Behavior
* LUCENE-2650: The behavior of FSDirectory.open has changed. On 64-bit

View File

@ -49,9 +49,9 @@ public class WildcardQueryNode extends FieldQueryNode {
@Override
public CharSequence toQueryString(EscapeQuerySyntax escaper) {
if (isDefaultField(this.field)) {
return getTermEscaped(escaper);
return this.text;
} else {
return this.field + ":" + getTermEscaped(escaper);
return this.field + ":" + this.text;
}
}

View File

@ -76,6 +76,7 @@ import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.junit.Ignore;
/**
* This test case is a copy of the core Lucene query parser test, it was adapted
@ -945,6 +946,15 @@ public class TestQPHelper extends LuceneTestCase {
assertEscapedQueryEquals("&& abc &&", a, "\\&\\& abc \\&\\&");
}
@Ignore("contrib queryparser shouldn't escape wildcard terms")
public void testEscapedWildcard() throws Exception {
StandardQueryParser qp = new StandardQueryParser();
qp.setAnalyzer(new MockAnalyzer(MockTokenizer.WHITESPACE, false));
WildcardQuery q = new WildcardQuery(new Term("field", "foo\\?ba?r"));
assertEquals(q, qp.parse("foo\\?ba?r", "field"));
}
public void testTabNewlineCarriageReturn() throws Exception {
assertQueryEqualsDOA("+weltbank +worlbank", null, "+weltbank +worlbank");

View File

@ -33,7 +33,11 @@ import org.apache.lucene.util.StringHelper;
import java.io.IOException;
import java.util.Comparator;
import java.util.Map;
import java.util.Set;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.SortedMap;
import java.util.Iterator;
class SimpleTextFieldsReader extends FieldsProducer {
@ -78,7 +82,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
private class SimpleTextFieldsEnum extends FieldsEnum {
private final IndexInput in;
private final BytesRef scratch = new BytesRef(10);
private boolean omitTF;
private String current;
public SimpleTextFieldsEnum() {
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
@ -89,11 +93,12 @@ class SimpleTextFieldsReader extends FieldsProducer {
while(true) {
readLine(in, scratch);
if (scratch.equals(END)) {
current = null;
return null;
}
if (scratch.startsWith(FIELD)) {
String field = StringHelper.intern(new String(scratch.bytes, scratch.offset + FIELD.length, scratch.length - FIELD.length, "UTF-8"));
omitTF = fieldInfos.fieldInfo(field).omitTermFreqAndPositions;
current = field;
return field;
}
}
@ -101,7 +106,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
@Override
public TermsEnum terms() throws IOException {
return new SimpleTextTermsEnum(in.getFilePointer(), omitTF);
return SimpleTextFieldsReader.this.terms(current).iterator();
}
}
@ -109,21 +114,42 @@ class SimpleTextFieldsReader extends FieldsProducer {
private final IndexInput in;
private final boolean omitTF;
private BytesRef current;
private final long fieldStart;
private final BytesRef scratch = new BytesRef(10);
private final BytesRef scratch2 = new BytesRef(10);
private int docFreq;
private long docsStart;
private boolean ended;
private final TreeMap<BytesRef,TermData> allTerms;
private Iterator<Map.Entry<BytesRef,TermData>> iter;
public SimpleTextTermsEnum(long offset, boolean omitTF) throws IOException {
public SimpleTextTermsEnum(TreeMap<BytesRef,TermData> allTerms, boolean omitTF) throws IOException {
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
this.in.seek(offset);
this.allTerms = allTerms;
this.omitTF = omitTF;
fieldStart = offset;
iter = allTerms.entrySet().iterator();
}
public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
final SortedMap<BytesRef,TermData> tailMap = allTerms.tailMap(text);
if (tailMap.isEmpty()) {
current = null;
return SeekStatus.END;
} else {
current = tailMap.firstKey();
final TermData td = tailMap.get(current);
docsStart = td.docsStart;
docFreq = td.docFreq;
iter = tailMap.entrySet().iterator();
assert iter.hasNext();
iter.next();
if (current.equals(text)) {
return SeekStatus.FOUND;
} else {
return SeekStatus.NOT_FOUND;
}
}
/*
if (current != null) {
final int cmp = current.compareTo(text);
if (cmp == 0) {
@ -153,6 +179,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
current = null;
ended = true;
return SeekStatus.END;
*/
}
@Override
@ -162,6 +189,20 @@ class SimpleTextFieldsReader extends FieldsProducer {
@Override
public BytesRef next() throws IOException {
assert !ended;
if (iter.hasNext()) {
Map.Entry<BytesRef,TermData> ent = iter.next();
current = ent.getKey();
TermData td = ent.getValue();
docFreq = td.docFreq;
docsStart = td.docsStart;
return current;
} else {
current = null;
return null;
}
/*
readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
ended = true;
@ -192,6 +233,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
in.seek(lineStart);
return current;
}
*/
}
@Override
@ -447,20 +489,70 @@ class SimpleTextFieldsReader extends FieldsProducer {
}
}
static class TermData {
public long docsStart;
public int docFreq;
public TermData(long docsStart, int docFreq) {
this.docsStart = docsStart;
this.docFreq = docFreq;
}
}
private class SimpleTextTerms extends Terms {
private final String field;
private final long termsStart;
private final boolean omitTF;
public SimpleTextTerms(String field, long termsStart) {
// NOTE: horribly, horribly RAM consuming, but then
// SimpleText should never be used in production
private final TreeMap<BytesRef,TermData> allTerms = new TreeMap<BytesRef,TermData>();
private final BytesRef scratch = new BytesRef(10);
public SimpleTextTerms(String field, long termsStart) throws IOException {
this.field = StringHelper.intern(field);
this.termsStart = termsStart;
omitTF = fieldInfos.fieldInfo(field).omitTermFreqAndPositions;
loadTerms();
}
private void loadTerms() throws IOException {
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRef lastTerm = new BytesRef(10);
long lastDocsStart = -1;
int docFreq = 0;
while(true) {
readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
if (lastDocsStart != -1) {
allTerms.put(new BytesRef(lastTerm),
new TermData(lastDocsStart, docFreq));
}
break;
} else if (scratch.startsWith(DOC)) {
docFreq++;
} else if (scratch.startsWith(TERM)) {
if (lastDocsStart != -1) {
allTerms.put(new BytesRef(lastTerm),
new TermData(lastDocsStart, docFreq));
}
lastDocsStart = in.getFilePointer();
final int len = scratch.length - TERM.length;
if (len > lastTerm.length) {
lastTerm.grow(len);
}
System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
lastTerm.length = len;
docFreq = 0;
}
}
}
@Override
public TermsEnum iterator() throws IOException {
return new SimpleTextTermsEnum(termsStart, omitTF);
return new SimpleTextTermsEnum(allTerms, omitTF);
}
@Override

View File

@ -1045,7 +1045,7 @@ public abstract class QueryParserBase {
String termImage=discardEscapeChar(term.image);
if (wildcard) {
q = getWildcardQuery(qfield, termImage);
q = getWildcardQuery(qfield, term.image);
} else if (prefix) {
q = getPrefixQuery(qfield,
discardEscapeChar(term.image.substring

View File

@ -45,6 +45,9 @@ public class WildcardQuery extends AutomatonQuery {
/** Char equality with support for wildcards */
public static final char WILDCARD_CHAR = '?';
/** Escape character */
public static final char WILDCARD_ESCAPE = '\\';
/**
* Constructs a query for terms matching <code>term</code>.
*/
@ -56,6 +59,7 @@ public class WildcardQuery extends AutomatonQuery {
* Convert Lucene wildcard syntax into an automaton.
* @lucene.internal
*/
@SuppressWarnings("fallthrough")
public static Automaton toAutomaton(Term wildcardquery) {
List<Automaton> automata = new ArrayList<Automaton>();
@ -63,6 +67,7 @@ public class WildcardQuery extends AutomatonQuery {
for (int i = 0; i < wildcardText.length();) {
final int c = wildcardText.codePointAt(i);
int length = Character.charCount(c);
switch(c) {
case WILDCARD_STRING:
automata.add(BasicAutomata.makeAnyString());
@ -70,10 +75,18 @@ public class WildcardQuery extends AutomatonQuery {
case WILDCARD_CHAR:
automata.add(BasicAutomata.makeAnyChar());
break;
case WILDCARD_ESCAPE:
// add the next codepoint instead, if it exists
if (i + length < wildcardText.length()) {
final int nextChar = wildcardText.codePointAt(i + length);
length += Character.charCount(nextChar);
automata.add(BasicAutomata.makeChar(nextChar));
break;
} // else fallthru, lenient parsing with a trailing \
default:
automata.add(BasicAutomata.makeChar(c));
}
i += Character.charCount(c);
i += length;
}
return BasicOperations.concatenate(automata);

View File

@ -178,7 +178,7 @@ final public class SpecialOperations {
* Reverses the language of the given (non-singleton) automaton while returning
* the set of new initial states.
*/
static Set<State> reverse(Automaton a) {
public static Set<State> reverse(Automaton a) {
a.expandSingleton();
// reverse all edges
HashMap<State, HashSet<Transition>> m = new HashMap<State, HashSet<Transition>>();

View File

@ -770,11 +770,11 @@ public class TestQueryParser extends LuceneTestCase {
assertQueryEquals("a:b\\\\c*", a, "a:b\\c*");
assertQueryEquals("a:b\\-?c", a, "a:b-?c");
assertQueryEquals("a:b\\+?c", a, "a:b+?c");
assertQueryEquals("a:b\\:?c", a, "a:b:?c");
assertQueryEquals("a:b\\-?c", a, "a:b\\-?c");
assertQueryEquals("a:b\\+?c", a, "a:b\\+?c");
assertQueryEquals("a:b\\:?c", a, "a:b\\:?c");
assertQueryEquals("a:b\\\\?c", a, "a:b\\?c");
assertQueryEquals("a:b\\\\?c", a, "a:b\\\\?c");
assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0");
assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0");
@ -1062,6 +1062,12 @@ public class TestQueryParser extends LuceneTestCase {
}
public void testEscapedWildcard() throws Exception {
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(MockTokenizer.WHITESPACE, false));
WildcardQuery q = new WildcardQuery(new Term("field", "foo\\?ba?r"));
assertEquals(q, qp.parse("foo\\?ba?r"));
}
public void testRegexps() throws Exception {
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(MockTokenizer.WHITESPACE, false));
RegexpQuery q = new RegexpQuery(new Term("field", "[a-z][123]"));

View File

@ -205,6 +205,38 @@ public class TestWildcard
indexStore.close();
}
/**
* Tests if wildcard escaping works
*/
public void testEscapes() throws Exception {
Directory indexStore = getIndexStore("field",
new String[]{"foo*bar", "foo??bar", "fooCDbar", "fooSOMETHINGbar", "foo\\"});
IndexSearcher searcher = new IndexSearcher(indexStore, true);
// without escape: matches foo??bar, fooCDbar, foo*bar, and fooSOMETHINGbar
WildcardQuery unescaped = new WildcardQuery(new Term("field", "foo*bar"));
assertMatches(searcher, unescaped, 4);
// with escape: only matches foo*bar
WildcardQuery escaped = new WildcardQuery(new Term("field", "foo\\*bar"));
assertMatches(searcher, escaped, 1);
// without escape: matches foo??bar and fooCDbar
unescaped = new WildcardQuery(new Term("field", "foo??bar"));
assertMatches(searcher, unescaped, 2);
// with escape: matches foo??bar only
escaped = new WildcardQuery(new Term("field", "foo\\?\\?bar"));
assertMatches(searcher, escaped, 1);
// check escaping at end: lenient parse yields "foo\"
WildcardQuery atEnd = new WildcardQuery(new Term("field", "foo\\"));
assertMatches(searcher, atEnd, 1);
searcher.close();
indexStore.close();
}
private Directory getIndexStore(String field, String[] contents)
throws IOException {
Directory indexStore = newDirectory();

View File

@ -30,6 +30,7 @@ import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.analysis.Analyzer;
import org.apache.solr.analysis.*;
import org.apache.solr.common.SolrException;
@ -202,37 +203,36 @@ public class SolrQueryParser extends QueryParser {
String type = schema.getFieldType(field).getTypeName();
ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
if (factory != null) {
Term term = new Term(field, termStr);
// fsa representing the query
Automaton automaton = WildcardQuery.toAutomaton(term);
// TODO: we should likely use the automaton to calculate shouldReverse, too.
if (factory.shouldReverse(termStr)) {
int len = termStr.length();
char[] chars = new char[len+1];
chars[0] = factory.getMarkerChar();
termStr.getChars(0, len, chars, 1);
ReversedWildcardFilter.reverse(chars, 1, len);
termStr = new String(chars);
automaton = BasicOperations.concatenate(automaton, BasicAutomata.makeChar(factory.getMarkerChar()));
SpecialOperations.reverse(automaton);
} else {
// reverse wildcardfilter is active: remove false positives
Term term = new Term(field, termStr);
// fsa representing the query
Automaton a = WildcardQuery.toAutomaton(term);
// fsa representing false positives (markerChar*)
Automaton falsePositives = BasicOperations.concatenate(
BasicAutomata.makeChar(factory.getMarkerChar()),
BasicAutomata.makeAnyString());
return new AutomatonQuery(term, BasicOperations.minus(a, falsePositives)) {
// override toString so its completely transparent
@Override
public String toString(String field) {
StringBuilder buffer = new StringBuilder();
if (!getField().equals(field)) {
buffer.append(getField());
buffer.append(":");
}
buffer.append(term.text());
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
}
};
// subtract these away
automaton = BasicOperations.minus(automaton, falsePositives);
}
return new AutomatonQuery(term, automaton) {
// override toString so its completely transparent
@Override
public String toString(String field) {
StringBuilder buffer = new StringBuilder();
if (!getField().equals(field)) {
buffer.append(getField());
buffer.append(":");
}
buffer.append(term.text());
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
}
};
}
Query q = super.getWildcardQuery(field, termStr);
if (q instanceof WildcardQuery) {

View File

@ -19,6 +19,7 @@ package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.Field;
import java.util.HashMap;
import java.util.Map;
@ -26,8 +27,10 @@ import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.search.SolrQueryParser;
@ -51,6 +54,8 @@ public class TestReversedWildcardFilterFactory extends SolrTestCaseJ4 {
public void setUp() throws Exception {
super.setUp();
schema = new IndexSchema(solrConfig, getSchemaFile(), null);
clearIndex();
assertU(commit());
}
@Test
@ -105,7 +110,7 @@ public class TestReversedWildcardFilterFactory extends SolrTestCaseJ4 {
}
@Test
public void testQueryParsing() throws IOException, ParseException {
public void testQueryParsing() throws Exception {
SolrQueryParser parserOne = new SolrQueryParser(schema, "one");
assertTrue(parserOne.getAllowLeadingWildcard());
@ -115,28 +120,53 @@ public class TestReversedWildcardFilterFactory extends SolrTestCaseJ4 {
// XXX note: this should be false, but for now we return true for any field,
// XXX if at least one field uses the reversing
assertTrue(parserThree.getAllowLeadingWildcard());
String text = "one +two *hree f*ur fiv* *si\uD834\uDD1Ex";
String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv* one:\u0001x\uD834\uDD1Eis*";
String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv* two:\u0001x\uD834\uDD1Eis*";
String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv* three:*si\uD834\uDD1Ex";
Query q = parserOne.parse(text);
assertEquals(expectedOne, q.toString());
q = parserTwo.parse(text);
assertEquals(expectedTwo, q.toString());
q = parserThree.parse(text);
assertEquals(expectedThree, q.toString());
// add some docs
assertU(adoc("id", "1", "one", "one"));
assertU(adoc("id", "2", "two", "two"));
assertU(adoc("id", "3", "three", "three"));
assertU(adoc("id", "4", "one", "four"));
assertU(adoc("id", "5", "two", "five"));
assertU(adoc("id", "6", "three", "si\uD834\uDD1Ex"));
assertU(commit());
assertQ("should have matched",
req("+id:1 +one:one"),
"//result[@numFound=1]");
assertQ("should have matched",
req("+id:4 +one:f*ur"),
"//result[@numFound=1]");
assertQ("should have matched",
req("+id:6 +three:*si\uD834\uDD1Ex"),
"//result[@numFound=1]");
// test conditional reversal
String condText = "*hree t*ree th*ee thr*e ?hree t?ree th?ee th?*ee " +
"short*token ver*longtoken";
String expected = "two:\u0001eerh* two:\u0001eer*t two:\u0001ee*ht " +
"two:thr*e " +
"two:\u0001eerh? two:\u0001eer?t " +
"two:th?ee " +
"two:th?*ee " +
"two:short*token " +
"two:\u0001nekotgnol*rev";
q = parserTwo.parse(condText);
assertEquals(expected, q.toString());
assertTrue(wasReversed(parserTwo, "*hree"));
assertTrue(wasReversed(parserTwo, "t*ree"));
assertTrue(wasReversed(parserTwo, "th*ee"));
assertFalse(wasReversed(parserTwo, "thr*e"));
assertTrue(wasReversed(parserTwo, "?hree"));
assertTrue(wasReversed(parserTwo, "t?ree"));
assertFalse(wasReversed(parserTwo, "th?ee"));
assertFalse(wasReversed(parserTwo, "th?*ee"));
assertFalse(wasReversed(parserTwo, "short*token"));
assertTrue(wasReversed(parserTwo, "ver*longtoken"));
}
/** fragile assert: depends on our implementation, but cleanest way to check for now */
private boolean wasReversed(SolrQueryParser qp, String query) throws Exception {
Query q = qp.parse(query);
if (!(q instanceof AutomatonQuery))
return false;
// this is a hack to get the protected Automaton field in AutomatonQuery,
// may break in later lucene versions - we have no getter... for good reasons.
final Field automatonField = AutomatonQuery.class.getDeclaredField("automaton");
automatonField.setAccessible(true);
Automaton automaton = (Automaton) automatonField.get(q);
String prefix = SpecialOperations.getCommonPrefix(automaton);
return prefix.length() > 0 && prefix.charAt(0) == '\u0001';
}
@Test