LUCENE-7717: UnifiedHighlighter and PostingsHighlighter bug in PrefixQuery and TermRangeQuery for multi-byte text

This commit is contained in:
David Smiley 2017-03-01 01:38:54 -05:00
parent 0baf2fa33c
commit ec13032a94
4 changed files with 49 additions and 25 deletions

View File

@ -257,6 +257,10 @@ Bug Fixes
* LUCENE-7676: Fixed FilterCodecReader to override more super-class methods.
Also added TestFilterCodecReader class. (Christine Poerschke)
* LUCENE-7717: The UnifiedHighlighter and PostingsHighlighter were not highlighting
prefix queries with multi-byte characters. TermRangeQuery is affected too.
(Dmitry Malinin, David Smiley)
======================= Lucene 6.4.1 =======================
Build

View File

@ -87,16 +87,6 @@ class MultiTermHighlighting {
list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field)));
} else if (query instanceof SpanMultiTermQueryWrapper) {
list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field)));
} else if (query instanceof AutomatonQuery) {
final AutomatonQuery aq = (AutomatonQuery) query;
if (aq.getField().equals(field)) {
list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
@Override
public String toString() {
return aq.toString();
}
});
}
} else if (query instanceof PrefixQuery) {
final PrefixQuery pq = (PrefixQuery) query;
Term prefix = pq.getPrefix();
@ -182,6 +172,16 @@ class MultiTermHighlighting {
}
});
}
} else if (query instanceof AutomatonQuery) {
final AutomatonQuery aq = (AutomatonQuery) query;
if (aq.getField().equals(field)) {
list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
@Override
public String toString() {
return aq.toString();
}
});
}
}
return list.toArray(new CharacterRunAutomaton[list.size()]);
}

View File

@ -100,16 +100,6 @@ class MultiTermHighlighting {
} else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) {
list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(),
fieldMatcher, lookInSpan, preRewriteFunc)));
} else if (query instanceof AutomatonQuery) {
final AutomatonQuery aq = (AutomatonQuery) query;
if (fieldMatcher.test(aq.getField())) {
list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
@Override
public String toString() {
return aq.toString();
}
});
}
} else if (query instanceof PrefixQuery) {
final PrefixQuery pq = (PrefixQuery) query;
Term prefix = pq.getPrefix();
@ -197,6 +187,16 @@ class MultiTermHighlighting {
}
});
}
} else if (query instanceof AutomatonQuery) {
final AutomatonQuery aq = (AutomatonQuery) query;
if (fieldMatcher.test(aq.getField())) {
list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
@Override
public String toString() {
return aq.toString();
}
});
}
}
return list.toArray(new CharacterRunAutomaton[list.size()]);
}

View File

@ -29,6 +29,7 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@ -668,10 +669,11 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
// use a variety of common MTQ types
BooleanQuery query = new BooleanQuery.Builder()
.add(new WildcardQuery(new Term("body", "te*")), BooleanClause.Occur.SHOULD)
.add(new WildcardQuery(new Term("body", "one")), BooleanClause.Occur.SHOULD)
.add(new WildcardQuery(new Term("body", "se*")), BooleanClause.Occur.SHOULD)
.add(new PrefixQuery(new Term("body", "te")), BooleanClause.Occur.SHOULD)
.add(new WildcardQuery(new Term("body", "*one*")), BooleanClause.Occur.SHOULD)
.add(new FuzzyQuery(new Term("body", "zentence~")), BooleanClause.Occur.SHOULD)
.build();
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
@ -732,8 +734,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
snippets = highlighter.highlight("body", query, topDocs);
assertEquals(1, snippets.length);
// Default formatter bolds each hit:
assertEquals("<b>Test(body:te*)</b> a <b>one(body:one)</b> <b>sentence(body:se*)</b> document.", snippets[0]);
assertEquals("<b>Test(body:te*)</b> a <b>one(body:*one*)</b> <b>sentence(body:zentence~~2)</b> document.", snippets[0]);
ir.close();
}
@ -1054,4 +1055,23 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
}
}
// LUCENE-7717 bug, ordering of MTQ AutomatonQuery detection
public void testRussianPrefixQuery() throws IOException {
Analyzer analyzer = new StandardAnalyzer();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer);
String field = "title";
Document doc = new Document();
doc.add(new Field(field, "я", fieldType)); // Russian char; uses 2 UTF8 bytes
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
Query query = new PrefixQuery(new Term(field, "я"));
TopDocs topDocs = searcher.search(query, 1);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
String[] snippets = highlighter.highlight(field, query, topDocs);
assertEquals("[<b>я</b>]", Arrays.toString(snippets));
ir.close();
}
}