From 76c12eeb3d4fe7948a8ab5915df4f2fd8f9ceb8f Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sun, 26 Jan 2014 04:49:18 +0000 Subject: [PATCH] LUCENE-5415: add multitermquery support to PostingsHighlighter git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1561451 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 6 + .../apache/lucene/search/AutomatonQuery.java | 5 + .../org/apache/lucene/search/FuzzyQuery.java | 8 + .../spans/SpanMultiTermQueryWrapper.java | 23 +- .../MultiTermHighlighting.java | 284 +++++++ .../PostingsHighlighter.java | 60 +- .../TestMultiTermHighlighting.java | 797 ++++++++++++++++++ .../highlight/PostingsSolrHighlighter.java | 14 + .../TestPostingsSolrHighlighter.java | 8 + 9 files changed, 1193 insertions(+), 12 deletions(-) create mode 100644 lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java create mode 100644 lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestMultiTermHighlighting.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 43d88518769..b3f3aafa2c5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -116,6 +116,9 @@ New Features fixes too. More info: https://github.com/spatial4j/spatial4j/blob/master/CHANGES.md (David Smiley) +* LUCENE-5415: Add multitermquery (wildcards,prefix,etc) to PostingsHighlighter. + (Mike McCandless, Robert Muir) + Build * LUCENE-5217: Maven config: get dependencies from Ant+Ivy config; disable @@ -201,6 +204,9 @@ Bug fixes the same Directory to multiple concurrent addIndexes calls (which is anyways unusual). (Robert Muir, Mike McCandless) +* LUCENE-5415: SpanMultiTermQueryWrapper didn't handle its boost in + hashcode/equals/tostring/rewrite. (Robert Muir) + API Changes * LUCENE-5339: The facet module was simplified/reworked to make the diff --git a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java index 41f15030758..d5801c9cd01 100644 --- a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java @@ -128,4 +128,9 @@ public class AutomatonQuery extends MultiTermQuery { buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } + + /** Returns the automaton used to create this query */ + public Automaton getAutomaton() { + return automaton; + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java index 5a70ba67a38..87941d44e2e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java @@ -138,6 +138,14 @@ public class FuzzyQuery extends MultiTermQuery { public int getPrefixLength() { return prefixLength; } + + /** + * Returns true if transpositions should be treated as a primitive edit operation. + * If this is false, comparisons will implement the classic Levenshtein algorithm. 
+ */ + public boolean getTranspositions() { + return transpositions; + } @Override protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java index 0914803aeec..849732a9881 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java @@ -100,6 +100,11 @@ public class SpanMultiTermQueryWrapper extends SpanQue public String getField() { return query.getField(); } + + /** Returns the wrapped query */ + public Query getWrappedQuery() { + return query; + } @Override public String toString(String field) { @@ -107,6 +112,10 @@ public class SpanMultiTermQueryWrapper extends SpanQue builder.append("SpanMultiTermQueryWrapper("); builder.append(query.toString(field)); builder.append(")"); + if (getBoost() != 1F) { + builder.append('^'); + builder.append(getBoost()); + } return builder.toString(); } @@ -115,22 +124,26 @@ public class SpanMultiTermQueryWrapper extends SpanQue final Query q = query.rewrite(reader); if (!(q instanceof SpanQuery)) throw new UnsupportedOperationException("You can only use SpanMultiTermQueryWrapper with a suitable SpanRewriteMethod."); + q.setBoost(q.getBoost() * getBoost()); // multiply boost return q; } @Override public int hashCode() { - return 31 * query.hashCode(); + final int prime = 31; + int result = super.hashCode(); + result = prime * result + query.hashCode(); + return result; } @Override - @SuppressWarnings({"rawtypes","unchecked"}) public boolean equals(Object obj) { if (this == obj) return true; - if (obj == null) return false; + if (!super.equals(obj)) return false; if (getClass() != obj.getClass()) return false; - final SpanMultiTermQueryWrapper other = (SpanMultiTermQueryWrapper) obj; - return query.equals(other.query); + SpanMultiTermQueryWrapper other = (SpanMultiTermQueryWrapper) obj; + if (!query.equals(other.query)) return false; + return true; } /** Abstract class that defines how the query is rewritten. */ diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java new file mode 100644 index 00000000000..c5e14f572ce --- /dev/null +++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java @@ -0,0 +1,284 @@ +package org.apache.lucene.search.postingshighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.AutomatonQuery; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.DisjunctionMaxQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermRangeQuery; +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanNotQuery; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanPositionCheckQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.apache.lucene.util.automaton.LevenshteinAutomata; + +/** + * Support for highlighting multiterm queries in PostingsHighlighter. + */ +class MultiTermHighlighting { + + /** + * Extracts all MultiTermQueries for {@code field}, and returns equivalent + * automata that will match terms. + */ + static CharacterRunAutomaton[] extractAutomata(Query query, String field) { + List list = new ArrayList<>(); + if (query instanceof BooleanQuery) { + BooleanClause clauses[] = ((BooleanQuery) query).getClauses(); + for (BooleanClause clause : clauses) { + if (!clause.isProhibited()) { + list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field))); + } + } + } else if (query instanceof DisjunctionMaxQuery) { + for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) { + list.addAll(Arrays.asList(extractAutomata(sub, field))); + } + } else if (query instanceof SpanOrQuery) { + for (Query sub : ((SpanOrQuery) query).getClauses()) { + list.addAll(Arrays.asList(extractAutomata(sub, field))); + } + } else if (query instanceof SpanNearQuery) { + for (Query sub : ((SpanNearQuery) query).getClauses()) { + list.addAll(Arrays.asList(extractAutomata(sub, field))); + } + } else if (query instanceof SpanNotQuery) { + list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field))); + } else if (query instanceof SpanPositionCheckQuery) { + list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field))); + } else if (query instanceof SpanMultiTermQueryWrapper) { + list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper) query).getWrappedQuery(), field))); + } else if (query instanceof AutomatonQuery) { + final AutomatonQuery aq = (AutomatonQuery) query; + if (aq.getField().equals(field)) { + list.add(new CharacterRunAutomaton(aq.getAutomaton()) { + @Override + public String toString() { + return aq.toString(); + } + }); + } + } else if (query instanceof PrefixQuery) { + final PrefixQuery pq = (PrefixQuery) query; + Term prefix = pq.getPrefix(); + if (prefix.field().equals(field)) { + list.add(new 
CharacterRunAutomaton(BasicOperations.concatenate(BasicAutomata.makeString(prefix.text()), + BasicAutomata.makeAnyString())) { + @Override + public String toString() { + return pq.toString(); + } + }); + } + } else if (query instanceof FuzzyQuery) { + final FuzzyQuery fq = (FuzzyQuery) query; + if (fq.getField().equals(field)) { + String utf16 = fq.getTerm().text(); + int termText[] = new int[utf16.codePointCount(0, utf16.length())]; + for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) { + termText[j++] = cp = utf16.codePointAt(i); + } + int termLength = termText.length; + int prefixLength = Math.min(fq.getPrefixLength(), termLength); + String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength); + LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions()); + Automaton automaton = builder.toAutomaton(fq.getMaxEdits()); + if (prefixLength > 0) { + Automaton prefix = BasicAutomata.makeString(UnicodeUtil.newString(termText, 0, prefixLength)); + automaton = BasicOperations.concatenate(prefix, automaton); + } + list.add(new CharacterRunAutomaton(automaton) { + @Override + public String toString() { + return fq.toString(); + } + }); + } + } else if (query instanceof TermRangeQuery) { + final TermRangeQuery tq = (TermRangeQuery) query; + if (tq.getField().equals(field)) { + final CharsRef lowerBound; + if (tq.getLowerTerm() == null) { + lowerBound = null; + } else { + lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString()); + } + + final CharsRef upperBound; + if (tq.getUpperTerm() == null) { + upperBound = null; + } else { + upperBound = new CharsRef(tq.getUpperTerm().utf8ToString()); + } + + final boolean includeLower = tq.includesLower(); + final boolean includeUpper = tq.includesUpper(); + final CharsRef scratch = new CharsRef(); + final Comparator comparator = CharsRef.getUTF16SortedAsUTF8Comparator(); + + // this is *not* an automaton, but its very simple + list.add(new CharacterRunAutomaton(BasicAutomata.makeEmpty()) { + @Override + public boolean run(char[] s, int offset, int length) { + scratch.chars = s; + scratch.offset = offset; + scratch.length = length; + + if (lowerBound != null) { + int cmp = comparator.compare(scratch, lowerBound); + if (cmp < 0 || (!includeLower && cmp == 0)) { + return false; + } + } + + if (upperBound != null) { + int cmp = comparator.compare(scratch, upperBound); + if (cmp > 0 || (!includeUpper && cmp == 0)) { + return false; + } + } + return true; + } + + @Override + public String toString() { + return tq.toString(); + } + }); + } + } + return list.toArray(new CharacterRunAutomaton[list.size()]); + } + + /** + * Returns a "fake" DocsAndPositionsEnum over the tokenstream, returning offsets where {@code matchers} + * matches tokens. + *
<p>
+ * This is solely used internally by PostingsHighlighter: DO NOT USE THIS METHOD! + */ + static DocsAndPositionsEnum getDocsEnum(final TokenStream ts, final CharacterRunAutomaton[] matchers) throws IOException { + final CharTermAttribute charTermAtt = ts.addAttribute(CharTermAttribute.class); + final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); + ts.reset(); + + // TODO: we could use CachingWrapperFilter, (or consume twice) to allow us to have a true freq() + // but this would have a performance cost for likely little gain in the user experience, it + // would only serve to make this method less bogus. + // instead, we always return freq() = Integer.MAX_VALUE and let PH terminate based on offset... + + return new DocsAndPositionsEnum() { + int currentDoc = -1; + int currentMatch = -1; + int currentStartOffset = -1; + int currentEndOffset = -1; + TokenStream stream = ts; + + final BytesRef matchDescriptions[] = new BytesRef[matchers.length]; + + @Override + public int nextPosition() throws IOException { + if (stream != null) { + while (stream.incrementToken()) { + for (int i = 0; i < matchers.length; i++) { + if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) { + currentStartOffset = offsetAtt.startOffset(); + currentEndOffset = offsetAtt.endOffset(); + currentMatch = i; + return 0; + } + } + } + stream.end(); + stream.close(); + stream = null; + } + // exhausted + currentStartOffset = currentEndOffset = Integer.MAX_VALUE; + return Integer.MAX_VALUE; + } + + @Override + public int freq() throws IOException { + return Integer.MAX_VALUE; // lie + } + + @Override + public int startOffset() throws IOException { + assert currentStartOffset >= 0; + return currentStartOffset; + } + + @Override + public int endOffset() throws IOException { + assert currentEndOffset >= 0; + return currentEndOffset; + } + + @Override + public BytesRef getPayload() throws IOException { + if (matchDescriptions[currentMatch] == null) { + matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString()); + } + return matchDescriptions[currentMatch]; + } + + @Override + public int docID() { + return currentDoc; + } + + @Override + public int nextDoc() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int advance(int target) throws IOException { + return currentDoc = target; + } + + @Override + public long cost() { + return 0; + } + }; + } +} diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java index d39e4843197..c47f8bf52a8 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java @@ -30,6 +30,7 @@ import java.util.PriorityQueue; import java.util.SortedSet; import java.util.TreeSet; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DocsAndPositionsEnum; @@ -50,6 +51,7 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.InPlaceMergeSorter; import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; /** * Simple highlighter that does not analyze fields nor use @@ -64,6 +66,14 @@ import 
org.apache.lucene.util.UnicodeUtil; * into a {@link Passage}, and then scores each Passage using a separate {@link PassageScorer}. * Passages are finally formatted into highlighted snippets with a {@link PassageFormatter}. *
<p>
+ * You can customize the behavior by subclassing this highlighter, some important hooks:
+ * <ul>
+ *   <li>{@link #getBreakIterator(String)}: Customize how the text is divided into passages.
+ *   <li>{@link #getScorer(String)}: Customize how passages are ranked.
+ *   <li>{@link #getFormatter(String)}: Customize how snippets are formatted.
+ *   <li>{@link #getIndexAnalyzer(String)}: Enable highlighting of MultiTermQuerys such as {@code WildcardQuery}.
+ * </ul>
+ * <p>
* WARNING: The code is very new and probably still has some exciting bugs! *
<p>
* Example usage: @@ -335,9 +345,9 @@ public class PostingsHighlighter { throw new IllegalArgumentException("invalid number of maxPassagesIn"); } final IndexReader reader = searcher.getIndexReader(); - query = rewrite(query); + Query rewritten = rewrite(query); SortedSet queryTerms = new TreeSet(); - query.extractTerms(queryTerms); + rewritten.extractTerms(queryTerms); IndexReaderContext readerContext = reader.getContext(); List leaves = readerContext.leaves(); @@ -389,7 +399,7 @@ public class PostingsHighlighter { for(Term term : fieldTerms) { terms[termUpto++] = term.bytes(); } - Map fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages); + Map fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages, query); Object[] result = new Object[docids.length]; for (int j = 0; j < docidsIn.length; j++) { @@ -432,8 +442,18 @@ public class PostingsHighlighter { protected char getMultiValuedSeparator(String field) { return ' '; } + + /** + * Returns the analyzer originally used to index the content for {@code field}. + *
<p>
+ * This is used to highlight some MultiTermQueries. + * @return Analyzer or null (the default, meaning no special multi-term processing) + */ + protected Analyzer getIndexAnalyzer(String field) { + return null; + } - private Map highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List leaves, int maxPassages) throws IOException { + private Map highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List leaves, int maxPassages, Query query) throws IOException { Map highlights = new HashMap(); // reuse in the real sense... for docs in same segment we just advance our old enum @@ -445,6 +465,21 @@ public class PostingsHighlighter { if (fieldFormatter == null) { throw new NullPointerException("PassageFormatter cannot be null"); } + + // check if we should do any multitermprocessing + Analyzer analyzer = getIndexAnalyzer(field); + CharacterRunAutomaton automata[] = new CharacterRunAutomaton[0]; + if (analyzer != null) { + automata = MultiTermHighlighting.extractAutomata(query, field); + } + + final BytesRef allTerms[]; + if (automata.length > 0) { + allTerms = new BytesRef[terms.length + 1]; + System.arraycopy(terms, 0, allTerms, 0, terms.length); + } else { + allTerms = terms; + } for (int i = 0; i < docids.length; i++) { String content = contents[i]; @@ -462,9 +497,14 @@ public class PostingsHighlighter { } if (leaf != lastLeaf) { termsEnum = t.iterator(null); - postings = new DocsAndPositionsEnum[terms.length]; + postings = new DocsAndPositionsEnum[allTerms.length]; } - Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages); + if (automata.length > 0) { + DocsAndPositionsEnum dp = MultiTermHighlighting.getDocsEnum(analyzer.tokenStream(field, content), automata); + dp.advance(doc - subContext.docBase); + postings[terms.length] = dp; + } + Passage passages[] = highlightDoc(field, allTerms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages); if (passages.length == 0) { passages = getEmptyHighlight(field, bi, maxPassages); } @@ -593,7 +633,13 @@ public class PostingsHighlighter { int tf = 0; while (true) { tf++; - current.addMatch(start, end, terms[off.id]); + BytesRef term = terms[off.id]; + if (term == null) { + // multitermquery match, pull from payload + term = off.dp.getPayload(); + assert term != null; + } + current.addMatch(start, end, term); if (off.pos == dp.freq()) { break; // removed from pq } else { diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestMultiTermHighlighting.java b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestMultiTermHighlighting.java new file mode 100644 index 00000000000..4b1ce8768ea --- /dev/null +++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestMultiTermHighlighting.java @@ -0,0 +1,797 @@ +package org.apache.lucene.search.postingshighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.DisjunctionMaxQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.TermRangeQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.search.spans.SpanFirstQuery; +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanNotQuery; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; + +/** + * Some tests that override {@link PostingsHighlighter#getIndexAnalyzer} to + * highlight wilcard, fuzzy, etc queries. + */ +@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom"}) +public class TestMultiTermHighlighting extends LuceneTestCase { + + public void testWildcards() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." 
is a token) + final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test."); + iw.addDocument(doc); + body.setStringValue("Test a one sentence document."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + }; + Query query = new WildcardQuery(new Term("body", "te*")); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + // wrong field + BooleanQuery bq = new BooleanQuery(); + bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD); + bq.add(new WildcardQuery(new Term("bogus", "te*")), BooleanClause.Occur.SHOULD); + topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + snippets = highlighter.highlight("body", bq, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + ir.close(); + dir.close(); + } + + public void testOnePrefix() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." 
is a token) + final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test."); + iw.addDocument(doc); + body.setStringValue("Test a one sentence document."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + }; + Query query = new PrefixQuery(new Term("body", "te")); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + // wrong field + BooleanQuery bq = new BooleanQuery(); + bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD); + bq.add(new PrefixQuery(new Term("bogus", "te")), BooleanClause.Occur.SHOULD); + topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + snippets = highlighter.highlight("body", bq, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + ir.close(); + dir.close(); + } + + public void testOneRegexp() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." 
is a token) + final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test."); + iw.addDocument(doc); + body.setStringValue("Test a one sentence document."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + }; + Query query = new RegexpQuery(new Term("body", "te.*")); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + // wrong field + BooleanQuery bq = new BooleanQuery(); + bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD); + bq.add(new RegexpQuery(new Term("bogus", "te.*")), BooleanClause.Occur.SHOULD); + topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + snippets = highlighter.highlight("body", bq, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + ir.close(); + dir.close(); + } + + public void testOneFuzzy() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." 
is a token) + final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test."); + iw.addDocument(doc); + body.setStringValue("Test a one sentence document."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + }; + Query query = new FuzzyQuery(new Term("body", "tets"), 1); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + // with prefix + query = new FuzzyQuery(new Term("body", "tets"), 1, 2); + topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + snippets = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + // wrong field + BooleanQuery bq = new BooleanQuery(); + bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD); + bq.add(new FuzzyQuery(new Term("bogus", "tets"), 1), BooleanClause.Occur.SHOULD); + topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + snippets = highlighter.highlight("body", bq, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + ir.close(); + dir.close(); + } + + public void testRanges() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." 
is a token) + final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test."); + iw.addDocument(doc); + body.setStringValue("Test a one sentence document."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + }; + Query query = TermRangeQuery.newStringRange("body", "ta", "tf", true, true); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + // null start + query = TermRangeQuery.newStringRange("body", null, "tf", true, true); + topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + snippets = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + // null end + query = TermRangeQuery.newStringRange("body", "ta", null, true, true); + topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + snippets = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + // exact start inclusive + query = TermRangeQuery.newStringRange("body", "test", "tf", true, true); + topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + snippets = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + // exact end inclusive + query = TermRangeQuery.newStringRange("body", "ta", "test", true, true); + topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + snippets = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + // exact start exclusive + BooleanQuery bq = new BooleanQuery(); + bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD); + bq.add(TermRangeQuery.newStringRange("body", "test", "tf", false, true), BooleanClause.Occur.SHOULD); + topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + snippets = highlighter.highlight("body", bq, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); 
+ + // exact end exclusive + bq = new BooleanQuery(); + bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD); + bq.add(TermRangeQuery.newStringRange("body", "ta", "test", true, false), BooleanClause.Occur.SHOULD); + topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + snippets = highlighter.highlight("body", bq, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + // wrong field + bq = new BooleanQuery(); + bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD); + bq.add(TermRangeQuery.newStringRange("bogus", "ta", "tf", true, true), BooleanClause.Occur.SHOULD); + topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + snippets = highlighter.highlight("body", bq, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + ir.close(); + dir.close(); + } + + public void testWildcardInBoolean() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." is a token) + final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test."); + iw.addDocument(doc); + body.setStringValue("Test a one sentence document."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + }; + BooleanQuery query = new BooleanQuery(); + query.add(new WildcardQuery(new Term("body", "te*")), BooleanClause.Occur.SHOULD); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + // must not + query = new BooleanQuery(); + query.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD); + query.add(new WildcardQuery(new Term("bogus", "te*")), BooleanClause.Occur.MUST_NOT); + topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + snippets = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + ir.close(); + dir.close(); + } + + public void testWildcardInDisjunctionMax() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." 
is a token) + final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test."); + iw.addDocument(doc); + body.setStringValue("Test a one sentence document."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + }; + DisjunctionMaxQuery query = new DisjunctionMaxQuery(0); + query.add(new WildcardQuery(new Term("body", "te*"))); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + ir.close(); + dir.close(); + } + + public void testSpanWildcard() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." is a token) + final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test."); + iw.addDocument(doc); + body.setStringValue("Test a one sentence document."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + }; + Query query = new SpanMultiTermQueryWrapper(new WildcardQuery(new Term("body", "te*"))); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + ir.close(); + dir.close(); + } + + public void testSpanOr() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." 
is a token) + final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test."); + iw.addDocument(doc); + body.setStringValue("Test a one sentence document."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + }; + SpanQuery childQuery = new SpanMultiTermQueryWrapper(new WildcardQuery(new Term("body", "te*"))); + Query query = new SpanOrQuery(new SpanQuery[] { childQuery }); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + ir.close(); + dir.close(); + } + + public void testSpanNear() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." is a token) + final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test."); + iw.addDocument(doc); + body.setStringValue("Test a one sentence document."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + }; + SpanQuery childQuery = new SpanMultiTermQueryWrapper(new WildcardQuery(new Term("body", "te*"))); + Query query = new SpanNearQuery(new SpanQuery[] { childQuery }, 0, true); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + ir.close(); + dir.close(); + } + + public void testSpanNot() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." 
is a token) + final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test."); + iw.addDocument(doc); + body.setStringValue("Test a one sentence document."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + }; + SpanQuery include = new SpanMultiTermQueryWrapper(new WildcardQuery(new Term("body", "te*"))); + SpanQuery exclude = new SpanTermQuery(new Term("body", "bogus")); + Query query = new SpanNotQuery(include, exclude); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + ir.close(); + dir.close(); + } + + public void testSpanPositionCheck() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." is a token) + final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test."); + iw.addDocument(doc); + body.setStringValue("Test a one sentence document."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + }; + SpanQuery childQuery = new SpanMultiTermQueryWrapper(new WildcardQuery(new Term("body", "te*"))); + Query query = new SpanFirstQuery(childQuery, 1000000); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(2, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(2, snippets.length); + assertEquals("This is a test.", snippets[0]); + assertEquals("Test a one sentence document.", snippets[1]); + + ir.close(); + dir.close(); + } + + /** Runs a query with two MTQs and confirms the formatter + * can tell which query matched which hit. */ + public void testWhichMTQMatched() throws Exception { + Directory dir = newDirectory(); + // use simpleanalyzer for more natural tokenization (else "test." 
is a token) + final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("Test a one sentence document."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + }; + BooleanQuery query = new BooleanQuery(); + query.add(new WildcardQuery(new Term("body", "te*")), BooleanClause.Occur.SHOULD); + query.add(new WildcardQuery(new Term("body", "one")), BooleanClause.Occur.SHOULD); + query.add(new WildcardQuery(new Term("body", "se*")), BooleanClause.Occur.SHOULD); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(1, snippets.length); + + // Default formatter just bolds each hit: + assertEquals("Test a one sentence document.", snippets[0]); + + // Now use our own formatter, that also stuffs the + // matching term's text into the result: + highlighter = new PostingsHighlighter() { + @Override + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; + } + + @Override + protected PassageFormatter getFormatter(String field) { + return new PassageFormatter() { + + @Override + public Object format(Passage passages[], String content) { + // Copied from DefaultPassageFormatter, but + // tweaked to include the matched term: + StringBuilder sb = new StringBuilder(); + int pos = 0; + for (Passage passage : passages) { + // don't add ellipsis if its the first one, or if its connected. + if (passage.startOffset > pos && pos > 0) { + sb.append("... "); + } + pos = passage.startOffset; + for (int i = 0; i < passage.numMatches; i++) { + int start = passage.matchStarts[i]; + int end = passage.matchEnds[i]; + // its possible to have overlapping terms + if (start > pos) { + sb.append(content, pos, start); + } + if (end > pos) { + sb.append(""); + sb.append(content, Math.max(pos, start), end); + sb.append('('); + sb.append(passage.getMatchTerms()[i].utf8ToString()); + sb.append(')'); + sb.append(""); + pos = end; + } + } + // its possible a "term" from the analyzer could span a sentence boundary. 
+ sb.append(content, pos, Math.max(pos, passage.endOffset)); + pos = passage.endOffset; + } + return sb.toString(); + } + }; + } + }; + + assertEquals(1, topDocs.totalHits); + snippets = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(1, snippets.length); + + // Default formatter bolds each hit: + assertEquals("Test(body:te*) a one(body:one) sentence(body:se*) document.", snippets[0]); + + ir.close(); + dir.close(); + } +} diff --git a/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java index 012c9dedadb..a11e1a048f1 100644 --- a/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java +++ b/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java @@ -24,6 +24,7 @@ import java.util.Locale; import java.util.Map; import java.util.Set; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.StoredDocument; import org.apache.lucene.search.Query; import org.apache.lucene.search.postingshighlight.DefaultPassageFormatter; @@ -68,6 +69,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized; * <str name="hl.bs.type">SENTENCE</str> * <int name="hl.maxAnalyzedChars">10000</int> * <str name="hl.multiValuedSeparatorChar"> </str> + * <bool name="hl.highlightMultiTerm">false</bool> * </lst> * </requestHandler> * @@ -98,6 +100,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized; *
  • hl.bs.variant (string) specifies variant code for BreakIterator. default is empty string (root locale) *
  • hl.maxAnalyzedChars specifies how many characters at most will be processed in a document. *
  • hl.multiValuedSeparatorChar specifies the logical separator between values for multi-valued fields. + *
  • hl.highlightMultiTerm enables highlighting for range/wildcard/fuzzy/prefix queries. * NOTE: currently hl.maxAnalyzedChars cannot yet be specified per-field * * @@ -132,6 +135,8 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn maxPassages[i] = params.getFieldInt(fieldNames[i], HighlightParams.SNIPPETS, 1); } + final IndexSchema schema = req.getSchema(); + PostingsHighlighter highlighter = new PostingsHighlighter(maxLength) { @Override protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { @@ -178,6 +183,15 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn } return sep.charAt(0); } + + @Override + protected Analyzer getIndexAnalyzer(String field) { + if (params.getFieldBool(field, HighlightParams.HIGHLIGHT_MULTI_TERM, false)) { + return schema.getAnalyzer(); + } else { + return null; + } + } }; Map snippets = highlighter.highlightFields(fieldNames, query, searcher, docIDs, maxPassages); diff --git a/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java b/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java index 094fb4ea9ac..102533930c8 100644 --- a/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java +++ b/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java @@ -155,4 +155,12 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 { req("q", "text:document", "sort", "id asc", "hl", "true", "hl.encoder", "html"), "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='Document one has a first <i>sentence</i>.'"); } + + public void testWildcard() { + assertQ("simplest test", + req("q", "text:doc*ment", "sort", "id asc", "hl", "true", "hl.highlightMultiTerm", "true"), + "count(//lst[@name='highlighting']/*)=2", + "//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/str='document one'", + "//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='second document'"); + } }
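
Note (not part of the commit): every test above follows the same pattern, so for quick reference here is a minimal sketch of the new hook in use. The field name "body" and StandardAnalyzer are illustrative stand-ins for whatever analyzer was used at index time; the patch itself only requires that the field was indexed with offsets and that getIndexAnalyzer returns non-null.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.util.Version;

// sketch: searcher is an IndexSearcher over an index whose "body" field
// was indexed with DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
static String[] highlightWildcard(IndexSearcher searcher) throws Exception {
  final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46); // assumption: the index-time analyzer
  PostingsHighlighter highlighter = new PostingsHighlighter() {
    @Override
    protected Analyzer getIndexAnalyzer(String field) {
      return analyzer; // returning non-null enables multi-term highlighting for this field
    }
  };
  Query query = new WildcardQuery(new Term("body", "te*"));
  TopDocs topDocs = searcher.search(query, 10);
  return highlighter.highlight("body", query, searcher, topDocs);
}

On the Solr side the same behavior is toggled per request or per field with hl.highlightMultiTerm=true (default false), as exercised by testWildcard above.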