From c5b49baa592c0ecdae600ba0b1f31bc347bdc5f5 Mon Sep 17 00:00:00 2001 From: David Wayne Smiley Date: Sun, 13 Sep 2015 03:54:46 +0000 Subject: [PATCH] LUCENE-5503: Highlighter WSTE didn't always convert a PhraseQuery to a SpanQuery correctly. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1702695 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 5 ++ .../highlight/WeightedSpanTermExtractor.java | 53 +++++--------- .../highlight/HighlighterPhraseTest.java | 72 +++++++++++++++++++ 3 files changed, 96 insertions(+), 34 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4aefbc92c76..3ccaa4cdace 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -126,6 +126,11 @@ Bug Fixes * LUCENE-6792: Fix TermsQuery.toString() to work with binary terms. (Ruslan Muzhikov, Robert Muir) +* LUCENE-5503: When Highlighter's WeightedSpanTermExtractor converts a + PhraseQuery to an equivalent SpanQuery, it would sometimes use a slop that is + too low (no highlight) or determine inOrder wrong. + (Tim Allison via David Smiley) + Other * LUCENE-6174: Improve "ant eclipse" to select right JRE for building. diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java index 8e8c8cb8637..da572330b60 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java @@ -17,6 +17,16 @@ package org.apache.lucene.search.highlight; * limitations under the License. */ +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.BinaryDocValues; @@ -59,17 +69,6 @@ import org.apache.lucene.search.spans.Spans; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - - /** * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether * {@link Term}s from the {@link Query} are contained in a supplied {@link TokenStream}. @@ -121,33 +120,19 @@ public class WeightedSpanTermExtractor { for (int i = 0; i < phraseQueryTerms.length; i++) { clauses[i] = new SpanTermQuery(phraseQueryTerms[i]); } - int slop = phraseQuery.getSlop(); + + // sum position increments beyond 1 + int positionGaps = 0; int[] positions = phraseQuery.getPositions(); - // add largest position increment to slop - if (positions.length > 0) { - int lastPos = positions[0]; - int largestInc = 0; - int sz = positions.length; - for (int i = 1; i < sz; i++) { - int pos = positions[i]; - int inc = pos - lastPos; - if (inc > largestInc) { - largestInc = inc; - } - lastPos = pos; - } - if(largestInc > 1) { - slop += largestInc; - } + if (positions.length >= 2) { + // positions are in increasing order. max(0,...) is just a safeguard. + positionGaps = Math.max(0, positions[positions.length-1] - positions[0] - positions.length + 1); } - boolean inorder = false; + //if original slop is 0 then require inOrder + boolean inorder = (phraseQuery.getSlop() == 0); - if (slop == 0) { - inorder = true; - } - - SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder); + SpanNearQuery sp = new SpanNearQuery(clauses, phraseQuery.getSlop() + positionGaps, inorder); extractWeightedSpanTerms(terms, sp, boost); } else if (query instanceof TermQuery) { extractWeightedTerms(terms, query, boost); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java index 6828f8d9fab..afb5c7b7631 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java @@ -20,10 +20,12 @@ package org.apache.lucene.search.highlight; import java.io.IOException; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenFilter; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.document.Field.Store; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.Document; @@ -268,6 +270,76 @@ public class HighlighterPhraseTest extends LuceneTestCase { } } + //shows the need to sum the increments in WeightedSpanTermExtractor + public void testStopWords() throws IOException, InvalidTokenOffsetsException { + MockAnalyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, + MockTokenFilter.ENGLISH_STOPSET); + final String TEXT = "the ab the the cd the the the ef the"; + final Directory directory = newDirectory(); + try (IndexWriter indexWriter = new IndexWriter(directory, + newIndexWriterConfig(stopAnalyzer))) { + final Document document = new Document(); + document.add(newTextField(FIELD, TEXT, Store.YES)); + indexWriter.addDocument(document); + } + try (IndexReader indexReader = DirectoryReader.open(directory)) { + assertEquals(1, indexReader.numDocs()); + final IndexSearcher indexSearcher = newSearcher(indexReader); + //equivalent of "ab the the cd the the the ef" + final PhraseQuery phraseQuery = new PhraseQuery.Builder() + .add(new Term(FIELD, "ab"), 0) + .add(new Term(FIELD, "cd"), 3) + .add(new Term(FIELD, "ef"), 7).build(); + + TopDocs hits = indexSearcher.search(phraseQuery, 100); + assertEquals(1, hits.totalHits); + final Highlighter highlighter = new Highlighter( + new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), + new QueryScorer(phraseQuery)); + assertEquals(1, highlighter.getBestFragments(stopAnalyzer, FIELD, TEXT, 10).length); + } finally { + directory.close(); + } + } + + //shows the need to require inOrder if getSlop() == 0, not if final slop == 0 + //in WeightedSpanTermExtractor + public void testInOrderWithStopWords() throws IOException, InvalidTokenOffsetsException { + MockAnalyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, + MockTokenFilter.ENGLISH_STOPSET); + final String TEXT = "the cd the ab the the the the the the the ab the cd the"; + final Directory directory = newDirectory(); + try (IndexWriter indexWriter = new IndexWriter(directory, + newIndexWriterConfig(stopAnalyzer))) { + final Document document = new Document(); + document.add(newTextField(FIELD, TEXT, Store.YES)); + indexWriter.addDocument(document); + } + try (IndexReader indexReader = DirectoryReader.open(directory)) { + assertEquals(1, indexReader.numDocs()); + final IndexSearcher indexSearcher = newSearcher(indexReader); + //equivalent of "ab the cd" + final PhraseQuery phraseQuery = new PhraseQuery.Builder() + .add(new Term(FIELD, "ab"), 0) + .add(new Term(FIELD, "cd"), 2).build(); + + TopDocs hits = indexSearcher.search(phraseQuery, 100); + assertEquals(1, hits.totalHits); + + final Highlighter highlighter = new Highlighter( + new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), + new QueryScorer(phraseQuery)); + String[] frags = highlighter.getBestFragments(stopAnalyzer, FIELD, TEXT, 10); + assertEquals(1, frags.length); + assertTrue("contains ab the cd", + (frags[0].contains("ab the cd"))); + assertTrue("does not contain cd the ab", + (!frags[0].contains("cd the ab"))); + } finally { + directory.close(); + } + } + private static final class TokenStreamSparse extends TokenStream { private Token[] tokens;