LUCENE-5503: Highlighter WSTE didn't always convert a PhraseQuery to a SpanQuery correctly.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1702695 13f79535-47bb-0310-9956-ffa450edef68
2025-02-14 22:16:00 +00:00 · 2015-09-13 03:54:46 +00:00 · 2015-09-13 03:54:46 +00:00 · c5b49baa59
commit c5b49baa59
parent 01adad8899
3 changed files with 96 additions and 34 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -126,6 +126,11 @@ Bug Fixes
 * LUCENE-6792: Fix TermsQuery.toString() to work with binary terms.
  (Ruslan Muzhikov, Robert Muir)

+* LUCENE-5503: When Highlighter's WeightedSpanTermExtractor converts a
+  PhraseQuery to an equivalent SpanQuery, it would sometimes use a slop that is
+  too low (no highlight) or determine inOrder wrong.
+  (Tim Allison via David Smiley)
+
 Other

 * LUCENE-6174: Improve "ant eclipse" to select right JRE for building.
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
@ -17,6 +17,16 @@ package org.apache.lucene.search.highlight;
 * limitations under the License.
 */

+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.BinaryDocValues;
@ -59,17 +69,6 @@ import org.apache.lucene.search.spans.Spans;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.IOUtils;

-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-
 /**
 * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether 
 * {@link Term}s from the {@link Query} are contained in a supplied {@link TokenStream}.
@ -121,33 +120,19 @@ public class WeightedSpanTermExtractor {
      for (int i = 0; i < phraseQueryTerms.length; i++) {
        clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
      }
-      int slop = phraseQuery.getSlop();
+
+      // sum position increments beyond 1
+      int positionGaps = 0;
      int[] positions = phraseQuery.getPositions();
-      // add largest position increment to slop
-      if (positions.length > 0) {
-        int lastPos = positions[0];
-        int largestInc = 0;
-        int sz = positions.length;
-        for (int i = 1; i < sz; i++) {
-          int pos = positions[i];
-          int inc = pos - lastPos;
-          if (inc > largestInc) {
-            largestInc = inc;
-          }
-          lastPos = pos;
-        }
-        if(largestInc > 1) {
-          slop += largestInc;
-        }
+      if (positions.length >= 2) {
+        // positions are in increasing order.   max(0,...) is just a safeguard.
+        positionGaps = Math.max(0, positions[positions.length-1] - positions[0] - positions.length + 1);
      }

-      boolean inorder = false;
+      //if original slop is 0 then require inOrder
+      boolean inorder = (phraseQuery.getSlop() == 0);

-      if (slop == 0) {
-        inorder = true;
-      }
-
-      SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
+      SpanNearQuery sp = new SpanNearQuery(clauses, phraseQuery.getSlop() + positionGaps, inorder);
      extractWeightedSpanTerms(terms, sp, boost);
    } else if (query instanceof TermQuery) {
      extractWeightedTerms(terms, query, boost);
--- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
@ -20,10 +20,12 @@ package org.apache.lucene.search.highlight;
 import java.io.IOException;

 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
@ -268,6 +270,76 @@ public class HighlighterPhraseTest extends LuceneTestCase {
    }
  }

+  //shows the need to sum the increments in WeightedSpanTermExtractor
+  public void testStopWords() throws IOException, InvalidTokenOffsetsException {
+    MockAnalyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
+        MockTokenFilter.ENGLISH_STOPSET);    
+    final String TEXT = "the ab the the cd the the the ef the";
+    final Directory directory = newDirectory();
+    try (IndexWriter indexWriter = new IndexWriter(directory,
+        newIndexWriterConfig(stopAnalyzer))) {
+      final Document document = new Document();
+      document.add(newTextField(FIELD, TEXT, Store.YES));
+      indexWriter.addDocument(document);
+    }
+    try (IndexReader indexReader = DirectoryReader.open(directory)) {
+      assertEquals(1, indexReader.numDocs());
+      final IndexSearcher indexSearcher = newSearcher(indexReader);
+      //equivalent of "ab the the cd the the the ef"
+      final PhraseQuery phraseQuery = new PhraseQuery.Builder()
+          .add(new Term(FIELD, "ab"), 0)
+          .add(new Term(FIELD, "cd"), 3)
+          .add(new Term(FIELD, "ef"), 7).build();
+
+      TopDocs hits = indexSearcher.search(phraseQuery, 100);
+      assertEquals(1, hits.totalHits);
+      final Highlighter highlighter = new Highlighter(
+          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
+          new QueryScorer(phraseQuery));
+      assertEquals(1, highlighter.getBestFragments(stopAnalyzer, FIELD, TEXT, 10).length);
+    } finally {
+      directory.close();
+    }
+  }
+  
+  //shows the need to require inOrder if getSlop() == 0, not if final slop == 0 
+  //in WeightedSpanTermExtractor
+  public void testInOrderWithStopWords() throws IOException, InvalidTokenOffsetsException {
+    MockAnalyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
+        MockTokenFilter.ENGLISH_STOPSET);        
+    final String TEXT = "the cd the ab the the the the the the the ab the cd the";
+    final Directory directory = newDirectory();
+    try (IndexWriter indexWriter = new IndexWriter(directory,
+        newIndexWriterConfig(stopAnalyzer))) {
+      final Document document = new Document();
+      document.add(newTextField(FIELD, TEXT, Store.YES));
+      indexWriter.addDocument(document);
+    }
+    try (IndexReader indexReader = DirectoryReader.open(directory)) {
+      assertEquals(1, indexReader.numDocs());
+      final IndexSearcher indexSearcher = newSearcher(indexReader);
+      //equivalent of "ab the cd"
+      final PhraseQuery phraseQuery = new PhraseQuery.Builder()
+          .add(new Term(FIELD, "ab"), 0)
+          .add(new Term(FIELD, "cd"), 2).build();
+
+      TopDocs hits = indexSearcher.search(phraseQuery, 100);
+      assertEquals(1, hits.totalHits);
+
+      final Highlighter highlighter = new Highlighter(
+          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
+          new QueryScorer(phraseQuery));
+      String[] frags = highlighter.getBestFragments(stopAnalyzer, FIELD, TEXT, 10);
+      assertEquals(1, frags.length);
+      assertTrue("contains <B>ab</B> the <B>cd</B>",
+          (frags[0].contains("<B>ab</B> the <B>cd</B>")));
+      assertTrue("does not contain <B>cd</B> the <B>ab</B>",
+          (!frags[0].contains("<B>cd</B> the <B>ab</B>")));
+    } finally {
+      directory.close();
+    }
+  }
+
  private static final class TokenStreamSparse extends TokenStream {
    private Token[] tokens;