From c5b49baa592c0ecdae600ba0b1f31bc347bdc5f5 Mon Sep 17 00:00:00 2001
From: David Wayne Smiley <dsmiley@apache.org>
Date: Sun, 13 Sep 2015 03:54:46 +0000
Subject: [PATCH] LUCENE-5503: Highlighter WSTE didn't always convert a
 PhraseQuery to a SpanQuery correctly.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1702695 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                            |  5 ++
 .../highlight/WeightedSpanTermExtractor.java  | 53 +++++---------
 .../highlight/HighlighterPhraseTest.java      | 72 +++++++++++++++++++
 3 files changed, 96 insertions(+), 34 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4aefbc92c76..3ccaa4cdace 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -126,6 +126,11 @@ Bug Fixes
 * LUCENE-6792: Fix TermsQuery.toString() to work with binary terms.
   (Ruslan Muzhikov, Robert Muir)
 
+* LUCENE-5503: When Highlighter's WeightedSpanTermExtractor converts a
+  PhraseQuery to an equivalent SpanQuery, it would sometimes use a slop that is
+  too low (no highlight) or determine inOrder wrong.
+  (Tim Allison via David Smiley)
+
 Other
 
 * LUCENE-6174: Improve "ant eclipse" to select right JRE for building.
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
index 8e8c8cb8637..da572330b60 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
@@ -17,6 +17,16 @@ package org.apache.lucene.search.highlight;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.BinaryDocValues;
@@ -59,17 +69,6 @@ import org.apache.lucene.search.spans.Spans;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.IOUtils;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-
 /**
  * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether 
  * {@link Term}s from the {@link Query} are contained in a supplied {@link TokenStream}.
@@ -121,33 +120,19 @@ public class WeightedSpanTermExtractor {
       for (int i = 0; i < phraseQueryTerms.length; i++) {
         clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
       }
-      int slop = phraseQuery.getSlop();
+
+      // sum position increments beyond 1
+      int positionGaps = 0;
       int[] positions = phraseQuery.getPositions();
-      // add largest position increment to slop
-      if (positions.length > 0) {
-        int lastPos = positions[0];
-        int largestInc = 0;
-        int sz = positions.length;
-        for (int i = 1; i < sz; i++) {
-          int pos = positions[i];
-          int inc = pos - lastPos;
-          if (inc > largestInc) {
-            largestInc = inc;
-          }
-          lastPos = pos;
-        }
-        if(largestInc > 1) {
-          slop += largestInc;
-        }
+      if (positions.length >= 2) {
+        // positions are in increasing order.   max(0,...) is just a safeguard.
+        positionGaps = Math.max(0, positions[positions.length-1] - positions[0] - positions.length + 1);
       }
 
-      boolean inorder = false;
+      //if original slop is 0 then require inOrder
+      boolean inorder = (phraseQuery.getSlop() == 0);
 
-      if (slop == 0) {
-        inorder = true;
-      }
-
-      SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
+      SpanNearQuery sp = new SpanNearQuery(clauses, phraseQuery.getSlop() + positionGaps, inorder);
       extractWeightedSpanTerms(terms, sp, boost);
     } else if (query instanceof TermQuery) {
       extractWeightedTerms(terms, query, boost);
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
index 6828f8d9fab..afb5c7b7631 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
@@ -20,10 +20,12 @@ package org.apache.lucene.search.highlight;
 import java.io.IOException;
 
 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
@@ -268,6 +270,76 @@ public class HighlighterPhraseTest extends LuceneTestCase {
     }
   }
 
+  //shows the need to sum the increments in WeightedSpanTermExtractor
+  public void testStopWords() throws IOException, InvalidTokenOffsetsException {
+    MockAnalyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
+        MockTokenFilter.ENGLISH_STOPSET);    
+    final String TEXT = "the ab the the cd the the the ef the";
+    final Directory directory = newDirectory();
+    try (IndexWriter indexWriter = new IndexWriter(directory,
+        newIndexWriterConfig(stopAnalyzer))) {
+      final Document document = new Document();
+      document.add(newTextField(FIELD, TEXT, Store.YES));
+      indexWriter.addDocument(document);
+    }
+    try (IndexReader indexReader = DirectoryReader.open(directory)) {
+      assertEquals(1, indexReader.numDocs());
+      final IndexSearcher indexSearcher = newSearcher(indexReader);
+      //equivalent of "ab the the cd the the the ef"
+      final PhraseQuery phraseQuery = new PhraseQuery.Builder()
+          .add(new Term(FIELD, "ab"), 0)
+          .add(new Term(FIELD, "cd"), 3)
+          .add(new Term(FIELD, "ef"), 7).build();
+
+      TopDocs hits = indexSearcher.search(phraseQuery, 100);
+      assertEquals(1, hits.totalHits);
+      final Highlighter highlighter = new Highlighter(
+          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
+          new QueryScorer(phraseQuery));
+      assertEquals(1, highlighter.getBestFragments(stopAnalyzer, FIELD, TEXT, 10).length);
+    } finally {
+      directory.close();
+    }
+  }
+  
+  //shows the need to require inOrder if getSlop() == 0, not if final slop == 0 
+  //in WeightedSpanTermExtractor
+  public void testInOrderWithStopWords() throws IOException, InvalidTokenOffsetsException {
+    MockAnalyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
+        MockTokenFilter.ENGLISH_STOPSET);        
+    final String TEXT = "the cd the ab the the the the the the the ab the cd the";
+    final Directory directory = newDirectory();
+    try (IndexWriter indexWriter = new IndexWriter(directory,
+        newIndexWriterConfig(stopAnalyzer))) {
+      final Document document = new Document();
+      document.add(newTextField(FIELD, TEXT, Store.YES));
+      indexWriter.addDocument(document);
+    }
+    try (IndexReader indexReader = DirectoryReader.open(directory)) {
+      assertEquals(1, indexReader.numDocs());
+      final IndexSearcher indexSearcher = newSearcher(indexReader);
+      //equivalent of "ab the cd"
+      final PhraseQuery phraseQuery = new PhraseQuery.Builder()
+          .add(new Term(FIELD, "ab"), 0)
+          .add(new Term(FIELD, "cd"), 2).build();
+
+      TopDocs hits = indexSearcher.search(phraseQuery, 100);
+      assertEquals(1, hits.totalHits);
+
+      final Highlighter highlighter = new Highlighter(
+          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
+          new QueryScorer(phraseQuery));
+      String[] frags = highlighter.getBestFragments(stopAnalyzer, FIELD, TEXT, 10);
+      assertEquals(1, frags.length);
+      assertTrue("contains <B>ab</B> the <B>cd</B>",
+          (frags[0].contains("<B>ab</B> the <B>cd</B>")));
+      assertTrue("does not contain <B>cd</B> the <B>ab</B>",
+          (!frags[0].contains("<B>cd</B> the <B>ab</B>")));
+    } finally {
+      directory.close();
+    }
+  }
+
   private static final class TokenStreamSparse extends TokenStream {
     private Token[] tokens;