LUCENE-8306: Allow iteration over submatches

2018-07-22 20:59:50 +01:00 · 2018-07-22 20:59:50 +01:00 · a8839b7eab
parent 2826a9550b
commit a8839b7eab
13 changed files with 656 additions and 44 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -198,6 +198,9 @@ Improvements
 * LUCENE-8345, GitHub PR #392: Remove instantiation of redundant wrapper classes for primitives;
  add wrapper class constructors to forbiddenapis.  (Michael Braun via Uwe Schindler)

+* LUCENE-8306: Matches API now allows iteration over sub-matches in Spans (Alan Woodward,
+  Jim Ferenczi, David Smiley)
+
 Other:

 * LUCENE-8366: Upgrade to ICU 62.1. Emoji handling now uses Unicode 11's
--- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java
@ -158,4 +158,13 @@ final class DisjunctionMatchesIterator implements MatchesIterator {
    return queue.top().endOffset();
  }

+  // Sub-matches are taken from whichever wrapped iterator is currently at the top of the queue.
+  @Override
+  public MatchesIterator getSubMatches() throws IOException {
+    return queue.top().getSubMatches();
+  }
+
+  // The label is that of whichever wrapped iterator is currently at the top of the queue.
+  @Override
+  public Object label() {
+    return queue.top().label();
+  }
 }
--- a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseMatcher.java
+++ b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseMatcher.java
@ -149,4 +149,48 @@ final class ExactPhraseMatcher extends PhraseMatcher {
    return postings[postings.length - 1].postings.endOffset();
  }

+  /**
+   * Exposes each entry of the {@code postings} array as its own sub-match of the current
+   * phrase match, visited in array order.
+   */
+  @Override
+  MatchesIterator getSubMatches() {
+    return new MatchesIterator() {
+
+      // index into postings of the current sub-match; -1 until next() is first called
+      int upTo = -1;
+
+      @Override
+      public boolean next() throws IOException {
+        upTo++;
+        return upTo < postings.length;
+      }
+
+      @Override
+      public int startPosition() {
+        return postings[upTo].pos;
+      }
+
+      @Override
+      public int endPosition() {
+        // start and end are the same value: each sub-match covers a single position
+        return postings[upTo].pos;
+      }
+
+      @Override
+      public int startOffset() throws IOException {
+        // read from the underlying postings when asked, not copied up front
+        return postings[upTo].postings.startOffset();
+      }
+
+      @Override
+      public int endOffset() throws IOException {
+        return postings[upTo].postings.endOffset();
+      }
+
+      @Override
+      public MatchesIterator getSubMatches() throws IOException {
+        // a sub-match has no further sub-matches
+        return MatchesIterator.EMPTY_ITERATOR;
+      }
+
+      @Override
+      public Object label() {
+        // the anonymous sub-match iterator itself serves as the label
+        return this;
+      }
+    };
+  }
+
 }
--- a/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java
@ -28,6 +28,9 @@ import org.apache.lucene.index.LeafReaderContext;
 * positions and/or offsets after each call.  You should not call the position or offset methods
 * before {@link #next()} has been called, or after {@link #next()} has returned {@code false}.
 *
+ * Matches from some queries may span multiple positions.  You can retrieve the positions of
+ * individual matching terms on the current match by calling {@link #getSubMatches()}.
+ *
 * Matches are ordered by start position, and then by end position.  Match intervals may overlap.
 *
 * @see Weight#matches(LeafReaderContext, int)
@ -70,4 +73,59 @@ public interface MatchesIterator {
   */
  int endOffset() throws IOException;

+  /**
+   * Returns a MatchesIterator that iterates over the positions and offsets of individual
+   * terms within the current match.
+   *
+   * Should only be called after {@link #next()} has returned {@code true}.
+   *
+   * @return an iterator over the sub-matches of the current match
+   */
+  MatchesIterator getSubMatches() throws IOException;
+
+  /**
+   * Returns a label identifying the leaf query causing the current match.
+   *
+   * Should only be called after {@link #next()} has returned {@code true}.
+   *
+   * @return the label object for the current match
+   */
+  Object label();
+
+  /**
+   * A MatchesIterator that is immediately exhausted: {@link #next()} always returns
+   * {@code false}, and the position, offset and sub-match accessors throw
+   * {@link UnsupportedOperationException}.
+   */
+  MatchesIterator EMPTY_ITERATOR = new MatchesIterator() {
+    @Override
+    public boolean next() throws IOException {
+      return false;
+    }
+
+    @Override
+    public int startPosition() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int endPosition() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int startOffset() throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int endOffset() throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public MatchesIterator getSubMatches() throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public Object label() {
+      // unlike the other accessors this does not throw; it returns the shared instance itself
+      return this;
+    }
+  };
+
 }
--- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
@ -269,7 +269,7 @@ public class MultiPhraseQuery extends Query {
            TermState termState = termStates.get(term).get(context);
            if (termState != null) {
              termsEnum.seekExact(term.bytes(), termState);
-              postings.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS));
+              postings.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS));
              totalMatchCost += PhraseQuery.termPositionsCost(termsEnum);
            }
          }
@ -294,7 +294,7 @@ public class MultiPhraseQuery extends Query {
          return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
        }
        else {
-          return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost);
+          return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
        }

      }
@ -647,5 +647,6 @@ public class MultiPhraseQuery extends Query {
    public BytesRef getPayload() throws IOException {
      return posQueue.top().pe.getPayload();
    }
+
  }
 }
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseMatcher.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseMatcher.java
@ -88,4 +88,6 @@ abstract class PhraseMatcher {
  public float getMatchCost() {
    return matchCost;
  }
+
+  /**
+   * Returns an iterator over the sub-matches of the current match.
+   */
+  abstract MatchesIterator getSubMatches() throws IOException;
 }
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@ -446,7 +446,7 @@ public class PhraseQuery extends Query {
            return null;
          }
          te.seekExact(t.bytes(), state);
-          PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
+          PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS);
          postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions[i], t);
          totalMatchCost += termPositionsCost(te);
        }
@ -457,7 +457,7 @@ public class PhraseQuery extends Query {
          return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
        }
        else {
-          return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost);
+          return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
        }
      }

--- a/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java
@ -123,6 +123,16 @@ abstract class PhraseWeight extends Weight {
        public int endOffset() throws IOException {
          return matcher.endOffset();
        }
+
+        // Sub-matches are produced by the phrase matcher for its current match.
+        @Override
+        public MatchesIterator getSubMatches() throws IOException {
+          return matcher.getSubMatches();
+        }
+
+        // The matcher instance itself is used as the label for every match of this iterator.
+        @Override
+        public Object label() {
+          return matcher;
+        }
      };
    });
  }
--- a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseMatcher.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseMatcher.java
@ -54,13 +54,14 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
  private final int slop;
  private final int numPostings;
  private final PhraseQueue pq; // for advancing min position
+  private final boolean captureLeadMatch;

  private int end; // current largest phrase position

  private int leadPosition;
  private int leadOffset;
-  private int currentEndPostings;
-  private int advanceEndPostings;
+  private int leadEndOffset;
+  private int leadOrd;

  private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc)
  private boolean checkedRpts; // flag to only check for repetitions in first candidate doc
@ -71,10 +72,11 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
  private boolean positioned;
  private int matchLength;

-  SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost) {
+  SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost, boolean captureLeadMatch) {
    super(approximation(postings), matchCost);
    this.slop = slop;
    this.numPostings = postings.length;
+    this.captureLeadMatch = captureLeadMatch;
    pq = new PhraseQueue(postings.length);
    phrasePositions = new PhrasePositions[postings.length];
    for (int i = 0; i < postings.length; ++i) {
@ -120,10 +122,8 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
      return false;
    }
    PhrasePositions pp = pq.pop();
-    assert pp != null;  // if the pq is empty, then positioned == false
-    leadPosition = pp.position + pp.offset;
-    leadOffset = pp.postings.startOffset();
-    currentEndPostings = advanceEndPostings;
+    assert pp != null;  // if the pq is not full, then positioned == false
+    captureLead(pp);
    matchLength = end - pp.position;
    int next = pq.top().position; 
    while (advancePP(pp)) {
@ -137,6 +137,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
        }
        pp = pq.pop();
        next = pq.top().position;
+        assert pp != null;  // if the pq is not full, then positioned == false
        matchLength = end - pp.position;
      } else {
        int matchLength2 = end - pp.position;
@ -144,14 +145,22 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
          matchLength = matchLength2;
        }
      }
-      leadPosition = pp.position + pp.offset;
-      leadOffset = pp.postings.startOffset();
-      currentEndPostings = advanceEndPostings;
+      captureLead(pp);
    }
    positioned = false;
    return matchLength <= slop;
  }

+  /**
+   * Remembers {@code pp} as the lead of the current candidate match by copying its ord,
+   * position and start/end offsets into the {@code lead*} fields.  Does nothing when
+   * {@code captureLeadMatch} is false.
+   */
+  private void captureLead(PhrasePositions pp) throws IOException {
+    // NOTE(review): presumably skipped because offsets are not available from the postings
+    // when the matcher was not built to expose matches - confirm against the callers
+    if (captureLeadMatch == false) {
+      return;
+    }
+    leadOrd = pp.ord;
+    leadPosition = pp.position + pp.offset;
+    leadOffset = pp.postings.startOffset();
+    leadEndOffset = pp.postings.endOffset();
+  }
+
  @Override
  public int startPosition() {
    // when a match is detected, the top postings is advanced until it has moved
@ -160,6 +169,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
    // However, the priority queue doesn't guarantee that the top postings is in fact the
    // earliest in the list, so we need to cycle through all terms to check.
    // this is slow, but Matches is slow anyway...
+    int leadPosition = this.leadPosition;
    for (PhrasePositions pp : phrasePositions) {
      leadPosition = Math.min(leadPosition, pp.position + pp.offset);
    }
@ -168,7 +178,13 @@ final class SloppyPhraseMatcher extends PhraseMatcher {

  @Override
  public int endPosition() {
-    return phrasePositions[currentEndPostings].position + phrasePositions[currentEndPostings].offset;
+    int endPosition = leadPosition;
+    for (PhrasePositions pp : phrasePositions) {
+      if (pp.ord != leadOrd) {
+        endPosition = Math.max(endPosition, pp.position + pp.offset);
+      }
+    }
+    return endPosition;
  }

  @Override
@ -179,6 +195,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
    // However, the priority queue doesn't guarantee that the top postings is in fact the
    // earliest in the list, so we need to cycle through all terms to check
    // this is slow, but Matches is slow anyway...
+    int leadOffset = this.leadOffset;
    for (PhrasePositions pp : phrasePositions) {
      leadOffset = Math.min(leadOffset, pp.postings.startOffset());
    }
@ -187,7 +204,69 @@ final class SloppyPhraseMatcher extends PhraseMatcher {

  @Override
  public int endOffset() throws IOException {
-    return phrasePositions[currentEndPostings].postings.endOffset();
+    int endOffset = leadEndOffset;
+    for (PhrasePositions pp : phrasePositions) {
+      if (pp.ord != leadOrd) {
+        endOffset = Math.max(endOffset, pp.postings.endOffset());
+      }
+    }
+    return endOffset;
+  }
+
+  /**
+   * Takes a snapshot of every phrase position as a {@code [position, startOffset, endOffset]}
+   * row, sorts the rows by position, and returns an iterator over that snapshot.
+   */
+  @Override
+  MatchesIterator getSubMatches() throws IOException {
+    int[][] submatches = new int[phrasePositions.length][3];
+    for (PhrasePositions pp : phrasePositions) {
+      if (pp.ord == leadOrd) {
+        // the lead is reported from the captured lead* fields, not from pp's current state
+        // NOTE(review): presumably because the lead postings has already been advanced
+        // past the match by the time this is called - confirm
+        submatches[pp.ord][0] = leadPosition;
+        submatches[pp.ord][1] = leadOffset;
+        submatches[pp.ord][2] = leadEndOffset;
+      }
+      else {
+        submatches[pp.ord][0] = pp.position + pp.offset;
+        submatches[pp.ord][1] = pp.postings.startOffset();
+        submatches[pp.ord][2] = pp.postings.endOffset();
+      }
+    }
+    // order the rows by position (column 0)
+    Arrays.sort(submatches, Comparator.comparingInt(a -> a[0]));
+    return new MatchesIterator() {
+      // index of the current row; -1 until next() is first called
+      int upTo = -1;
+      @Override
+      public boolean next() throws IOException {
+        upTo++;
+        return upTo < submatches.length;
+      }
+
+      @Override
+      public int startPosition() {
+        return submatches[upTo][0];
+      }
+
+      @Override
+      public int endPosition() {
+        // start and end are the same value: each sub-match covers a single position
+        return submatches[upTo][0];
+      }
+
+      @Override
+      public int startOffset() {
+        return submatches[upTo][1];
+      }
+
+      @Override
+      public int endOffset() {
+        return submatches[upTo][2];
+      }
+
+      @Override
+      public MatchesIterator getSubMatches() {
+        // a sub-match has no further sub-matches
+        return MatchesIterator.EMPTY_ITERATOR;
+      }
+
+      @Override
+      public Object label() {
+        return this;
+      }
+    };
  }

  /** advance a PhrasePosition and update 'end', return false if exhausted */
@ -197,12 +276,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
    }
    if (pp.position > end) {
      end = pp.position;
-      advanceEndPostings = pp.ord;
-    }
-    if (pp.position == end) {
-      if (pp.ord > advanceEndPostings) {
-        advanceEndPostings = pp.ord;
-      }
    }
    return true;
  }
@ -307,12 +380,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
      pp.firstPosition();
      if (pp.position > end) {
        end = pp.position;
-        advanceEndPostings = pp.ord;
-      }
-      if (pp.position == end) {
-        if (pp.ord > advanceEndPostings) {
-          advanceEndPostings = pp.ord;
-        }
      }
      pq.add(pp);
    }
@ -342,12 +409,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
    for (PhrasePositions pp : phrasePositions) {  // iterate cyclic list: done once handled max
      if (pp.position > end) {
        end = pp.position;
-        advanceEndPostings = pp.ord;
-      }
-      if (pp.position == end) {
-        if (pp.ord > advanceEndPostings) {
-          advanceEndPostings = pp.ord;
-        }
      }
      pq.add(pp);
    }
--- a/lucene/core/src/java/org/apache/lucene/search/TermMatchesIterator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermMatchesIterator.java
@ -67,4 +67,54 @@ class TermMatchesIterator implements MatchesIterator {
    return pe.endOffset();
  }

+  // The postings enum backing this iterator is used as the label.
+  @Override
+  public Object label() {
+    return pe;
+  }
+
+  /**
+   * Returns an iterator that yields exactly one sub-match, reporting the enclosing
+   * iterator's {@code pos} and the offsets of its postings enum {@code pe}.
+   */
+  @Override
+  public MatchesIterator getSubMatches() throws IOException {
+    return new MatchesIterator() {
+
+      // set once the single sub-match has been handed out
+      boolean exhausted = false;
+
+      @Override
+      public boolean next() {
+        if (exhausted) {
+          return false;
+        }
+        // first call: mark as exhausted and report the one sub-match
+        return exhausted = true;
+      }
+
+      @Override
+      public int startPosition() {
+        return pos;
+      }
+
+      @Override
+      public int endPosition() {
+        return pos;
+      }
+
+      @Override
+      public int startOffset() throws IOException {
+        return pe.startOffset();
+      }
+
+      @Override
+      public int endOffset() throws IOException {
+        return pe.endOffset();
+      }
+
+      @Override
+      public MatchesIterator getSubMatches() {
+        // a sub-match has no further sub-matches
+        return MatchesIterator.EMPTY_ITERATOR;
+      }
+
+      @Override
+      public Object label() {
+        return this;
+      }
+    };
+  }
 }
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java
@ -18,6 +18,8 @@ package org.apache.lucene.search.spans;


 import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
 import java.util.Map;

 import org.apache.lucene.index.LeafReaderContext;
@ -28,6 +30,8 @@ import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.LeafSimScorer;
+import org.apache.lucene.search.Matches;
+import org.apache.lucene.search.MatchesIterator;
 import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.search.similarities.Similarity;
@ -161,4 +165,131 @@ public abstract class SpanWeight extends Weight {

    return Explanation.noMatch("no matching term");
  }
+
+  @Override
+  public Matches matches(LeafReaderContext context, int doc) throws IOException {
+    return Matches.forField(field, () -> {
+      Spans spans = getSpans(context, Postings.OFFSETS);
+      if (spans == null) {
+        return null;
+      }
+      if (spans.advance(doc) != doc) {
+        return null;
+      }
+      return new MatchesIterator() {
+
+        int innerTermCount = 0;
+        int[][] innerTerms = new int[2][3];
+        SpanCollector termCollector = new SpanCollector() {
+          /**
+           * Records the position and offsets of one leaf term in the next free row of
+           * {@code innerTerms}, growing the array by one row when it is full.
+           */
+          @Override
+          public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
+            innerTermCount++;
+            if (innerTermCount > innerTerms.length) {
+              // allocate one extra row, then keep the existing rows by copying their references
+              int[][] temp = new int[innerTermCount][3];
+              System.arraycopy(innerTerms, 0, temp, 0, innerTermCount - 1);
+              innerTerms = temp;
+            }
+            // row layout: [0] = position, [1] = start offset, [2] = end offset
+            innerTerms[innerTermCount - 1][0] = position;
+            innerTerms[innerTermCount - 1][1] = postings.startOffset();
+            innerTerms[innerTermCount - 1][2] = postings.endOffset();
+          }
+
+          @Override
+          public void reset() {
+            innerTermCount = 0;
+          }
+        };
+
+        @Override
+        public boolean next() throws IOException {
+          innerTermCount = 0;
+          return spans.nextStartPosition() != Spans.NO_MORE_POSITIONS;
+        }
+
+        @Override
+        public int startPosition() {
+          return spans.startPosition();
+        }
+
+        @Override
+        public int endPosition() {
+          return spans.endPosition() - 1;
+        }
+
+        @Override
+        public int startOffset() throws IOException {
+          if (innerTermCount == 0) {
+            collectInnerTerms();
+          }
+          return innerTerms[0][1];
+        }
+
+        @Override
+        public int endOffset() throws IOException {
+          // inner terms are collected on demand; next() resets the count for each new span
+          if (innerTermCount == 0) {
+            collectInnerTerms();
+          }
+          // end offset of the last collected row (rows are sorted by position on collection)
+          // NOTE(review): indexes row -1 if collection yields no leaf terms - confirm that
+          // cannot happen for a positioned span
+          return innerTerms[innerTermCount - 1][2];
+        }
+
+        /**
+         * Returns an iterator over the leaf terms collected for the current span, one
+         * sub-match per collected row of {@code innerTerms}.
+         */
+        @Override
+        public MatchesIterator getSubMatches() throws IOException {
+          // inner terms are collected on demand; next() resets the count for each new span
+          if (innerTermCount == 0) {
+            collectInnerTerms();
+          }
+          return new MatchesIterator() {
+
+            // index of the current row; -1 until next() is first called
+            int upto = -1;
+
+            @Override
+            public boolean next() throws IOException {
+              upto++;
+              return upto < innerTermCount;
+            }
+
+            @Override
+            public int startPosition() {
+              return innerTerms[upto][0];
+            }
+
+            @Override
+            public int endPosition() {
+              // start and end are the same value: each sub-match covers a single position
+              return innerTerms[upto][0];
+            }
+
+            @Override
+            public int startOffset() throws IOException {
+              return innerTerms[upto][1];
+            }
+
+            @Override
+            public int endOffset() throws IOException {
+              return innerTerms[upto][2];
+            }
+
+            @Override
+            public MatchesIterator getSubMatches() throws IOException {
+              // a sub-match has no further sub-matches
+              return MatchesIterator.EMPTY_ITERATOR;
+            }
+
+            @Override
+            public Object label() {
+              return this;
+            }
+          };
+        }
+
+        @Override
+        public Object label() {
+          return SpanWeight.this;
+        }
+
+        /**
+         * Gathers the leaf terms of the current span into {@code innerTerms} and orders the
+         * collected rows by position.
+         */
+        void collectInnerTerms() throws IOException {
+          termCollector.reset();
+          spans.collect(termCollector);
+          // comparingInt compares the primitive positions directly, without boxing each key
+          Arrays.sort(innerTerms, 0, innerTermCount, Comparator.comparingInt(a -> a[0]));
+        }
+      };
+    });
+  }
 }
--- a/lucene/core/src/test/org/apache/lucene/search/TestMatchesIterator.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestMatchesIterator.java
@ -18,8 +18,12 @@
 package org.apache.lucene.search;

 import java.io.IOException;
+import java.util.Arrays;
 import java.util.HashSet;
+import java.util.IdentityHashMap;
+import java.util.Objects;
 import java.util.Set;
+import java.util.stream.Collectors;

 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
@ -29,9 +33,14 @@ import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;

@ -99,7 +108,7 @@ public class TestMatchesIterator extends LuceneTestCase {
      "nothing matches this document"
  };

-  void checkMatches(Query q, String field, int[][] expected) throws IOException {
+  private void checkMatches(Query q, String field, int[][] expected) throws IOException {
    Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
    for (int i = 0; i < expected.length; i++) {
      LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(expected[i][0], searcher.leafContexts));
@ -112,14 +121,40 @@ public class TestMatchesIterator extends LuceneTestCase {
      MatchesIterator it = matches.getMatches(field);
      if (expected[i].length == 1) {
        assertNull(it);
-        return;
+        continue;
      }
      checkFieldMatches(it, expected[i]);
      checkFieldMatches(matches.getMatches(field), expected[i]);  // test multiple calls
    }
  }

-  void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException {
+  /**
+   * For each document index, counts the distinct (by identity) labels reported by the
+   * matches on {@code field} and compares that count with {@code expected}.  An expected
+   * count of 0 means the document must produce no matches for the field.
+   */
+  private void checkLabelCount(Query q, String field, int[] expected) throws IOException {
+    final Weight weight = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
+    for (int docId = 0; docId < expected.length; docId++) {
+      final LeafReaderContext leaf = searcher.leafContexts.get(ReaderUtil.subIndex(docId, searcher.leafContexts));
+      final Matches docMatches = weight.matches(leaf, docId - leaf.docBase);
+      if (docMatches == null) {
+        assertEquals("Expected to get matches on document " + docId, 0, expected[docId]);
+        continue;
+      }
+      final MatchesIterator fieldMatches = docMatches.getMatches(field);
+      if (expected[docId] == 0) {
+        assertNull(fieldMatches);
+        continue;
+      }
+      assertNotNull(fieldMatches);
+      // only the key set matters: the identity map is used as an identity-based set
+      final IdentityHashMap<Object, Integer> seenLabels = new IdentityHashMap<>();
+      while (fieldMatches.next()) {
+        seenLabels.put(fieldMatches.label(), 1);
+      }
+      assertEquals(expected[docId], seenLabels.size());
+    }
+  }
+
+  private void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException {
    int pos = 1;
    while (it.next()) {
      //System.out.println(expected[i][pos] + "->" + expected[i][pos + 1] + "[" + expected[i][pos + 2] + "->" + expected[i][pos + 3] + "]");
@ -132,7 +167,7 @@ public class TestMatchesIterator extends LuceneTestCase {
    assertEquals(expected.length, pos);
  }

-  void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException {
+  private void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException {
    Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
    for (int i = 0; i < expected.length; i++) {
      LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
@ -148,8 +183,90 @@ public class TestMatchesIterator extends LuceneTestCase {
    }
  }

+  /**
+   * For each document index {@code i}, checks the sub-matches reported on {@code field}
+   * against {@code expected[i]}.  An empty {@code expected[i]} means the document must
+   * produce no matches for the field.
+   */
+  private void checkTermMatches(Query q, String field, TermMatch[][][] expected) throws IOException {
+    Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
+    for (int i = 0; i < expected.length; i++) {
+      LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
+      int doc = i - ctx.docBase;
+      Matches matches = w.matches(ctx, doc);
+      if (matches == null) {
+        // no matches at all: that is only acceptable when none were expected
+        assertEquals("Expected to get matches on document " + i, 0, expected[i].length);
+        continue;
+      }
+      MatchesIterator it = matches.getMatches(field);
+      if (expected[i].length == 0) {
+        assertNull(it);
+        continue;
+      }
+      checkTerms(expected[i], it);
+    }
+  }
+
+  /**
+   * Walks {@code it} and checks that the sub-matches of the n-th match are exactly the
+   * term matches listed in {@code expected[n]}, in any order.  Fails on an unexpected or
+   * missing term match, and when the number of matches differs from {@code expected.length}.
+   */
+  private void checkTerms(TermMatch[][] expected, MatchesIterator it) throws IOException {
+    int upTo = 0;
+    while (it.next()) {
+      if (upTo >= expected.length) {
+        // report an extra match explicitly instead of indexing past the end of expected
+        fail("Unexpected extra match after " + expected.length + " expected matches");
+      }
+      Set<TermMatch> expectedMatches = new HashSet<>(Arrays.asList(expected[upTo]));
+      MatchesIterator submatches = it.getSubMatches();
+      while (submatches.next()) {
+        TermMatch tm = new TermMatch(submatches.startPosition(), submatches.startOffset(), submatches.endOffset());
+        if (expectedMatches.remove(tm) == false) {
+          fail("Unexpected term match: " + tm);
+        }
+      }
+      if (expectedMatches.size() != 0) {
+        fail("Missing term matches: " + expectedMatches.stream().map(Object::toString).collect(Collectors.joining(", ")));
+      }
+      upTo++;
+    }
+    // upTo is now the number of matches consumed, so it must reach expected.length
+    if (upTo < expected.length) {
+      fail("Missing expected match");
+    }
+  }
+
+  /**
+   * Value holder for one expected or observed term match: its position plus its start and
+   * end offsets.  Two instances are equal when all three values are equal.
+   */
+  static class TermMatch {
+
+    public final int position;
+
+    public final int startOffset;
+
+    public final int endOffset;
+
+    /** Builds a match at {@code position} using the current offsets of {@code pe}. */
+    public TermMatch(PostingsEnum pe, int position) throws IOException {
+      this(position, pe.startOffset(), pe.endOffset());
+    }
+
+    /** Builds a match from explicit values. */
+    public TermMatch(int position, int startOffset, int endOffset) {
+      this.position = position;
+      this.startOffset = startOffset;
+      this.endOffset = endOffset;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (o == this) {
+        return true;
+      }
+      if (o == null || o.getClass() != getClass()) {
+        return false;
+      }
+      final TermMatch other = (TermMatch) o;
+      if (position != other.position) {
+        return false;
+      }
+      if (startOffset != other.startOffset) {
+        return false;
+      }
+      return endOffset == other.endOffset;
+    }
+
+    @Override
+    public int hashCode() {
+      return Objects.hash(position, startOffset, endOffset);
+    }
+
+    /** Renders as {@code position[startOffset->endOffset]}. */
+    @Override
+    public String toString() {
+      final StringBuilder text = new StringBuilder();
+      text.append(position).append("[").append(startOffset);
+      text.append("->").append(endOffset).append("]");
+      return text.toString();
+    }
+  }
+
  public void testTermQuery() throws IOException {
-    Query q = new TermQuery(new Term(FIELD_WITH_OFFSETS, "w1"));
+    Term t = new Term(FIELD_WITH_OFFSETS, "w1");
+    Query q = new TermQuery(t);
    checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{
        { 0, 0, 0, 0, 2 },
        { 1, 0, 0, 0, 2 },
@ -157,6 +274,14 @@ public class TestMatchesIterator extends LuceneTestCase {
        { 3, 0, 0, 0, 2, 2, 2, 6, 8 },
        { 4 }
    });
+    checkTermMatches(q, FIELD_WITH_OFFSETS, new TermMatch[][][]{
+        { { new TermMatch(0, 0, 2) } },
+        { { new TermMatch(0, 0, 2) } },
+        { { new TermMatch(0, 0, 2) } },
+        { { new TermMatch(0, 0, 2) }, { new TermMatch(2, 6, 8) } },
+        {}
+    });
+    checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 1, 1, 1, 1, 0, 0 });
  }

  public void testTermQueryNoStoredOffsets() throws IOException {
@ -191,6 +316,7 @@ public class TestMatchesIterator extends LuceneTestCase {
        { 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 },
        { 4 }
    });
+    checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 1, 2, 0, 0 });
  }

  public void testDisjunctionNoPositions() throws IOException {
@ -215,6 +341,7 @@ public class TestMatchesIterator extends LuceneTestCase {
        { 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 },
        { 4 }
    });
+    checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 0, 2, 0, 0 });
  }

  public void testReqOptNoPositions() throws IOException {
@ -248,6 +375,7 @@ public class TestMatchesIterator extends LuceneTestCase {
        { 3, 0, 0, 0, 2, 2, 2, 6, 8, 3, 3, 9, 11, 5, 5, 15, 17 },
        { 4 }
    });
+    checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 3, 1, 3, 3, 0, 0 });
  }

  public void testMinShouldMatchNoPositions() throws IOException {
@ -331,6 +459,7 @@ public class TestMatchesIterator extends LuceneTestCase {
        { 3, 0, 0, 0, 2, 1, 1, 3, 5, 2, 2, 6, 8, 4, 4, 12, 14 },
        { 4 }
    });
+    checkLabelCount(rq, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 2, 2, 0 });

  }

@ -392,12 +521,55 @@ public class TestMatchesIterator extends LuceneTestCase {
  //  0         1         2         3         4         5         6         7
  // "a phrase sentence with many phrase sentence iterations of a phrase sentence",

+  /**
+   * Sloppy phrase query in which the term "sentence" appears twice in the phrase: checks
+   * the match intervals, that all matches share a single label, and the per-term
+   * sub-matches of each match.
+   */
+  public void testSloppyPhraseQueryWithRepeats() throws IOException {
+    PhraseQuery pq = new PhraseQuery(10, FIELD_WITH_OFFSETS, "phrase", "sentence", "sentence");
+    checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{
+        { 0 }, { 1 }, { 2 }, { 3 },
+        { 4, 1, 6, 2, 43, 2, 11, 9, 75, 5, 11, 28, 75, 6, 11, 35, 75 }
+    });
+    checkLabelCount(pq, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 });
+    checkTermMatches(pq, FIELD_WITH_OFFSETS, new TermMatch[][][]{
+        {}, {}, {}, {},
+        { {
+            new TermMatch(1, 2, 8),
+            new TermMatch(2, 9, 17),
+            new TermMatch(6, 35, 43)
+          }, {
+            new TermMatch(5, 28, 34),
+            new TermMatch(2, 9, 17),
+            new TermMatch(11, 67, 75)
+        }, {
+            new TermMatch(5, 28, 34),
+            new TermMatch(6, 35, 43),
+            new TermMatch(11, 67, 75)
+        }, {
+            new TermMatch(10, 60, 66),
+            new TermMatch(6, 35, 43),
+            new TermMatch(11, 67, 75)
+        } }
+    });
+  }
+
  public void testSloppyPhraseQuery() throws IOException {
+    Term a = new Term(FIELD_WITH_OFFSETS, "a");
+    Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
    PhraseQuery pq = new PhraseQuery(4, FIELD_WITH_OFFSETS, "a", "sentence");
    checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{
        { 0 }, { 1 }, { 2 }, { 3 },
        { 4, 0, 2, 0, 17, 6, 9, 35, 59, 9, 11, 58, 75 }
    });
+    checkTermMatches(pq, FIELD_WITH_OFFSETS, new TermMatch[][][]{
+        {}, {}, {}, {},
+        { {
+          new TermMatch(0, 0, 1), new TermMatch(2, 9, 17)
+        }, {
+          new TermMatch(9, 58, 59), new TermMatch(6, 35, 43)
+        }, {
+          new TermMatch(9, 58, 59), new TermMatch(11, 67, 75)
+        } }
+    });
  }

  public void testExactPhraseQuery() throws IOException {
@ -407,28 +579,57 @@ public class TestMatchesIterator extends LuceneTestCase {
        { 4, 1, 2, 2, 17, 5, 6, 28, 43, 10, 11, 60, 75 }
    });

+    Term a = new Term(FIELD_WITH_OFFSETS, "a");
+    Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
    PhraseQuery pq2 = new PhraseQuery.Builder()
-        .add(new Term(FIELD_WITH_OFFSETS, "a"))
-        .add(new Term(FIELD_WITH_OFFSETS, "sentence"), 2)
+        .add(a)
+        .add(s, 2)
        .build();
    checkMatches(pq2, FIELD_WITH_OFFSETS, new int[][]{
        { 0 }, { 1 }, { 2 }, { 3 },
        { 4, 0, 2, 0, 17, 9, 11, 58, 75 }
    });
+    checkTermMatches(pq2, FIELD_WITH_OFFSETS, new TermMatch[][][]{
+        {}, {}, {}, {},
+        { {
+          new TermMatch(0, 0, 1), new TermMatch(2, 9, 17)
+        }, {
+          new TermMatch(9, 58, 59), new TermMatch(11, 67, 75)
+        } }
+    });
  }

  //  0         1         2         3         4         5         6         7
  // "a phrase sentence with many phrase sentence iterations of a phrase sentence",

  public void testSloppyMultiPhraseQuery() throws IOException {  // multi-phrase query with slop 4: match intervals plus per-match term matches
+    Term p = new Term(FIELD_WITH_OFFSETS, "phrase");
+    Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
+    Term i = new Term(FIELD_WITH_OFFSETS, "iterations");
    MultiPhraseQuery mpq = new MultiPhraseQuery.Builder()
-        .add(new Term(FIELD_WITH_OFFSETS, "phrase"))
-        .add(new Term[]{ new Term(FIELD_WITH_OFFSETS, "sentence"), new Term(FIELD_WITH_OFFSETS, "iterations") })
+        .add(p)
+        .add(new Term[]{ s, i })  // "sentence" and "iterations" are added together as one Term[] clause
        .setSlop(4)
        .build();
    checkMatches(mpq, FIELD_WITH_OFFSETS, new int[][]{
        { 0 }, { 1 }, { 2 }, { 3 },
-        { 4, 1, 2, 2, 17, 5, 7, 28, 54, 5, 7, 28, 54, 10, 11, 60, 75 }
+        { 4, 1, 2, 2, 17, 5, 6, 28, 43, 5, 7, 28, 54, 10, 11, 60, 75 }  // presumably (startPos, endPos, startOffset, endOffset) groups after the leading 4 -- confirm against checkMatches
+    });
+    checkTermMatches(mpq, FIELD_WITH_OFFSETS, new TermMatch[][][]{  // outer: document, middle: match, inner: term matches of that match
+        {}, {}, {}, {},
+        { {
+            new TermMatch(1, 2, 8),
+            new TermMatch(2, 9, 17)
+          }, {
+            new TermMatch(5, 28, 34),
+            new TermMatch(6, 35, 43)  // offsets 35-43 cover "sentence" in the sample text
+          }, {
+            new TermMatch(5, 28, 34),
+            new TermMatch(7, 44, 54)  // offsets 44-54 cover "iterations" in the sample text
+          }, {
+            new TermMatch(10, 60, 66),
+            new TermMatch(11, 67, 75)
+        } }
    });
  }

@ -452,4 +653,35 @@ public class TestMatchesIterator extends LuceneTestCase {
    });
  }

+  //  0         1         2         3         4         5         6         7
+  // "a phrase sentence with many phrase sentence iterations of a phrase sentence",
+
+  public void testSpanQuery() throws IOException {
+    SpanQuery subq = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS)
+        .addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "with")))
+        .addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "many")))
+        .build();
+    Query q = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS)
+        .addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "sentence")))
+        .addClause(new SpanOrQuery(subq, new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "iterations"))))
+        .build();
+    checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{
+        { 0 }, { 1 }, { 2 }, { 3 },
+        { 4, 2, 4, 9, 27, 6, 7, 35, 54 }
+    });
+    checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 });
+    checkTermMatches(q, FIELD_WITH_OFFSETS, new TermMatch[][][]{
+        {}, {}, {}, {},
+        {
+            {
+                new TermMatch(2, 9, 17),
+                new TermMatch(3, 18, 22),
+                new TermMatch(4, 23, 27)
+            }, {
+              new TermMatch(6, 35, 43), new TermMatch(7, 44, 54)
+        }
+        }
+    });
+  }
+
 }
--- a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingMatchesIterator.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingMatchesIterator.java
@ -67,4 +67,15 @@ class AssertingMatchesIterator implements MatchesIterator {
    return in.endOffset();
  }

+  // Delegates to the wrapped iterator after asserting that this wrapper is in the ITERATING state.
+  @Override
+  public MatchesIterator getSubMatches() throws IOException {
+    assert State.ITERATING == state : state;
+    final MatchesIterator subMatches = in.getSubMatches();
+    return subMatches;
+  }
+
+  // Delegates to the wrapped iterator after asserting that this wrapper is in the ITERATING state.
+  @Override
+  public Object label() {
+    assert State.ITERATING == state : state;
+    final Object delegateLabel = in.label();
+    return delegateLabel;
+  }
 }