diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8257dbc801f..47769cd8145 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -198,6 +198,9 @@ Improvements * LUCENE-8345, GitHub PR #392: Remove instantiation of redundant wrapper classes for primitives; add wrapper class constructors to forbiddenapis. (Michael Braun via Uwe Schindler) +* LUCENE-8306: Matches API now allows iteration over sub-matches in Spans (Alan Woodward, + Jim Ferenczi, David Smiley) + Other: * LUCENE-8366: Upgrade to ICU 62.1. Emoji handling now uses Unicode 11's diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java index 975199bd141..cff723b9ab9 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java @@ -158,4 +158,13 @@ final class DisjunctionMatchesIterator implements MatchesIterator { return queue.top().endOffset(); } + @Override + public MatchesIterator getSubMatches() throws IOException { + return queue.top().getSubMatches(); + } + + @Override + public Object label() { + return queue.top().label(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseMatcher.java b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseMatcher.java index b95077d0971..6fb49b16d13 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseMatcher.java +++ b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseMatcher.java @@ -149,4 +149,48 @@ final class ExactPhraseMatcher extends PhraseMatcher { return postings[postings.length - 1].postings.endOffset(); } + @Override + MatchesIterator getSubMatches() { + return new MatchesIterator() { + + int upTo = -1; + + @Override + public boolean next() throws IOException { + upTo++; + return upTo < postings.length; + } + + @Override + public int startPosition() { + return 
postings[upTo].pos; + } + + @Override + public int endPosition() { + return postings[upTo].pos; + } + + @Override + public int startOffset() throws IOException { + return postings[upTo].postings.startOffset(); + } + + @Override + public int endOffset() throws IOException { + return postings[upTo].postings.endOffset(); + } + + @Override + public MatchesIterator getSubMatches() throws IOException { + return MatchesIterator.EMPTY_ITERATOR; + } + + @Override + public Object label() { + return this; + } + }; + } + } diff --git a/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java b/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java index 450a352720e..5814d7cb88b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java @@ -28,6 +28,9 @@ import org.apache.lucene.index.LeafReaderContext; * positions and/or offsets after each call. You should not call the position or offset methods * before {@link #next()} has been called, or after {@link #next()} has returned {@code false}. * + * Matches from some queries may span multiple positions. You can retrieve the positions of + * individual matching terms on the current match by calling {@link #getSubMatches()}. + * * Matches are ordered by start position, and then by end position. Match intervals may overlap. 
* * @see Weight#matches(LeafReaderContext, int) @@ -70,4 +73,59 @@ public interface MatchesIterator { */ int endOffset() throws IOException; + /** + * Returns a MatchesIterator that iterates over the positions and offsets of individual + * terms within the current match. + * + * Should only be called after {@link #next()} has returned {@code true}. + */ + MatchesIterator getSubMatches() throws IOException; + + /** + * Returns a label identifying the leaf query causing the current match. + * + * Should only be called after {@link #next()} has returned {@code true}. + */ + Object label(); + + /** + * A MatchesIterator that is immediately exhausted: {@link #next()} always returns {@code false}, and the position, offset and sub-match accessors throw {@link UnsupportedOperationException}. + */ + MatchesIterator EMPTY_ITERATOR = new MatchesIterator() { + @Override + public boolean next() throws IOException { + return false; + } + + @Override + public int startPosition() { + throw new UnsupportedOperationException(); + } + + @Override + public int endPosition() { + throw new UnsupportedOperationException(); + } + + @Override + public int startOffset() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int endOffset() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public MatchesIterator getSubMatches() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public Object label() { + return this; + } + }; + } diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java index 22b71279012..c8d22baec91 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java @@ -269,7 +269,7 @@ public class MultiPhraseQuery extends Query { TermState termState = termStates.get(term).get(context); if (termState != null) { termsEnum.seekExact(term.bytes(), termState); - postings.add(termsEnum.postings(null, exposeOffsets ? 
PostingsEnum.OFFSETS : PostingsEnum.POSITIONS)); + postings.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS)); totalMatchCost += PhraseQuery.termPositionsCost(termsEnum); } } @@ -294,7 +294,7 @@ public class MultiPhraseQuery extends Query { return new ExactPhraseMatcher(postingsFreqs, totalMatchCost); } else { - return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost); + return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets); } } @@ -647,5 +647,6 @@ public class MultiPhraseQuery extends Query { public BytesRef getPayload() throws IOException { return posQueue.top().pe.getPayload(); } + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseMatcher.java b/lucene/core/src/java/org/apache/lucene/search/PhraseMatcher.java index 81040d517c4..c4b1e36684e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseMatcher.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseMatcher.java @@ -88,4 +88,6 @@ abstract class PhraseMatcher { public float getMatchCost() { return matchCost; } + + abstract MatchesIterator getSubMatches() throws IOException; } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index 70d2e09ef3b..8f042716a65 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -446,7 +446,7 @@ public class PhraseQuery extends Query { return null; } te.seekExact(t.bytes(), state); - PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS); + PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? 
PostingsEnum.ALL : PostingsEnum.POSITIONS); postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions[i], t); totalMatchCost += termPositionsCost(te); } @@ -457,7 +457,7 @@ public class PhraseQuery extends Query { return new ExactPhraseMatcher(postingsFreqs, totalMatchCost); } else { - return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost); + return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java b/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java index 90fa537c366..2547b8c8c54 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java @@ -123,6 +123,16 @@ abstract class PhraseWeight extends Weight { public int endOffset() throws IOException { return matcher.endOffset(); } + + @Override + public MatchesIterator getSubMatches() throws IOException { + return matcher.getSubMatches(); + } + + @Override + public Object label() { + return matcher; + } }; }); } diff --git a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseMatcher.java b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseMatcher.java index 326816d1428..e9c45195713 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseMatcher.java +++ b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseMatcher.java @@ -54,13 +54,14 @@ final class SloppyPhraseMatcher extends PhraseMatcher { private final int slop; private final int numPostings; private final PhraseQueue pq; // for advancing min position + private final boolean captureLeadMatch; private int end; // current largest phrase position private int leadPosition; private int leadOffset; - private int currentEndPostings; - private int advanceEndPostings; + private int leadEndOffset; + private int leadOrd; private boolean hasRpts; // flag indicating that there are repetitions (as checked in first 
candidate doc) private boolean checkedRpts; // flag to only check for repetitions in first candidate doc @@ -71,10 +72,11 @@ final class SloppyPhraseMatcher extends PhraseMatcher { private boolean positioned; private int matchLength; - SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost) { + SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost, boolean captureLeadMatch) { super(approximation(postings), matchCost); this.slop = slop; this.numPostings = postings.length; + this.captureLeadMatch = captureLeadMatch; pq = new PhraseQueue(postings.length); phrasePositions = new PhrasePositions[postings.length]; for (int i = 0; i < postings.length; ++i) { @@ -120,10 +122,8 @@ final class SloppyPhraseMatcher extends PhraseMatcher { return false; } PhrasePositions pp = pq.pop(); - assert pp != null; // if the pq is empty, then positioned == false - leadPosition = pp.position + pp.offset; - leadOffset = pp.postings.startOffset(); - currentEndPostings = advanceEndPostings; + assert pp != null; // if the pq is not full, then positioned == false + captureLead(pp); matchLength = end - pp.position; int next = pq.top().position; while (advancePP(pp)) { @@ -137,6 +137,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher { } pp = pq.pop(); next = pq.top().position; + assert pp != null; // if the pq is not full, then positioned == false matchLength = end - pp.position; } else { int matchLength2 = end - pp.position; @@ -144,14 +145,22 @@ final class SloppyPhraseMatcher extends PhraseMatcher { matchLength = matchLength2; } } - leadPosition = pp.position + pp.offset; - leadOffset = pp.postings.startOffset(); - currentEndPostings = advanceEndPostings; + captureLead(pp); } positioned = false; return matchLength <= slop; } + private void captureLead(PhrasePositions pp) throws IOException { + if (captureLeadMatch == false) { + return; + } + leadOrd = pp.ord; + leadPosition = pp.position + pp.offset; + leadOffset = 
pp.postings.startOffset(); + leadEndOffset = pp.postings.endOffset(); + } + @Override public int startPosition() { // when a match is detected, the top postings is advanced until it has moved @@ -160,6 +169,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher { // However, the priority queue doesn't guarantee that the top postings is in fact the // earliest in the list, so we need to cycle through all terms to check. // this is slow, but Matches is slow anyway... + int leadPosition = this.leadPosition; for (PhrasePositions pp : phrasePositions) { leadPosition = Math.min(leadPosition, pp.position + pp.offset); } @@ -168,7 +178,13 @@ final class SloppyPhraseMatcher extends PhraseMatcher { @Override public int endPosition() { - return phrasePositions[currentEndPostings].position + phrasePositions[currentEndPostings].offset; + int endPosition = leadPosition; + for (PhrasePositions pp : phrasePositions) { + if (pp.ord != leadOrd) { + endPosition = Math.max(endPosition, pp.position + pp.offset); + } + } + return endPosition; } @Override @@ -179,6 +195,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher { // However, the priority queue doesn't guarantee that the top postings is in fact the // earliest in the list, so we need to cycle through all terms to check // this is slow, but Matches is slow anyway... 
+ int leadOffset = this.leadOffset; for (PhrasePositions pp : phrasePositions) { leadOffset = Math.min(leadOffset, pp.postings.startOffset()); } @@ -187,7 +204,69 @@ final class SloppyPhraseMatcher extends PhraseMatcher { @Override public int endOffset() throws IOException { - return phrasePositions[currentEndPostings].postings.endOffset(); + int endOffset = leadEndOffset; + for (PhrasePositions pp : phrasePositions) { + if (pp.ord != leadOrd) { + endOffset = Math.max(endOffset, pp.postings.endOffset()); + } + } + return endOffset; + } + + @Override + MatchesIterator getSubMatches() throws IOException { + int[][] submatches = new int[phrasePositions.length][3]; + for (PhrasePositions pp : phrasePositions) { + if (pp.ord == leadOrd) { + submatches[pp.ord][0] = leadPosition; + submatches[pp.ord][1] = leadOffset; + submatches[pp.ord][2] = leadEndOffset; + } + else { + submatches[pp.ord][0] = pp.position + pp.offset; + submatches[pp.ord][1] = pp.postings.startOffset(); + submatches[pp.ord][2] = pp.postings.endOffset(); + } + } + Arrays.sort(submatches, Comparator.comparingInt(a -> a[0])); + return new MatchesIterator() { + int upTo = -1; + @Override + public boolean next() throws IOException { + upTo++; + return upTo < submatches.length; + } + + @Override + public int startPosition() { + return submatches[upTo][0]; + } + + @Override + public int endPosition() { + return submatches[upTo][0]; + } + + @Override + public int startOffset() { + return submatches[upTo][1]; + } + + @Override + public int endOffset() { + return submatches[upTo][2]; + } + + @Override + public MatchesIterator getSubMatches() { + return MatchesIterator.EMPTY_ITERATOR; + } + + @Override + public Object label() { + return this; + } + }; } /** advance a PhrasePosition and update 'end', return false if exhausted */ @@ -197,12 +276,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher { } if (pp.position > end) { end = pp.position; - advanceEndPostings = pp.ord; - } - if (pp.position == end) { - 
if (pp.ord > advanceEndPostings) { - advanceEndPostings = pp.ord; - } } return true; } @@ -307,12 +380,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher { pp.firstPosition(); if (pp.position > end) { end = pp.position; - advanceEndPostings = pp.ord; - } - if (pp.position == end) { - if (pp.ord > advanceEndPostings) { - advanceEndPostings = pp.ord; - } } pq.add(pp); } @@ -342,12 +409,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher { for (PhrasePositions pp : phrasePositions) { // iterate cyclic list: done once handled max if (pp.position > end) { end = pp.position; - advanceEndPostings = pp.ord; - } - if (pp.position == end) { - if (pp.ord > advanceEndPostings) { - advanceEndPostings = pp.ord; - } } pq.add(pp); } diff --git a/lucene/core/src/java/org/apache/lucene/search/TermMatchesIterator.java b/lucene/core/src/java/org/apache/lucene/search/TermMatchesIterator.java index defc3af5751..23858c6b5e8 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermMatchesIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermMatchesIterator.java @@ -67,4 +67,54 @@ class TermMatchesIterator implements MatchesIterator { return pe.endOffset(); } + @Override + public Object label() { + return pe; + } + + @Override + public MatchesIterator getSubMatches() throws IOException { + return new MatchesIterator() { + + boolean exhausted = false; + + @Override + public boolean next() { + if (exhausted) { + return false; + } + return exhausted = true; + } + + @Override + public int startPosition() { + return pos; + } + + @Override + public int endPosition() { + return pos; + } + + @Override + public int startOffset() throws IOException { + return pe.startOffset(); + } + + @Override + public int endOffset() throws IOException { + return pe.endOffset(); + } + + @Override + public MatchesIterator getSubMatches() { + return MatchesIterator.EMPTY_ITERATOR; + } + + @Override + public Object label() { + return this; + } + }; + } } diff --git 
a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java index 0313d56510c..ca789ce4c80 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java @@ -18,6 +18,8 @@ package org.apache.lucene.search.spans; import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; import java.util.Map; import org.apache.lucene.index.LeafReaderContext; @@ -28,6 +30,8 @@ import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.LeafSimScorer; +import org.apache.lucene.search.Matches; +import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.Weight; import org.apache.lucene.search.similarities.Similarity; @@ -161,4 +165,131 @@ public abstract class SpanWeight extends Weight { return Explanation.noMatch("no matching term"); } + + @Override + public Matches matches(LeafReaderContext context, int doc) throws IOException { + return Matches.forField(field, () -> { + Spans spans = getSpans(context, Postings.OFFSETS); + if (spans == null) { + return null; + } + if (spans.advance(doc) != doc) { + return null; + } + return new MatchesIterator() { + + int innerTermCount = 0; + int[][] innerTerms = new int[2][3]; + SpanCollector termCollector = new SpanCollector() { + @Override + public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException { + innerTermCount++; + if (innerTermCount > innerTerms.length) { + int[][] temp = new int[innerTermCount][3]; + System.arraycopy(innerTerms, 0, temp, 0, innerTermCount - 1); + innerTerms = temp; + } + innerTerms[innerTermCount - 1][0] = position; + innerTerms[innerTermCount - 1][1] = postings.startOffset(); + innerTerms[innerTermCount - 1][2] 
= postings.endOffset(); + } + + @Override + public void reset() { + innerTermCount = 0; + } + }; + + @Override + public boolean next() throws IOException { + innerTermCount = 0; + return spans.nextStartPosition() != Spans.NO_MORE_POSITIONS; + } + + @Override + public int startPosition() { + return spans.startPosition(); + } + + @Override + public int endPosition() { + return spans.endPosition() - 1; + } + + @Override + public int startOffset() throws IOException { + if (innerTermCount == 0) { + collectInnerTerms(); + } + return innerTerms[0][1]; + } + + @Override + public int endOffset() throws IOException { + if (innerTermCount == 0) { + collectInnerTerms(); + } + return innerTerms[innerTermCount - 1][2]; + } + + @Override + public MatchesIterator getSubMatches() throws IOException { + if (innerTermCount == 0) { + collectInnerTerms(); + } + return new MatchesIterator() { + + int upto = -1; + + @Override + public boolean next() throws IOException { + upto++; + return upto < innerTermCount; + } + + @Override + public int startPosition() { + return innerTerms[upto][0]; + } + + @Override + public int endPosition() { + return innerTerms[upto][0]; + } + + @Override + public int startOffset() throws IOException { + return innerTerms[upto][1]; + } + + @Override + public int endOffset() throws IOException { + return innerTerms[upto][2]; + } + + @Override + public MatchesIterator getSubMatches() throws IOException { + return MatchesIterator.EMPTY_ITERATOR; + } + + @Override + public Object label() { + return this; + } + }; + } + + @Override + public Object label() { + return SpanWeight.this; + } + + void collectInnerTerms() throws IOException { + termCollector.reset(); + spans.collect(termCollector); + Arrays.sort(innerTerms, 0, innerTermCount, Comparator.comparingInt(a -> a[0])); // order the collected entries by element 0 without boxing the int key + } + }; + }); + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMatchesIterator.java b/lucene/core/src/test/org/apache/lucene/search/TestMatchesIterator.java index 
3855b04ad6b..86e43994ed7 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestMatchesIterator.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestMatchesIterator.java @@ -18,8 +18,12 @@ package org.apache.lucene.search; import java.io.IOException; +import java.util.Arrays; import java.util.HashSet; +import java.util.IdentityHashMap; +import java.util.Objects; import java.util.Set; +import java.util.stream.Collectors; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; @@ -29,9 +33,14 @@ import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.Term; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -99,7 +108,7 @@ public class TestMatchesIterator extends LuceneTestCase { "nothing matches this document" }; - void checkMatches(Query q, String field, int[][] expected) throws IOException { + private void checkMatches(Query q, String field, int[][] expected) throws IOException { Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1); for (int i = 0; i < expected.length; i++) { LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(expected[i][0], searcher.leafContexts)); @@ -112,14 +121,40 @@ public class TestMatchesIterator extends LuceneTestCase { MatchesIterator it = matches.getMatches(field); if (expected[i].length == 1) { assertNull(it); - return; + continue; } checkFieldMatches(it, expected[i]); 
checkFieldMatches(matches.getMatches(field), expected[i]); // test multiple calls } } - void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException { + private void checkLabelCount(Query q, String field, int[] expected) throws IOException { + Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1); + for (int i = 0; i < expected.length; i++) { + LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts)); + int doc = i - ctx.docBase; + Matches matches = w.matches(ctx, doc); + if (matches == null) { + assertEquals("Expected to get matches on document " + i, 0, expected[i]); + continue; + } + MatchesIterator it = matches.getMatches(field); + if (expected[i] == 0) { + assertNull(it); + continue; + } + else { + assertNotNull(it); + } + IdentityHashMap<Object, Integer> labels = new IdentityHashMap<>(); // identity-keyed, so size() counts distinct label instances + while (it.next()) { + labels.put(it.label(), 1); + } + assertEquals(expected[i], labels.size()); + } + } + + private void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException { int pos = 1; while (it.next()) { //System.out.println(expected[i][pos] + "->" + expected[i][pos + 1] + "[" + expected[i][pos + 2] + "->" + expected[i][pos + 3] + "]"); @@ -132,7 +167,7 @@ public class TestMatchesIterator extends LuceneTestCase { assertEquals(expected.length, pos); } - void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException { + private void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException { Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1); for (int i = 0; i < expected.length; i++) { LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts)); @@ -148,8 +183,90 @@ public class TestMatchesIterator extends LuceneTestCase { } } + private void checkTermMatches(Query q, String field, TermMatch[][][] expected) throws IOException { + Weight w = 
searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1); + for (int i = 0; i < expected.length; i++) { + LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts)); + int doc = i - ctx.docBase; + Matches matches = w.matches(ctx, doc); + if (matches == null) { + assertEquals(0, expected[i].length); + continue; + } + MatchesIterator it = matches.getMatches(field); + if (expected[i].length == 0) { + assertNull(it); + continue; + } + checkTerms(expected[i], it); + } + } + + private void checkTerms(TermMatch[][] expected, MatchesIterator it) throws IOException { + int upTo = 0; + while (it.next()) { + Set<TermMatch> expectedMatches = new HashSet<>(Arrays.asList(expected[upTo])); + MatchesIterator submatches = it.getSubMatches(); + while (submatches.next()) { + TermMatch tm = new TermMatch(submatches.startPosition(), submatches.startOffset(), submatches.endOffset()); + if (expectedMatches.remove(tm) == false) { + fail("Unexpected term match: " + tm); + } + } + if (expectedMatches.size() != 0) { + fail("Missing term matches: " + expectedMatches.stream().map(Object::toString).collect(Collectors.joining(", "))); + } + upTo++; + } + if (upTo < expected.length) { // upTo counts visited matches; anything short of expected.length means a match was missed + fail("Missing expected match"); + } + } + + static class TermMatch { + + public final int position; + + public final int startOffset; + + public final int endOffset; + + public TermMatch(PostingsEnum pe, int position) throws IOException { + this.position = position; + this.startOffset = pe.startOffset(); + this.endOffset = pe.endOffset(); + } + + public TermMatch(int position, int startOffset, int endOffset) { + this.position = position; + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TermMatch termMatch = (TermMatch) o; + return position == termMatch.position && + startOffset == termMatch.startOffset && 
+ endOffset == termMatch.endOffset; + } + + @Override + public int hashCode() { + return Objects.hash(position, startOffset, endOffset); + } + + @Override + public String toString() { + return position + "[" + startOffset + "->" + endOffset + "]"; + } + } + public void testTermQuery() throws IOException { - Query q = new TermQuery(new Term(FIELD_WITH_OFFSETS, "w1")); + Term t = new Term(FIELD_WITH_OFFSETS, "w1"); + Query q = new TermQuery(t); checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{ { 0, 0, 0, 0, 2 }, { 1, 0, 0, 0, 2 }, @@ -157,6 +274,14 @@ public class TestMatchesIterator extends LuceneTestCase { { 3, 0, 0, 0, 2, 2, 2, 6, 8 }, { 4 } }); + checkTermMatches(q, FIELD_WITH_OFFSETS, new TermMatch[][][]{ + { { new TermMatch(0, 0, 2) } }, + { { new TermMatch(0, 0, 2) } }, + { { new TermMatch(0, 0, 2) } }, + { { new TermMatch(0, 0, 2) }, { new TermMatch(2, 6, 8) } }, + {} + }); + checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 1, 1, 1, 1, 0, 0 }); } public void testTermQueryNoStoredOffsets() throws IOException { @@ -191,6 +316,7 @@ public class TestMatchesIterator extends LuceneTestCase { { 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 }, { 4 } }); + checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 1, 2, 0, 0 }); } public void testDisjunctionNoPositions() throws IOException { @@ -215,6 +341,7 @@ public class TestMatchesIterator extends LuceneTestCase { { 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 }, { 4 } }); + checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 0, 2, 0, 0 }); } public void testReqOptNoPositions() throws IOException { @@ -248,6 +375,7 @@ public class TestMatchesIterator extends LuceneTestCase { { 3, 0, 0, 0, 2, 2, 2, 6, 8, 3, 3, 9, 11, 5, 5, 15, 17 }, { 4 } }); + checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 3, 1, 3, 3, 0, 0 }); } public void testMinShouldMatchNoPositions() throws IOException { @@ -331,6 +459,7 @@ public class TestMatchesIterator extends LuceneTestCase { { 3, 0, 0, 0, 2, 1, 1, 3, 5, 2, 2, 6, 8, 4, 4, 12, 14 }, { 4 } }); + 
checkLabelCount(rq, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 2, 2, 0 }); } @@ -392,12 +521,55 @@ public class TestMatchesIterator extends LuceneTestCase { // 0 1 2 3 4 5 6 7 // "a phrase sentence with many phrase sentence iterations of a phrase sentence", + public void testSloppyPhraseQueryWithRepeats() throws IOException { + Term p = new Term(FIELD_WITH_OFFSETS, "phrase"); + Term s = new Term(FIELD_WITH_OFFSETS, "sentence"); + PhraseQuery pq = new PhraseQuery(10, FIELD_WITH_OFFSETS, "phrase", "sentence", "sentence"); + checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{ + { 0 }, { 1 }, { 2 }, { 3 }, + { 4, 1, 6, 2, 43, 2, 11, 9, 75, 5, 11, 28, 75, 6, 11, 35, 75 } + }); + checkLabelCount(pq, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 }); + checkTermMatches(pq, FIELD_WITH_OFFSETS, new TermMatch[][][]{ + {}, {}, {}, {}, + { { + new TermMatch(1, 2, 8), + new TermMatch(2, 9, 17), + new TermMatch(6, 35, 43) + }, { + new TermMatch(5, 28, 34), + new TermMatch(2, 9, 17), + new TermMatch(11, 67, 75) + }, { + new TermMatch(5, 28, 34), + new TermMatch(6, 35, 43), + new TermMatch(11, 67, 75) + }, { + new TermMatch(10, 60, 66), + new TermMatch(6, 35, 43), + new TermMatch(11, 67, 75) + } } + }); + } + public void testSloppyPhraseQuery() throws IOException { + Term a = new Term(FIELD_WITH_OFFSETS, "a"); + Term s = new Term(FIELD_WITH_OFFSETS, "sentence"); PhraseQuery pq = new PhraseQuery(4, FIELD_WITH_OFFSETS, "a", "sentence"); checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{ { 0 }, { 1 }, { 2 }, { 3 }, { 4, 0, 2, 0, 17, 6, 9, 35, 59, 9, 11, 58, 75 } }); + checkTermMatches(pq, FIELD_WITH_OFFSETS, new TermMatch[][][]{ + {}, {}, {}, {}, + { { + new TermMatch(0, 0, 1), new TermMatch(2, 9, 17) + }, { + new TermMatch(9, 58, 59), new TermMatch(6, 35, 43) + }, { + new TermMatch(9, 58, 59), new TermMatch(11, 67, 75) + } } + }); } public void testExactPhraseQuery() throws IOException { @@ -407,28 +579,57 @@ public class TestMatchesIterator extends LuceneTestCase { { 4, 1, 2, 2, 17, 5, 6, 28, 
43, 10, 11, 60, 75 } }); + Term a = new Term(FIELD_WITH_OFFSETS, "a"); + Term s = new Term(FIELD_WITH_OFFSETS, "sentence"); PhraseQuery pq2 = new PhraseQuery.Builder() - .add(new Term(FIELD_WITH_OFFSETS, "a")) - .add(new Term(FIELD_WITH_OFFSETS, "sentence"), 2) + .add(a) + .add(s, 2) .build(); checkMatches(pq2, FIELD_WITH_OFFSETS, new int[][]{ { 0 }, { 1 }, { 2 }, { 3 }, { 4, 0, 2, 0, 17, 9, 11, 58, 75 } }); + checkTermMatches(pq2, FIELD_WITH_OFFSETS, new TermMatch[][][]{ + {}, {}, {}, {}, + { { + new TermMatch(0, 0, 1), new TermMatch(2, 9, 17) + }, { + new TermMatch(9, 58, 59), new TermMatch(11, 67, 75) + } } + }); } // 0 1 2 3 4 5 6 7 // "a phrase sentence with many phrase sentence iterations of a phrase sentence", public void testSloppyMultiPhraseQuery() throws IOException { + Term p = new Term(FIELD_WITH_OFFSETS, "phrase"); + Term s = new Term(FIELD_WITH_OFFSETS, "sentence"); + Term i = new Term(FIELD_WITH_OFFSETS, "iterations"); MultiPhraseQuery mpq = new MultiPhraseQuery.Builder() - .add(new Term(FIELD_WITH_OFFSETS, "phrase")) - .add(new Term[]{ new Term(FIELD_WITH_OFFSETS, "sentence"), new Term(FIELD_WITH_OFFSETS, "iterations") }) + .add(p) + .add(new Term[]{ s, i }) .setSlop(4) .build(); checkMatches(mpq, FIELD_WITH_OFFSETS, new int[][]{ { 0 }, { 1 }, { 2 }, { 3 }, - { 4, 1, 2, 2, 17, 5, 7, 28, 54, 5, 7, 28, 54, 10, 11, 60, 75 } + { 4, 1, 2, 2, 17, 5, 6, 28, 43, 5, 7, 28, 54, 10, 11, 60, 75 } + }); + checkTermMatches(mpq, FIELD_WITH_OFFSETS, new TermMatch[][][]{ + {}, {}, {}, {}, + { { + new TermMatch(1, 2, 8), + new TermMatch(2, 9, 17) + }, { + new TermMatch(5, 28, 34), + new TermMatch(6, 35, 43) + }, { + new TermMatch(5, 28, 34), + new TermMatch(7, 44, 54) + }, { + new TermMatch(10, 60, 66), + new TermMatch(11, 67, 75) + } } }); } @@ -452,4 +653,35 @@ public class TestMatchesIterator extends LuceneTestCase { }); } + // 0 1 2 3 4 5 6 7 + // "a phrase sentence with many phrase sentence iterations of a phrase sentence", + + public void testSpanQuery() throws 
IOException { + SpanQuery subq = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS) + .addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "with"))) + .addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "many"))) + .build(); + Query q = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS) + .addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "sentence"))) + .addClause(new SpanOrQuery(subq, new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "iterations")))) + .build(); + checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{ + { 0 }, { 1 }, { 2 }, { 3 }, + { 4, 2, 4, 9, 27, 6, 7, 35, 54 } + }); + checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 }); + checkTermMatches(q, FIELD_WITH_OFFSETS, new TermMatch[][][]{ + {}, {}, {}, {}, + { + { + new TermMatch(2, 9, 17), + new TermMatch(3, 18, 22), + new TermMatch(4, 23, 27) + }, { + new TermMatch(6, 35, 43), new TermMatch(7, 44, 54) + } + } + }); + } + } diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingMatchesIterator.java b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingMatchesIterator.java index 4f065129ec3..5f7e3070040 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingMatchesIterator.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingMatchesIterator.java @@ -67,4 +67,15 @@ class AssertingMatchesIterator implements MatchesIterator { return in.endOffset(); } + @Override + public MatchesIterator getSubMatches() throws IOException { + assert state == State.ITERATING : state; + return in.getSubMatches(); + } + + @Override + public Object label() { + assert state == State.ITERATING : state; + return in.label(); + } }