diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java index 975199bd141..f28c0511e3f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java @@ -45,14 +45,14 @@ final class DisjunctionMatchesIterator implements MatchesIterator { * * Only terms that have at least one match in the given document will be included */ - static MatchesIterator fromTerms(LeafReaderContext context, int doc, String field, List terms) throws IOException { + static MatchesIterator fromTerms(LeafReaderContext context, int doc, Query query, String field, List terms) throws IOException { Objects.requireNonNull(field); for (Term term : terms) { if (Objects.equals(field, term.field()) == false) { throw new IllegalArgumentException("Tried to generate iterator from terms in multiple fields: expected [" + field + "] but got [" + term.field() + "]"); } } - return fromTermsEnum(context, doc, field, asBytesRefIterator(terms)); + return fromTermsEnum(context, doc, query, field, asBytesRefIterator(terms)); } private static BytesRefIterator asBytesRefIterator(List terms) { @@ -72,7 +72,7 @@ final class DisjunctionMatchesIterator implements MatchesIterator { * * Only terms that have at least one match in the given document will be included */ - static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, String field, BytesRefIterator terms) throws IOException { + static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException { Objects.requireNonNull(field); List mis = new ArrayList<>(); Terms t = context.reader().terms(field); @@ -84,7 +84,7 @@ final class DisjunctionMatchesIterator implements MatchesIterator { if (te.seekExact(term)) { PostingsEnum pe = te.postings(reuse, 
PostingsEnum.OFFSETS); if (pe.advance(doc) == doc) { - mis.add(new TermMatchesIterator(pe)); + mis.add(new TermMatchesIterator(query, pe)); reuse = null; } else { @@ -158,4 +158,13 @@ final class DisjunctionMatchesIterator implements MatchesIterator { return queue.top().endOffset(); } + @Override + public MatchesIterator getSubMatches() throws IOException { + return queue.top().getSubMatches(); + } + + @Override + public Query getQuery() { + return queue.top().getQuery(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java b/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java index 99720b7e586..fe6d551a008 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java @@ -78,7 +78,7 @@ public final class DocValuesRewriteMethod extends MultiTermQuery.RewriteMethod { @Override public Matches matches(LeafReaderContext context, int doc) throws IOException { final SortedSetDocValues fcsi = DocValues.getSortedSet(context.reader(), query.field); - return Matches.forField(query.field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, query.field, getTermsEnum(fcsi))); + return Matches.forField(query.field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, query, query.field, getTermsEnum(fcsi))); } private TermsEnum getTermsEnum(SortedSetDocValues fcsi) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java b/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java index 450a352720e..47ab5f57fbf 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java @@ -28,6 +28,9 @@ import org.apache.lucene.index.LeafReaderContext; * positions and/or offsets after each call. 
You should not call the position or offset methods * before {@link #next()} has been called, or after {@link #next()} has returned {@code false}. * + * Matches from some queries may span multiple positions. You can retrieve the positions of + * individual matching terms on the current match by calling {@link #getSubMatches()}. + * * Matches are ordered by start position, and then by end position. Match intervals may overlap. * * @see Weight#matches(LeafReaderContext, int) @@ -70,4 +73,25 @@ public interface MatchesIterator { */ int endOffset() throws IOException; + /** + * Returns a MatchesIterator that iterates over the positions and offsets of individual + * terms within the current match. + * + * Returns {@code null} if there are no submatches (i.e. the current iterator is at the + * leaf level). + * + * Should only be called after {@link #next()} has returned {@code true}. + */ + MatchesIterator getSubMatches() throws IOException; + + /** + * Returns the Query causing the current match. + * + * If this {@link MatchesIterator} has been returned from a {@link #getSubMatches()} + * call, then returns a {@link TermQuery} equivalent to the current match. + * + * Should only be called after {@link #next()} has returned {@code true}. + */ + Query getQuery(); + } diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java index 22b71279012..c8d22baec91 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java @@ -269,7 +269,7 @@ public class MultiPhraseQuery extends Query { TermState termState = termStates.get(term).get(context); if (termState != null) { termsEnum.seekExact(term.bytes(), termState); - postings.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS)); + postings.add(termsEnum.postings(null, exposeOffsets ? 
PostingsEnum.ALL : PostingsEnum.POSITIONS)); totalMatchCost += PhraseQuery.termPositionsCost(termsEnum); } } @@ -294,7 +294,7 @@ public class MultiPhraseQuery extends Query { return new ExactPhraseMatcher(postingsFreqs, totalMatchCost); } else { - return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost); + return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets); } } @@ -647,5 +647,6 @@ public class MultiPhraseQuery extends Query { public BytesRef getPayload() throws IOException { return posQueue.top().pe.getPayload(); } + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java index 22997762119..1e59ef90cc0 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java @@ -211,7 +211,7 @@ final class MultiTermQueryConstantScoreWrapper extends if (terms.hasPositions() == false) { return super.matches(context, doc); } - return Matches.forField(query.field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, query.field, query.getTermsEnum(terms))); + return Matches.forField(query.field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, query, query.field, query.getTermsEnum(terms))); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseMatcher.java b/lucene/core/src/java/org/apache/lucene/search/PhraseMatcher.java index 81040d517c4..a3cbd344fda 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseMatcher.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseMatcher.java @@ -88,4 +88,5 @@ abstract class PhraseMatcher { public float getMatchCost() { return matchCost; } + } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java 
b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index 70d2e09ef3b..8f042716a65 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -446,7 +446,7 @@ public class PhraseQuery extends Query { return null; } te.seekExact(t.bytes(), state); - PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS); + PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS); postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions[i], t); totalMatchCost += termPositionsCost(te); } @@ -457,7 +457,7 @@ public class PhraseQuery extends Query { return new ExactPhraseMatcher(postingsFreqs, totalMatchCost); } else { - return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost); + return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java b/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java index 90fa537c366..94e57d64fbc 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseWeight.java @@ -123,6 +123,16 @@ abstract class PhraseWeight extends Weight { public int endOffset() throws IOException { return matcher.endOffset(); } + + @Override + public MatchesIterator getSubMatches() throws IOException { + return null; // phrases are treated as leaves + } + + @Override + public Query getQuery() { + return PhraseWeight.this.getQuery(); + } }; }); } diff --git a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseMatcher.java b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseMatcher.java index 326816d1428..bfe0682dd74 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseMatcher.java +++ 
b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseMatcher.java @@ -54,13 +54,14 @@ final class SloppyPhraseMatcher extends PhraseMatcher { private final int slop; private final int numPostings; private final PhraseQueue pq; // for advancing min position + private final boolean captureLeadMatch; private int end; // current largest phrase position private int leadPosition; private int leadOffset; - private int currentEndPostings; - private int advanceEndPostings; + private int leadEndOffset; + private int leadOrd; private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc) private boolean checkedRpts; // flag to only check for repetitions in first candidate doc @@ -71,10 +72,11 @@ final class SloppyPhraseMatcher extends PhraseMatcher { private boolean positioned; private int matchLength; - SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost) { + SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost, boolean captureLeadMatch) { super(approximation(postings), matchCost); this.slop = slop; this.numPostings = postings.length; + this.captureLeadMatch = captureLeadMatch; pq = new PhraseQueue(postings.length); phrasePositions = new PhrasePositions[postings.length]; for (int i = 0; i < postings.length; ++i) { @@ -120,10 +122,8 @@ final class SloppyPhraseMatcher extends PhraseMatcher { return false; } PhrasePositions pp = pq.pop(); - assert pp != null; // if the pq is empty, then positioned == false - leadPosition = pp.position + pp.offset; - leadOffset = pp.postings.startOffset(); - currentEndPostings = advanceEndPostings; + assert pp != null; // if the pq is not full, then positioned == false + captureLead(pp); matchLength = end - pp.position; int next = pq.top().position; while (advancePP(pp)) { @@ -137,6 +137,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher { } pp = pq.pop(); next = pq.top().position; + assert pp != null; // if the pq is not 
full, then positioned == false matchLength = end - pp.position; } else { int matchLength2 = end - pp.position; @@ -144,14 +145,22 @@ final class SloppyPhraseMatcher extends PhraseMatcher { matchLength = matchLength2; } } - leadPosition = pp.position + pp.offset; - leadOffset = pp.postings.startOffset(); - currentEndPostings = advanceEndPostings; + captureLead(pp); } positioned = false; return matchLength <= slop; } + private void captureLead(PhrasePositions pp) throws IOException { + if (captureLeadMatch == false) { + return; + } + leadOrd = pp.ord; + leadPosition = pp.position + pp.offset; + leadOffset = pp.postings.startOffset(); + leadEndOffset = pp.postings.endOffset(); + } + @Override public int startPosition() { // when a match is detected, the top postings is advanced until it has moved @@ -160,6 +169,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher { // However, the priority queue doesn't guarantee that the top postings is in fact the // earliest in the list, so we need to cycle through all terms to check. // this is slow, but Matches is slow anyway... + int leadPosition = this.leadPosition; for (PhrasePositions pp : phrasePositions) { leadPosition = Math.min(leadPosition, pp.position + pp.offset); } @@ -168,7 +178,13 @@ final class SloppyPhraseMatcher extends PhraseMatcher { @Override public int endPosition() { - return phrasePositions[currentEndPostings].position + phrasePositions[currentEndPostings].offset; + int endPosition = leadPosition; + for (PhrasePositions pp : phrasePositions) { + if (pp.ord != leadOrd) { + endPosition = Math.max(endPosition, pp.position + pp.offset); + } + } + return endPosition; } @Override @@ -179,6 +195,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher { // However, the priority queue doesn't guarantee that the top postings is in fact the // earliest in the list, so we need to cycle through all terms to check // this is slow, but Matches is slow anyway... 
+ int leadOffset = this.leadOffset; for (PhrasePositions pp : phrasePositions) { leadOffset = Math.min(leadOffset, pp.postings.startOffset()); } @@ -187,7 +204,13 @@ final class SloppyPhraseMatcher extends PhraseMatcher { @Override public int endOffset() throws IOException { - return phrasePositions[currentEndPostings].postings.endOffset(); + int endOffset = leadEndOffset; + for (PhrasePositions pp : phrasePositions) { + if (pp.ord != leadOrd) { + endOffset = Math.max(endOffset, pp.postings.endOffset()); + } + } + return endOffset; } /** advance a PhrasePosition and update 'end', return false if exhausted */ @@ -197,12 +220,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher { } if (pp.position > end) { end = pp.position; - advanceEndPostings = pp.ord; - } - if (pp.position == end) { - if (pp.ord > advanceEndPostings) { - advanceEndPostings = pp.ord; - } } return true; } @@ -307,12 +324,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher { pp.firstPosition(); if (pp.position > end) { end = pp.position; - advanceEndPostings = pp.ord; - } - if (pp.position == end) { - if (pp.ord > advanceEndPostings) { - advanceEndPostings = pp.ord; - } } pq.add(pp); } @@ -342,12 +353,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher { for (PhrasePositions pp : phrasePositions) { // iterate cyclic list: done once handled max if (pp.position > end) { end = pp.position; - advanceEndPostings = pp.ord; - } - if (pp.position == end) { - if (pp.ord > advanceEndPostings) { - advanceEndPostings = pp.ord; - } } pq.add(pp); } diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index 68453ac014d..97e127ad232 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -176,7 +176,7 @@ public final class SynonymQuery extends Query { if (terms == null || terms.hasPositions() == false) { return 
super.matches(context, doc); } - return Matches.forField(field, () -> DisjunctionMatchesIterator.fromTerms(context, doc, field, Arrays.asList(SynonymQuery.this.terms))); + return Matches.forField(field, () -> DisjunctionMatchesIterator.fromTerms(context, doc, getQuery(), field, Arrays.asList(SynonymQuery.this.terms))); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java index 7145a83db30..3ec7380cb8b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java @@ -226,7 +226,7 @@ public class TermInSetQuery extends Query implements Accountable { if (terms == null || terms.hasPositions() == false) { return super.matches(context, doc); } - return Matches.forField(field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, field, termData.iterator())); + return Matches.forField(field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, getQuery(), field, termData.iterator())); } /** diff --git a/lucene/core/src/java/org/apache/lucene/search/TermMatchesIterator.java b/lucene/core/src/java/org/apache/lucene/search/TermMatchesIterator.java index defc3af5751..968b8154a69 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermMatchesIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermMatchesIterator.java @@ -29,12 +29,14 @@ class TermMatchesIterator implements MatchesIterator { private int upto; private int pos; private final PostingsEnum pe; + private final Query query; /** * Create a new {@link TermMatchesIterator} for the given term and postings list */ - TermMatchesIterator(PostingsEnum pe) throws IOException { + TermMatchesIterator(Query query, PostingsEnum pe) throws IOException { this.pe = pe; + this.query = query; this.upto = pe.freq(); } @@ -67,4 +69,13 @@ class TermMatchesIterator implements MatchesIterator { return 
pe.endOffset(); } + @Override + public MatchesIterator getSubMatches() throws IOException { + return null; + } + + @Override + public Query getQuery() { + return query; + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java index ccb1de670e1..46901f2befd 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java @@ -94,7 +94,7 @@ public class TermQuery extends Query { if (pe.advance(doc) != doc) { return null; } - return new TermMatchesIterator(pe); + return new TermMatchesIterator(getQuery(), pe); }); } diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java index 0313d56510c..edde8bb0ba9 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java @@ -18,6 +18,8 @@ package org.apache.lucene.search.spans; import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; import java.util.Map; import org.apache.lucene.index.LeafReaderContext; @@ -28,6 +30,10 @@ import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.LeafSimScorer; +import org.apache.lucene.search.Matches; +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.Weight; import org.apache.lucene.search.similarities.Similarity; @@ -161,4 +167,138 @@ public abstract class SpanWeight extends Weight { return Explanation.noMatch("no matching term"); } + + private static class TermMatch { + Term term; + int position; + int startOffset; + int endOffset; + } + + 
@Override + public Matches matches(LeafReaderContext context, int doc) throws IOException { + return Matches.forField(field, () -> { + Spans spans = getSpans(context, Postings.OFFSETS); + if (spans == null || spans.advance(doc) != doc) { + return null; + } + return new MatchesIterator() { + + int innerTermCount = 0; + TermMatch[] innerTerms = new TermMatch[0]; + + SpanCollector termCollector = new SpanCollector() { + @Override + public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException { + innerTermCount++; + if (innerTermCount > innerTerms.length) { + TermMatch[] temp = new TermMatch[innerTermCount]; + System.arraycopy(innerTerms, 0, temp, 0, innerTermCount - 1); + innerTerms = temp; + innerTerms[innerTermCount - 1] = new TermMatch(); + } + innerTerms[innerTermCount - 1].term = term; + innerTerms[innerTermCount - 1].position = position; + innerTerms[innerTermCount - 1].startOffset = postings.startOffset(); + innerTerms[innerTermCount - 1].endOffset = postings.endOffset(); + } + + @Override + public void reset() { + innerTermCount = 0; + } + }; + + @Override + public boolean next() throws IOException { + innerTermCount = 0; + return spans.nextStartPosition() != Spans.NO_MORE_POSITIONS; + } + + @Override + public int startPosition() { + return spans.startPosition(); + } + + @Override + public int endPosition() { + return spans.endPosition() - 1; + } + + @Override + public int startOffset() throws IOException { + if (innerTermCount == 0) { + collectInnerTerms(); + } + return innerTerms[0].startOffset; + } + + @Override + public int endOffset() throws IOException { + if (innerTermCount == 0) { + collectInnerTerms(); + } + return innerTerms[innerTermCount - 1].endOffset; + } + + @Override + public MatchesIterator getSubMatches() throws IOException { + if (innerTermCount == 0) { + collectInnerTerms(); + } + return new MatchesIterator() { + + int upto = -1; + + @Override + public boolean next() throws IOException { + upto++; + return upto 
< innerTermCount; + } + + @Override + public int startPosition() { + return innerTerms[upto].position; + } + + @Override + public int endPosition() { + return innerTerms[upto].position; + } + + @Override + public int startOffset() throws IOException { + return innerTerms[upto].startOffset; + } + + @Override + public int endOffset() throws IOException { + return innerTerms[upto].endOffset; + } + + @Override + public MatchesIterator getSubMatches() throws IOException { + return null; + } + + @Override + public Query getQuery() { + return new TermQuery(innerTerms[upto].term); + } + }; + } + + @Override + public Query getQuery() { + return SpanWeight.this.getQuery(); + } + + void collectInnerTerms() throws IOException { + termCollector.reset(); + spans.collect(termCollector); + Arrays.sort(innerTerms, 0, innerTermCount, Comparator.comparing(a -> a.position)); + } + }; + }); + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMatchesIterator.java b/lucene/core/src/test/org/apache/lucene/search/TestMatchesIterator.java index 3855b04ad6b..e77fca6d1d9 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestMatchesIterator.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestMatchesIterator.java @@ -18,8 +18,12 @@ package org.apache.lucene.search; import java.io.IOException; +import java.util.Arrays; import java.util.HashSet; +import java.util.IdentityHashMap; +import java.util.Objects; import java.util.Set; +import java.util.stream.Collectors; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; @@ -29,9 +33,14 @@ import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.Term; +import 
org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -99,7 +108,7 @@ public class TestMatchesIterator extends LuceneTestCase { "nothing matches this document" }; - void checkMatches(Query q, String field, int[][] expected) throws IOException { + private void checkMatches(Query q, String field, int[][] expected) throws IOException { Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1); for (int i = 0; i < expected.length; i++) { LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(expected[i][0], searcher.leafContexts)); @@ -112,14 +121,40 @@ public class TestMatchesIterator extends LuceneTestCase { MatchesIterator it = matches.getMatches(field); if (expected[i].length == 1) { assertNull(it); - return; + continue; } checkFieldMatches(it, expected[i]); checkFieldMatches(matches.getMatches(field), expected[i]); // test multiple calls } } - void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException { + private void checkLabelCount(Query q, String field, int[] expected) throws IOException { + Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1); + for (int i = 0; i < expected.length; i++) { + LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts)); + int doc = i - ctx.docBase; + Matches matches = w.matches(ctx, doc); + if (matches == null) { + assertEquals("Expected to get matches on document " + i, 0, expected[i]); + continue; + } + MatchesIterator it = matches.getMatches(field); + if (expected[i] == 0) { + assertNull(it); + continue; + } + else { + assertNotNull(it); + } + IdentityHashMap labels = new IdentityHashMap<>(); + while (it.next()) { + labels.put(it.getQuery(), 1); + } + 
assertEquals(expected[i], labels.size()); + } + } + + private void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException { int pos = 1; while (it.next()) { //System.out.println(expected[i][pos] + "->" + expected[i][pos + 1] + "[" + expected[i][pos + 2] + "->" + expected[i][pos + 3] + "]"); @@ -132,7 +167,7 @@ public class TestMatchesIterator extends LuceneTestCase { assertEquals(expected.length, pos); } - void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException { + private void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException { Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1); for (int i = 0; i < expected.length; i++) { LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts)); @@ -148,8 +183,109 @@ public class TestMatchesIterator extends LuceneTestCase { } } + /** Asserts that every match {@code q} produces in {@code field}, in any document, has no sub-matches. */ private void assertIsLeafMatch(Query q, String field) throws IOException { + Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1); + for (int i = 0; i < searcher.reader.maxDoc(); i++) { + LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts)); + int doc = i - ctx.docBase; + Matches matches = w.matches(ctx, doc); + if (matches == null) { + continue; /* this doc does not match; later docs still need checking */ + } + MatchesIterator mi = matches.getMatches(field); + if (mi == null) { + continue; /* no matches in this field for this doc; keep scanning */ + } + while (mi.next()) { + assertNull(mi.getSubMatches()); + } + } + } + + private void checkTermMatches(Query q, String field, TermMatch[][][] expected) throws IOException { + Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1); + for (int i = 0; i < expected.length; i++) { + LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts)); + int doc = i - ctx.docBase; + Matches matches = w.matches(ctx, doc); + if (matches == null) { + assertEquals(expected[i].length, 0); +
continue; + } + MatchesIterator it = matches.getMatches(field); + if (expected[i].length == 0) { + assertNull(it); + continue; + } + checkTerms(expected[i], it); + } + } + + /** Checks that each successive match of {@code it} exposes exactly the term sub-matches listed in the corresponding row of {@code expected}. */ private void checkTerms(TermMatch[][] expected, MatchesIterator it) throws IOException { + int upTo = 0; + while (it.next()) { + Set<TermMatch> expectedMatches = new HashSet<>(Arrays.asList(expected[upTo])); + MatchesIterator submatches = it.getSubMatches(); + while (submatches.next()) { + TermMatch tm = new TermMatch(submatches.startPosition(), submatches.startOffset(), submatches.endOffset()); + if (expectedMatches.remove(tm) == false) { + fail("Unexpected term match: " + tm); + } + } + if (expectedMatches.size() != 0) { + fail("Missing term matches: " + expectedMatches.stream().map(Object::toString).collect(Collectors.joining(", "))); + } + upTo++; + } + if (upTo < expected.length) { /* upTo is the number of matches consumed, so any shortfall is a missing match */ + fail("Missing expected match"); + } + } + + static class TermMatch { + + public final int position; + + public final int startOffset; + + public final int endOffset; + + public TermMatch(PostingsEnum pe, int position) throws IOException { + this.position = position; + this.startOffset = pe.startOffset(); + this.endOffset = pe.endOffset(); + } + + public TermMatch(int position, int startOffset, int endOffset) { + this.position = position; + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TermMatch termMatch = (TermMatch) o; + return position == termMatch.position && + startOffset == termMatch.startOffset && + endOffset == termMatch.endOffset; + } + + @Override + public int hashCode() { + return Objects.hash(position, startOffset, endOffset); + } + + @Override + public String toString() { + return position + "[" + startOffset + "->" + endOffset + "]"; + } + } + public void testTermQuery() throws IOException { - Query q = new TermQuery(new 
Term(FIELD_WITH_OFFSETS, "w1")); + Term t = new Term(FIELD_WITH_OFFSETS, "w1"); + Query q = new TermQuery(t); checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{ { 0, 0, 0, 0, 2 }, { 1, 0, 0, 0, 2 }, @@ -157,6 +293,8 @@ public class TestMatchesIterator extends LuceneTestCase { { 3, 0, 0, 0, 2, 2, 2, 6, 8 }, { 4 } }); + checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 1, 1, 1, 1, 0, 0 }); + assertIsLeafMatch(q, FIELD_WITH_OFFSETS); } public void testTermQueryNoStoredOffsets() throws IOException { @@ -191,6 +329,8 @@ public class TestMatchesIterator extends LuceneTestCase { { 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 }, { 4 } }); + checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 1, 2, 0, 0 }); + assertIsLeafMatch(q, FIELD_WITH_OFFSETS); } public void testDisjunctionNoPositions() throws IOException { @@ -215,6 +355,7 @@ public class TestMatchesIterator extends LuceneTestCase { { 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 }, { 4 } }); + checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 0, 2, 0, 0 }); } public void testReqOptNoPositions() throws IOException { @@ -248,6 +389,8 @@ public class TestMatchesIterator extends LuceneTestCase { { 3, 0, 0, 0, 2, 2, 2, 6, 8, 3, 3, 9, 11, 5, 5, 15, 17 }, { 4 } }); + checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 3, 1, 3, 3, 0, 0 }); + assertIsLeafMatch(q, FIELD_WITH_OFFSETS); } public void testMinShouldMatchNoPositions() throws IOException { @@ -331,6 +474,8 @@ public class TestMatchesIterator extends LuceneTestCase { { 3, 0, 0, 0, 2, 1, 1, 3, 5, 2, 2, 6, 8, 4, 4, 12, 14 }, { 4 } }); + checkLabelCount(rq, FIELD_WITH_OFFSETS, new int[]{ 1, 1, 1, 1, 0 }); + assertIsLeafMatch(rq, FIELD_WITH_OFFSETS); } @@ -357,6 +502,7 @@ public class TestMatchesIterator extends LuceneTestCase { { 3, 0, 0, 0, 2, 1, 1, 3, 5, 2, 2, 6, 8, 4, 4, 12, 14 }, { 4 } }); + assertIsLeafMatch(q, FIELD_WITH_OFFSETS); } public void testSynonymQueryNoPositions() throws IOException { @@ -392,12 +538,25 @@ public class TestMatchesIterator extends 
LuceneTestCase { // 0 1 2 3 4 5 6 7 // "a phrase sentence with many phrase sentence iterations of a phrase sentence", + public void testSloppyPhraseQueryWithRepeats() throws IOException { + Term p = new Term(FIELD_WITH_OFFSETS, "phrase"); + Term s = new Term(FIELD_WITH_OFFSETS, "sentence"); + PhraseQuery pq = new PhraseQuery(10, FIELD_WITH_OFFSETS, "phrase", "sentence", "sentence"); + checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{ + { 0 }, { 1 }, { 2 }, { 3 }, + { 4, 1, 6, 2, 43, 2, 11, 9, 75, 5, 11, 28, 75, 6, 11, 35, 75 } + }); + checkLabelCount(pq, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 }); + assertIsLeafMatch(pq, FIELD_WITH_OFFSETS); + } + public void testSloppyPhraseQuery() throws IOException { PhraseQuery pq = new PhraseQuery(4, FIELD_WITH_OFFSETS, "a", "sentence"); checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{ { 0 }, { 1 }, { 2 }, { 3 }, { 4, 0, 2, 0, 17, 6, 9, 35, 59, 9, 11, 58, 75 } }); + assertIsLeafMatch(pq, FIELD_WITH_OFFSETS); } public void testExactPhraseQuery() throws IOException { @@ -407,29 +566,36 @@ public class TestMatchesIterator extends LuceneTestCase { { 4, 1, 2, 2, 17, 5, 6, 28, 43, 10, 11, 60, 75 } }); + Term a = new Term(FIELD_WITH_OFFSETS, "a"); + Term s = new Term(FIELD_WITH_OFFSETS, "sentence"); PhraseQuery pq2 = new PhraseQuery.Builder() - .add(new Term(FIELD_WITH_OFFSETS, "a")) - .add(new Term(FIELD_WITH_OFFSETS, "sentence"), 2) + .add(a) + .add(s, 2) .build(); checkMatches(pq2, FIELD_WITH_OFFSETS, new int[][]{ { 0 }, { 1 }, { 2 }, { 3 }, { 4, 0, 2, 0, 17, 9, 11, 58, 75 } }); + assertIsLeafMatch(pq2, FIELD_WITH_OFFSETS); } // 0 1 2 3 4 5 6 7 // "a phrase sentence with many phrase sentence iterations of a phrase sentence", public void testSloppyMultiPhraseQuery() throws IOException { + Term p = new Term(FIELD_WITH_OFFSETS, "phrase"); + Term s = new Term(FIELD_WITH_OFFSETS, "sentence"); + Term i = new Term(FIELD_WITH_OFFSETS, "iterations"); MultiPhraseQuery mpq = new MultiPhraseQuery.Builder() - .add(new 
Term(FIELD_WITH_OFFSETS, "phrase")) - .add(new Term[]{ new Term(FIELD_WITH_OFFSETS, "sentence"), new Term(FIELD_WITH_OFFSETS, "iterations") }) + .add(p) + .add(new Term[]{ s, i }) .setSlop(4) .build(); checkMatches(mpq, FIELD_WITH_OFFSETS, new int[][]{ { 0 }, { 1 }, { 2 }, { 3 }, - { 4, 1, 2, 2, 17, 5, 7, 28, 54, 5, 7, 28, 54, 10, 11, 60, 75 } + { 4, 1, 2, 2, 17, 5, 6, 28, 43, 5, 7, 28, 54, 10, 11, 60, 75 } }); + assertIsLeafMatch(mpq, FIELD_WITH_OFFSETS); } public void testExactMultiPhraseQuery() throws IOException { @@ -450,6 +616,38 @@ public class TestMatchesIterator extends LuceneTestCase { { 0 }, { 1 }, { 2 }, { 3 }, { 4, 0, 1, 0, 8, 4, 5, 23, 34, 9, 10, 58, 66 } }); + assertIsLeafMatch(mpq2, FIELD_WITH_OFFSETS); + } + + // 0 1 2 3 4 5 6 7 + // "a phrase sentence with many phrase sentence iterations of a phrase sentence", + + public void testSpanQuery() throws IOException { + SpanQuery subq = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS) + .addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "with"))) + .addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "many"))) + .build(); + Query q = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS) + .addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "sentence"))) + .addClause(new SpanOrQuery(subq, new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "iterations")))) + .build(); + checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{ + { 0 }, { 1 }, { 2 }, { 3 }, + { 4, 2, 4, 9, 27, 6, 7, 35, 54 } + }); + checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 }); + checkTermMatches(q, FIELD_WITH_OFFSETS, new TermMatch[][][]{ + {}, {}, {}, {}, + { + { + new TermMatch(2, 9, 17), + new TermMatch(3, 18, 22), + new TermMatch(4, 23, 27) + }, { + new TermMatch(6, 35, 43), new TermMatch(7, 44, 54) + } + } + }); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingMatchesIterator.java b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingMatchesIterator.java 
index 4f065129ec3..36a56338a19 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingMatchesIterator.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingMatchesIterator.java @@ -67,4 +67,15 @@ class AssertingMatchesIterator implements MatchesIterator { return in.endOffset(); } + @Override + public MatchesIterator getSubMatches() throws IOException { + assert state == State.ITERATING : state; + return in.getSubMatches(); + } + + @Override + public Query getQuery() { + assert state == State.ITERATING : state; + return in.getQuery(); + } }