From 7d34bfdaad37a25bfda5df2bf2c2bd1a31540bc2 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 7 Jan 2019 13:33:59 +0000 Subject: [PATCH] LUCENE-8629: New interval functions: before, after, within, overlapping --- lucene/CHANGES.txt | 11 +- .../search/intervals/IntervalFunction.java | 119 ++++++------ .../lucene/search/intervals/Intervals.java | 41 ++++- .../intervals/OffsetIntervalsSource.java | 172 ++++++++++++++++++ .../search/intervals/TestIntervals.java | 103 +++++++++++ 5 files changed, 389 insertions(+), 57 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/search/intervals/OffsetIntervalsSource.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2f29d6e5801..48f9aa5ae8a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -108,10 +108,6 @@ API Changes * LUCENE-8609: Remove IndexWriter#numDocs() and IndexWriter#maxDoc() in favor of IndexWriter#getDocStats(). (Simon Willnauer) -* LUCENE-8612: Intervals.extend() treats an interval as if it covered a wider - span than it actually does, allowing users to force minimum gaps between - intervals in a phrase. (Alan Woodward) - Changes in Runtime Behavior * LUCENE-8333: Switch MoreLikeThis.setMaxDocFreqPct to use maxDoc instead of @@ -177,6 +173,13 @@ New Features to read-ahead in the token stream and take into account graph structures. This also changes FixedShingleFilter to extend GraphTokenFilter (Alan Woodward) +* LUCENE-8612: Intervals.extend() treats an interval as if it covered a wider + span than it actually does, allowing users to force minimum gaps between + intervals in a phrase. (Alan Woodward) + +* LUCENE-8629: New interval functions: Intervals.before(), Intervals.after(), + Intervals.within() and Intervals.overlapping(). (Alan Woodward) + Improvements * LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities. diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFunction.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFunction.java index 9460d8dd657..862e9a86a56 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFunction.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFunction.java @@ -353,25 +353,7 @@ abstract class IntervalFunction { throw new IllegalStateException("CONTAINING function requires two iterators"); IntervalIterator a = iterators.get(0); IntervalIterator b = iterators.get(1); - return new ConjunctionIntervalIterator(iterators) { - - boolean bpos; - - @Override - public int start() { - return a.start(); - } - - @Override - public int end() { - return a.end(); - } - - @Override - public int gaps() { - return a.gaps(); - } - + return new FilteringIntervalIterator(a, b) { @Override public int nextInterval() throws IOException { if (bpos == false) @@ -386,11 +368,6 @@ abstract class IntervalFunction { } return IntervalIterator.NO_MORE_INTERVALS; } - - @Override - protected void reset() throws IOException { - bpos = b.nextInterval() != NO_MORE_INTERVALS; - } }; } }; @@ -405,31 +382,7 @@ abstract class IntervalFunction { throw new IllegalStateException("CONTAINED_BY function requires two iterators"); IntervalIterator a = iterators.get(0); IntervalIterator b = iterators.get(1); - return new ConjunctionIntervalIterator(iterators) { - - boolean bpos; - - @Override - public int start() { - if (bpos == false) { - return NO_MORE_INTERVALS; - } - return a.start(); - } - - @Override - public int end() { - if (bpos == false) { - return NO_MORE_INTERVALS; - } - return a.end(); - } - - @Override - public int gaps() { - return a.gaps(); - } - + return new FilteringIntervalIterator(a, b) { @Override public int nextInterval() throws IOException { if (bpos == false) @@ -447,15 +400,79 @@ abstract class IntervalFunction { bpos = false; return IntervalIterator.NO_MORE_INTERVALS; } + }; + } + }; + static final IntervalFunction OVERLAPPING = new SingletonFunction("OVERLAPPING") { + @Override + public IntervalIterator apply(List iterators) { + if (iterators.size() != 2) + throw new IllegalStateException("OVERLAPPING function requires two iterators"); + IntervalIterator a = iterators.get(0); + IntervalIterator b = iterators.get(1); + return new FilteringIntervalIterator(a, b) { @Override - protected void reset() throws IOException { - bpos = b.nextInterval() != NO_MORE_INTERVALS; + public int nextInterval() throws IOException { + if (bpos == false) + return IntervalIterator.NO_MORE_INTERVALS; + while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { + while (b.end() < a.start()) { + if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { + bpos = false; + return IntervalIterator.NO_MORE_INTERVALS; + } + } + if (b.start() <= a.end()) + return a.start(); + } + bpos = false; + return IntervalIterator.NO_MORE_INTERVALS; } }; } }; + private static abstract class FilteringIntervalIterator extends ConjunctionIntervalIterator { + + final IntervalIterator a; + final IntervalIterator b; + + boolean bpos; + + protected FilteringIntervalIterator(IntervalIterator a, IntervalIterator b) { + super(Arrays.asList(a, b)); + this.a = a; + this.b = b; + } + + @Override + public int start() { + if (bpos == false) { + return NO_MORE_INTERVALS; + } + return a.start(); + } + + @Override + public int end() { + if (bpos == false) { + return NO_MORE_INTERVALS; + } + return a.end(); + } + + @Override + public int gaps() { + return a.gaps(); + } + + @Override + protected void reset() throws IOException { + bpos = b.nextInterval() != NO_MORE_INTERVALS; + } + } + private static abstract class SingletonFunction extends IntervalFunction { private final String name; diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java index a98adbdc7a3..aa5b62c850c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java @@ -171,6 +171,15 @@ public final class Intervals { return new DifferenceIntervalsSource(minuend, subtrahend, DifferenceIntervalFunction.NON_OVERLAPPING); } + /** + * Returns intervals from a source that overlap with intervals from another source + * @param source the source to filter + * @param reference the source to filter by + */ + public static IntervalsSource overlapping(IntervalsSource source, IntervalsSource reference) { + return new ConjunctionIntervalsSource(Arrays.asList(source, reference), IntervalFunction.OVERLAPPING); + } + /** * Create a not-within {@link IntervalsSource} * @@ -178,7 +187,7 @@ public final class Intervals { * intervals from the subtrahend query * * @param minuend the {@link IntervalsSource} to filter - * @param positions the maximum distance that intervals from the minuend may occur from intervals + * @param positions the minimum distance that intervals from the minuend may occur from intervals * of the subtrahend * @param subtrahend the {@link IntervalsSource} to filter by */ @@ -187,6 +196,18 @@ public final class Intervals { DifferenceIntervalFunction.NON_OVERLAPPING); } + /** + * Returns intervals of the source that appear within a set number of positions of intervals from + * the reference + * + * @param source the {@link IntervalsSource} to filter + * @param positions the maximum distance that intervals of the source may occur from intervals of the reference + * @param reference the {@link IntervalsSource} to filter by + */ + public static IntervalsSource within(IntervalsSource source, int positions, IntervalsSource reference) { + return containedBy(source, Intervals.extend(reference, positions, positions)); + } + /** * Create a not-containing {@link IntervalsSource} * @@ -237,6 +258,22 @@ public final class Intervals { return new ConjunctionIntervalsSource(Arrays.asList(small, big), IntervalFunction.CONTAINED_BY); } - // TODO: beforeQuery, afterQuery, arbitrary IntervalFunctions + /** + * Returns intervals from the source that appear before intervals from the reference + */ + public static IntervalsSource before(IntervalsSource source, IntervalsSource reference) { + return new ConjunctionIntervalsSource(Arrays.asList(source, + Intervals.extend(new OffsetIntervalsSource(reference, true), Integer.MAX_VALUE, 0)), + IntervalFunction.CONTAINED_BY); + } + + /** + * Returns intervals from the source that appear after intervals from the reference + */ + public static IntervalsSource after(IntervalsSource source, IntervalsSource reference) { + return new ConjunctionIntervalsSource(Arrays.asList(source, + Intervals.extend(new OffsetIntervalsSource(reference, false), 0, Integer.MAX_VALUE)), + IntervalFunction.CONTAINED_BY); + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/OffsetIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/OffsetIntervalsSource.java new file mode 100644 index 00000000000..470e2b539f6 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/OffsetIntervalsSource.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.intervals; + +import java.io.IOException; +import java.util.Objects; +import java.util.Set; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.MatchesIterator; + +/** + * Tracks a reference intervals source, and produces a pseudo-interval that appears + * either one position before or one position after each interval from the reference + */ +class OffsetIntervalsSource extends IntervalsSource { + + private final IntervalsSource in; + private final boolean before; + + OffsetIntervalsSource(IntervalsSource in, boolean before) { + this.in = in; + this.before = before; + } + + @Override + public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException { + IntervalIterator it = in.intervals(field, ctx); + if (it == null) { + return null; + } + return offset(it); + } + + private IntervalIterator offset(IntervalIterator it) { + if (before) { + return new OffsetIntervalIterator(it) { + @Override + public int start() { + int pos = in.start(); + if (pos == -1) { + return -1; + } + if (pos == NO_MORE_INTERVALS) { + return NO_MORE_INTERVALS; + } + return Math.max(0, pos - 1); + } + }; + } + else { + return new OffsetIntervalIterator(it) { + @Override + public int start() { + int pos = in.end() + 1; + if (pos == 0) { + return -1; + } + if (pos < 0) { // overflow + return Integer.MAX_VALUE; + } + if (pos == Integer.MAX_VALUE) { + return Integer.MAX_VALUE - 1; + } + return pos; + } + }; + } + } + + private static abstract class OffsetIntervalIterator extends IntervalIterator { + + final IntervalIterator in; + + OffsetIntervalIterator(IntervalIterator in) { + this.in = in; + } + + @Override + public int end() { + return start(); + } + + @Override + public int gaps() { + return 0; + } + + @Override + public int nextInterval() throws IOException { + in.nextInterval(); + return start(); + } + + @Override + public float matchCost() { + return in.matchCost(); + } + + @Override + public int docID() { + return in.docID(); + } + + @Override + public int nextDoc() throws IOException { + return in.nextDoc(); + } + + @Override + public int advance(int target) throws IOException { + return in.advance(target); + } + + @Override + public long cost() { + return in.cost(); + } + } + + @Override + public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + MatchesIterator mi = in.matches(field, ctx, doc); + if (mi == null) { + return null; + } + return IntervalMatches.asMatches(offset(IntervalMatches.wrapMatches(mi, doc)), mi, doc); + } + + @Override + public void extractTerms(String field, Set terms) { + in.extractTerms(field, terms); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + OffsetIntervalsSource that = (OffsetIntervalsSource) o; + return before == that.before && + Objects.equals(in, that.in); + } + + @Override + public int hashCode() { + return Objects.hash(in, before); + } + + @Override + public String toString() { + if (before) { + return ("PRECEDING(" + in + ")"); + } + return ("FOLLOWING(" + in + ")"); + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java index 139cea985c9..3276caa9446 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java +++ b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java @@ -330,6 +330,33 @@ public class TestIntervals extends LuceneTestCase { } + public void testOffsetIntervals() throws IOException { + IntervalsSource source = Intervals.unordered( + Intervals.term("pease"), + Intervals.term("porridge"), + Intervals.or(Intervals.term("hot"), Intervals.term("cold"))); + + IntervalsSource before = new OffsetIntervalsSource(source, true); + checkIntervals(before, "field1", 3, new int[][]{ + {}, + { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5 }, + { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5 }, + {}, + { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5 }, + {} + }); + + IntervalsSource after = new OffsetIntervalsSource(source, false); + checkIntervals(after, "field1", 3, new int[][]{ + {}, + { 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 18, 18 }, + { 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 18, 18 }, + {}, + { 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 18, 18 }, + {} + }); + } + public void testNesting2() throws IOException { IntervalsSource source = Intervals.unordered( Intervals.ordered( @@ -536,4 +563,80 @@ public class TestIntervals extends LuceneTestCase { }); } + public void testAfter() throws IOException { + IntervalsSource source = Intervals.after(Intervals.term("porridge"), + Intervals.ordered(Intervals.term("pease"), Intervals.term("cold"))); + checkIntervals(source, "field1", 3, new int[][]{ + {}, + { 7, 7 }, + { 4, 4, 7, 7 }, + {}, + { 7, 7 }, + {} + }); + + MatchesIterator mi = getMatches(source, 1, "field1"); + assertMatch(mi, 7, 7, 20, 55); + MatchesIterator sub = mi.getSubMatches(); + assertNotNull(sub); + assertMatch(sub, 3, 3, 20, 25); + assertMatch(sub, 5, 5, 35, 39); + assertMatch(sub, 7, 7, 47, 55); + assertFalse(sub.next()); + } + + public void testBefore() throws IOException { + IntervalsSource source = Intervals.before(Intervals.term("cold"), Intervals.term("twelve")); + checkIntervals(source, "field1", 2, new int[][]{ + {}, + {}, + { 2, 2 }, + {}, + { 5, 5 }, + {} + }); + } + + public void testWithin() throws IOException { + IntervalsSource source = Intervals.within(Intervals.term("hot"), 6, + Intervals.or(Intervals.term("porridge"), Intervals.term("fraggle"))); + checkIntervals(source, "field1", 3, new int[][]{ + {}, + { 2, 2 }, + { 5, 5, 21, 21 }, + {}, + { 2, 2 }, + {} + }); + } + + public void testOverlapping() throws IOException { + IntervalsSource source = Intervals.overlapping( + Intervals.unordered(Intervals.term("hot"), Intervals.term("porridge")), + Intervals.unordered(Intervals.term("cold"), Intervals.term("pease")) + ); + checkIntervals(source, "field1", 3, new int[][]{ + {}, + { 2, 4, 7, 17 }, + { 5, 7, 7, 21 }, + {}, + { 2, 4 }, + {} + }); + + assertGaps(source, 2, "field1", new int[]{ 1, 13 }); + + MatchesIterator mi = getMatches(source, 1, "field1"); + assertNotNull(mi); + assertMatch(mi, 2, 4, 15, 39); + MatchesIterator sub = mi.getSubMatches(); + assertNotNull(sub); + assertMatch(sub, 2, 2, 15, 18); + assertMatch(sub, 3, 3, 20, 25); + assertMatch(sub, 4, 4, 26, 34); + assertMatch(sub, 5, 5, 35, 39); + assertFalse(sub.next()); + assertMatch(mi, 7, 17, 41, 118); + } + }