diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/BlockIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/BlockIntervalsSource.java new file mode 100644 index 00000000000..07de202d69d --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/BlockIntervalsSource.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.search.intervals; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; + +class BlockIntervalsSource extends ConjunctionIntervalsSource { + + static IntervalsSource build(List subSources) { + if (subSources.size() == 1) { + return subSources.get(0); + } + return Intervals.or(Disjunctions.pullUp(subSources, BlockIntervalsSource::new)); + } + + private static List flatten(List sources) { + List flattened = new ArrayList<>(); + for (IntervalsSource s : sources) { + if (s instanceof BlockIntervalsSource) { + flattened.addAll(((BlockIntervalsSource)s).subSources); + } + else { + flattened.add(s); + } + } + return flattened; + } + + private BlockIntervalsSource(List sources) { + super(flatten(sources), true); + } + + @Override + protected IntervalIterator combine(List iterators) { + return new BlockIntervalIterator(iterators); + } + + @Override + public int minExtent() { + int minExtent = 0; + for (IntervalsSource subSource : subSources) { + minExtent += subSource.minExtent(); + } + return minExtent; + } + + @Override + public Collection pullUpDisjunctions() { + return Collections.singletonList(this); // Disjunctions already pulled up in build() + } + + @Override + public int hashCode() { + return Objects.hash(subSources); + } + + @Override + public boolean equals(Object other) { + if (other instanceof BlockIntervalsSource == false) return false; + BlockIntervalsSource b = (BlockIntervalsSource) other; + return Objects.equals(this.subSources, b.subSources); + } + + @Override + public String toString() { + return "BLOCK(" + subSources.stream().map(IntervalsSource::toString).collect(Collectors.joining(",")) + ")"; + } + + private static class BlockIntervalIterator extends ConjunctionIntervalIterator { + + int start = -1, end = -1; + + BlockIntervalIterator(List subIterators) { + 
super(subIterators); + } + + @Override + public int start() { + return start; + } + + @Override + public int end() { + return end; + } + + @Override + public int gaps() { + return 0; + } + + @Override + public int nextInterval() throws IOException { + if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) + return start = end = IntervalIterator.NO_MORE_INTERVALS; + int i = 1; + while (i < subIterators.size()) { + while (subIterators.get(i).start() <= subIterators.get(i - 1).end()) { + if (subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) + return start = end = IntervalIterator.NO_MORE_INTERVALS; + } + if (subIterators.get(i).start() == subIterators.get(i - 1).end() + 1) { + i = i + 1; + } + else { + if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) + return start = end = IntervalIterator.NO_MORE_INTERVALS; + i = 1; + } + } + start = subIterators.get(0).start(); + end = subIterators.get(subIterators.size() - 1).end(); + return start; + } + + @Override + protected void reset() { + start = end = -1; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ConjunctionIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ConjunctionIntervalsSource.java index 123cc38d430..6463dd5dcf8 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ConjunctionIntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ConjunctionIntervalsSource.java @@ -20,7 +20,6 @@ package org.apache.lucene.search.intervals; import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.Objects; import java.util.stream.Collectors; import org.apache.lucene.index.LeafReaderContext; @@ -31,28 +30,15 @@ import org.apache.lucene.search.MatchesUtils; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; -class ConjunctionIntervalsSource extends IntervalsSource { 
+abstract class ConjunctionIntervalsSource extends IntervalsSource { protected final List subSources; - protected final IntervalFunction function; + protected final boolean isMinimizing; - ConjunctionIntervalsSource(List subSources, IntervalFunction function) { + protected ConjunctionIntervalsSource(List subSources, boolean isMinimizing) { + assert subSources.size() > 1; this.subSources = subSources; - this.function = function; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - ConjunctionIntervalsSource that = (ConjunctionIntervalsSource) o; - return Objects.equals(subSources, that.subSources) && - Objects.equals(function, that.function); - } - - @Override - public String toString() { - return function + subSources.stream().map(Object::toString).collect(Collectors.joining(",", "(", ")")); + this.isMinimizing = isMinimizing; } @Override @@ -65,16 +51,7 @@ class ConjunctionIntervalsSource extends IntervalsSource { } @Override - public int minExtent() { - int minExtent = 0; - for (IntervalsSource source : subSources) { - minExtent += source.minExtent(); - } - return minExtent; - } - - @Override - public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException { + public final IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException { List subIntervals = new ArrayList<>(); for (IntervalsSource source : subSources) { IntervalIterator it = source.intervals(field, ctx); @@ -82,32 +59,32 @@ class ConjunctionIntervalsSource extends IntervalsSource { return null; subIntervals.add(it); } - return function.apply(subIntervals); + return combine(subIntervals); } + protected abstract IntervalIterator combine(List iterators); + @Override - public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + public final MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws 
IOException { List subs = new ArrayList<>(); for (IntervalsSource source : subSources) { MatchesIterator mi = source.matches(field, ctx, doc); if (mi == null) { return null; } + if (isMinimizing) { + mi = new CachingMatchesIterator(mi); + } subs.add(mi); } - IntervalIterator it = function.apply(subs.stream().map(m -> IntervalMatches.wrapMatches(m, doc)).collect(Collectors.toList())); + IntervalIterator it = combine(subs.stream().map(m -> IntervalMatches.wrapMatches(m, doc)).collect(Collectors.toList())); if (it.advance(doc) != doc) { return null; } if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { return null; } - return new ConjunctionMatchesIterator(it, subs); - } - - @Override - public int hashCode() { - return Objects.hash(subSources, function); + return isMinimizing ? new MinimizingConjunctionMatchesIterator(it, subs) : new ConjunctionMatchesIterator(it, subs); } private static class ConjunctionMatchesIterator implements MatchesIterator { diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ContainedByIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ContainedByIntervalsSource.java new file mode 100644 index 00000000000..3a9a05efa9f --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ContainedByIntervalsSource.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.intervals; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Objects; + +class ContainedByIntervalsSource extends ConjunctionIntervalsSource { + + static IntervalsSource build(IntervalsSource small, IntervalsSource big) { + return Intervals.or(Disjunctions.pullUp(big, s -> new ContainedByIntervalsSource(small, s))); + } + + private final IntervalsSource small; + private final IntervalsSource big; + + private ContainedByIntervalsSource(IntervalsSource small, IntervalsSource big) { + super(Arrays.asList(small, big), false); + this.small = small; + this.big = big; + } + + @Override + protected IntervalIterator combine(List iterators) { + assert iterators.size() == 2; + IntervalIterator a = iterators.get(0); + IntervalIterator b = iterators.get(1); + return new FilteringIntervalIterator(a, b) { + @Override + public int nextInterval() throws IOException { + if (bpos == false) + return IntervalIterator.NO_MORE_INTERVALS; + while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { + while (b.end() < a.end()) { + if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { + bpos = false; + return IntervalIterator.NO_MORE_INTERVALS; + } + } + if (b.start() <= a.start()) + return a.start(); + } + bpos = false; + return IntervalIterator.NO_MORE_INTERVALS; + } + }; + } + + @Override + public int minExtent() { + return small.minExtent(); + } + + @Override + public Collection pullUpDisjunctions() { + return Disjunctions.pullUp(big, s -> 
new ContainedByIntervalsSource(small, s)); + } + + @Override + public int hashCode() { + return Objects.hashCode(subSources); + } + + @Override + public boolean equals(Object other) { + if (other instanceof ContainedByIntervalsSource == false) return false; + ContainedByIntervalsSource o = (ContainedByIntervalsSource) other; + return Objects.equals(this.subSources, o.subSources); + } + + @Override + public String toString() { + return "CONTAINED_BY(" + small + "," + big + ")"; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ContainingIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ContainingIntervalsSource.java new file mode 100644 index 00000000000..5b7d431d4ab --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ContainingIntervalsSource.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.search.intervals; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Objects; + +class ContainingIntervalsSource extends ConjunctionIntervalsSource { + + private final IntervalsSource big; + private final IntervalsSource small; + + static IntervalsSource build(IntervalsSource big, IntervalsSource small) { + return Intervals.or(Disjunctions.pullUp(big, s -> new ContainingIntervalsSource(s, small))); + } + + private ContainingIntervalsSource(IntervalsSource big, IntervalsSource small) { + super(Arrays.asList(big, small), false); + this.big = big; + this.small = small; + } + + @Override + protected IntervalIterator combine(List iterators) { + assert iterators.size() == 2; + IntervalIterator a = iterators.get(0); + IntervalIterator b = iterators.get(1); + return new FilteringIntervalIterator(a, b) { + @Override + public int nextInterval() throws IOException { + if (bpos == false) + return IntervalIterator.NO_MORE_INTERVALS; + while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { + while (b.start() < a.start() && b.end() < a.end()) { + if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) + return IntervalIterator.NO_MORE_INTERVALS; + } + if (a.start() <= b.start() && a.end() >= b.end()) + return a.start(); + } + return IntervalIterator.NO_MORE_INTERVALS; + } + }; + } + + @Override + public int minExtent() { + return big.minExtent(); + } + + @Override + public Collection pullUpDisjunctions() { + return Disjunctions.pullUp(big, s -> new ContainingIntervalsSource(s, small)); + } + + @Override + public int hashCode() { + return Objects.hash(this.subSources); + } + + @Override + public boolean equals(Object other) { + if (other instanceof ContainingIntervalsSource == false) return false; + ConjunctionIntervalsSource o = (ContainingIntervalsSource) other; + return Objects.equals(this.subSources, o.subSources); + } + + @Override + public String 
toString() { + return "CONTAINING(" + big + "," + small + ")"; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DifferenceIntervalFunction.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DifferenceIntervalFunction.java deleted file mode 100644 index 8479716305f..00000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DifferenceIntervalFunction.java +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.lucene.search.intervals; - -import java.io.IOException; - -/** - * A function that takes two interval iterators and combines them to produce a third, - * generally by computing a difference interval between them - */ -abstract class DifferenceIntervalFunction { - - @Override - public abstract int hashCode(); - - @Override - public abstract boolean equals(Object obj); - - @Override - public abstract String toString(); - - /** - * Combine two interval iterators into a third - */ - public abstract IntervalIterator apply(IntervalIterator minuend, IntervalIterator subtrahend); - - /** - * Filters the minuend iterator so that only intervals that do not overlap intervals from the - * subtrahend iterator are returned - */ - static final DifferenceIntervalFunction NON_OVERLAPPING = new SingletonFunction("NON_OVERLAPPING") { - @Override - public IntervalIterator apply(IntervalIterator minuend, IntervalIterator subtrahend) { - return new NonOverlappingIterator(minuend, subtrahend); - } - }; - - /** - * Filters the minuend iterator so that only intervals that do not contain intervals from the - * subtrahend iterator are returned - */ - static final DifferenceIntervalFunction NOT_CONTAINING = new SingletonFunction("NOT_CONTAINING") { - @Override - public IntervalIterator apply(IntervalIterator minuend, IntervalIterator subtrahend) { - return new NotContainingIterator(minuend, subtrahend); - } - }; - - /** - * Filters the minuend iterator so that only intervals that are not contained by intervals from - * the subtrahend iterator are returned - */ - static final DifferenceIntervalFunction NOT_CONTAINED_BY = new SingletonFunction("NOT_CONTAINED_BY") { - @Override - public IntervalIterator apply(IntervalIterator minuend, IntervalIterator subtrahend) { - return new NotContainedByIterator(minuend, subtrahend); - } - }; - - private static abstract class RelativeIterator extends IntervalIterator { - - final IntervalIterator a; - final IntervalIterator b; - - 
boolean bpos; - - RelativeIterator(IntervalIterator a, IntervalIterator b) { - this.a = a; - this.b = b; - } - - @Override - public int docID() { - return a.docID(); - } - - @Override - public int nextDoc() throws IOException { - int doc = a.nextDoc(); - reset(); - return doc; - } - - @Override - public int advance(int target) throws IOException { - int doc = a.advance(target); - reset(); - return doc; - } - - @Override - public long cost() { - return a.cost(); - } - - protected void reset() throws IOException { - int doc = a.docID(); - bpos = b.docID() == doc || - (b.docID() < doc && b.advance(doc) == doc); - } - - @Override - public int start() { - return a.start(); - } - - @Override - public int end() { - return a.end(); - } - - @Override - public int gaps() { - return a.gaps(); - } - - @Override - public float matchCost() { - return a.matchCost() + b.matchCost(); - } - } - - private static class NonOverlappingIterator extends RelativeIterator { - - private NonOverlappingIterator(IntervalIterator minuend, IntervalIterator subtrahend) { - super(minuend, subtrahend); - } - - @Override - public int nextInterval() throws IOException { - if (bpos == false) - return a.nextInterval(); - while (a.nextInterval() != NO_MORE_INTERVALS) { - while (b.end() < a.start()) { - if (b.nextInterval() == NO_MORE_INTERVALS) { - bpos = false; - return a.start(); - } - } - if (b.start() > a.end()) - return a.start(); - } - return NO_MORE_INTERVALS; - } - } - - private static class NotContainingIterator extends RelativeIterator { - - private NotContainingIterator(IntervalIterator minuend, IntervalIterator subtrahend) { - super(minuend, subtrahend); - } - - @Override - public int nextInterval() throws IOException { - if (bpos == false) - return a.nextInterval(); - while (a.nextInterval() != NO_MORE_INTERVALS) { - while (b.start() < a.start() && b.end() < a.end()) { - if (b.nextInterval() == NO_MORE_INTERVALS) { - bpos = false; - return a.start(); - } - } - if (b.start() > a.end()) - 
return a.start(); - } - return NO_MORE_INTERVALS; - } - - } - - private static class NotContainedByIterator extends RelativeIterator { - - NotContainedByIterator(IntervalIterator a, IntervalIterator b) { - super(a, b); - } - - @Override - public int nextInterval() throws IOException { - if (bpos == false) - return a.nextInterval(); - while (a.nextInterval() != NO_MORE_INTERVALS) { - while (b.end() < a.end()) { - if (b.nextInterval() == NO_MORE_INTERVALS) - return a.start(); - } - if (a.start() < b.start()) - return a.start(); - } - return NO_MORE_INTERVALS; - } - } - - private static abstract class SingletonFunction extends DifferenceIntervalFunction { - - private final String name; - - SingletonFunction(String name) { - this.name = name; - } - - @Override - public int hashCode() { - return System.identityHashCode(this); - } - - @Override - public boolean equals(Object obj) { - return obj == this; - } - - @Override - public String toString() { - return name; - } - - } - - -} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DifferenceIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DifferenceIntervalsSource.java index f1d096c0bc4..9dca19e2e7c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DifferenceIntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DifferenceIntervalsSource.java @@ -18,38 +18,37 @@ package org.apache.lucene.search.intervals; import java.io.IOException; -import java.util.Objects; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.QueryVisitor; -class DifferenceIntervalsSource extends IntervalsSource { +abstract class DifferenceIntervalsSource extends IntervalsSource { - private final IntervalsSource minuend; - private final IntervalsSource subtrahend; - private final DifferenceIntervalFunction function; + final 
IntervalsSource minuend; + final IntervalsSource subtrahend; - DifferenceIntervalsSource(IntervalsSource minuend, IntervalsSource subtrahend, DifferenceIntervalFunction function) { + DifferenceIntervalsSource(IntervalsSource minuend, IntervalsSource subtrahend) { this.minuend = minuend; this.subtrahend = subtrahend; - this.function = function; } + protected abstract IntervalIterator combine(IntervalIterator minuend, IntervalIterator subtrahend); + @Override - public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException { + public final IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException { IntervalIterator minIt = minuend.intervals(field, ctx); if (minIt == null) return null; IntervalIterator subIt = subtrahend.intervals(field, ctx); if (subIt == null) return minIt; - return function.apply(minIt, subIt); + return combine(minIt, subIt); } @Override - public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + public final MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { MatchesIterator minIt = minuend.matches(field, ctx, doc); if (minIt == null) { return null; @@ -58,30 +57,10 @@ class DifferenceIntervalsSource extends IntervalsSource { if (subIt == null) { return minIt; } - IntervalIterator difference = function.apply(IntervalMatches.wrapMatches(minIt, doc), IntervalMatches.wrapMatches(subIt, doc)); + IntervalIterator difference = combine(IntervalMatches.wrapMatches(minIt, doc), IntervalMatches.wrapMatches(subIt, doc)); return IntervalMatches.asMatches(difference, minIt, doc); } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - DifferenceIntervalsSource that = (DifferenceIntervalsSource) o; - return Objects.equals(minuend, that.minuend) && - Objects.equals(subtrahend, that.subtrahend) && - Objects.equals(function, that.function); - } - - 
@Override - public int hashCode() { - return Objects.hash(minuend, subtrahend, function); - } - - @Override - public String toString() { - return function + "(" + minuend + ", " + subtrahend + ")"; - } - @Override public void visit(String field, QueryVisitor visitor) { IntervalQuery q = new IntervalQuery(field, this); diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java index f37dcb60d32..350a29d89a1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java @@ -19,8 +19,11 @@ package org.apache.lucene.search.intervals; import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; import java.util.List; import java.util.Objects; +import java.util.Set; import java.util.stream.Collectors; import org.apache.lucene.index.LeafReaderContext; @@ -34,10 +37,23 @@ import org.apache.lucene.util.PriorityQueue; class DisjunctionIntervalsSource extends IntervalsSource { - final List subSources; + final Collection subSources; - public DisjunctionIntervalsSource(List subSources) { - this.subSources = subSources; + public DisjunctionIntervalsSource(Collection subSources) { + this.subSources = simplify(subSources); + } + + private static Collection simplify(Collection sources) { + Set simplified = new HashSet<>(); + for (IntervalsSource source : sources) { + if (source instanceof DisjunctionIntervalsSource) { + simplified.addAll(source.pullUpDisjunctions()); + } + else { + simplified.add(source); + } + } + return simplified; } @Override @@ -95,13 +111,18 @@ class DisjunctionIntervalsSource extends IntervalsSource { @Override public int minExtent() { - int minExtent = subSources.get(0).minExtent(); - for (int i = 1; i < subSources.size(); i++) { - 
minExtent = Math.min(minExtent, subSources.get(i).minExtent()); + int minExtent = Integer.MAX_VALUE; + for (IntervalsSource subSource : subSources) { + minExtent = Math.min(minExtent, subSource.minExtent()); } return minExtent; } + @Override + public Collection pullUpDisjunctions() { + return subSources; + } + static class DisjunctionIntervalIterator extends IntervalIterator { final DocIdSetIterator approximation; diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Disjunctions.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Disjunctions.java new file mode 100644 index 00000000000..d9fbeeec419 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Disjunctions.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.search.intervals; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.function.Function; +import java.util.stream.Collectors; + +import org.apache.lucene.search.BooleanQuery; + +final class Disjunctions { + + // Given a list of sources that contain disjunctions, and a combiner function, + // pulls the disjunctions to the top of the source tree + + // eg FUNC(a, b, OR(c, "d e")) => [FUNC(a, b, c), FUNC(a, b, "d e")] + + public static List pullUp(List sources, + Function, IntervalsSource> function) { + + List> rewritten = new ArrayList<>(); + rewritten.add(new ArrayList<>()); + for (IntervalsSource source : sources) { + List disjuncts = splitDisjunctions(source); + if (disjuncts.size() == 1) { + rewritten.forEach(l -> l.add(disjuncts.get(0))); + } + else { + if (rewritten.size() * disjuncts.size() > BooleanQuery.getMaxClauseCount()) { + throw new IllegalArgumentException("Too many disjunctions to expand"); + } + List> toAdd = new ArrayList<>(); + for (IntervalsSource disj : disjuncts) { + // clone the rewritten list, then append the disjunct + for (List subList : rewritten) { + List l = new ArrayList<>(subList); + l.add(disj); + toAdd.add(l); + } + } + rewritten = toAdd; + } + } + if (rewritten.size() == 1) { + return Collections.singletonList(function.apply(rewritten.get(0))); + } + return rewritten.stream().map(function).collect(Collectors.toList()); + } + + // Given a source containing disjunctions, and a mapping function, + // pulls the disjunctions to the top of the source tree + public static List pullUp(IntervalsSource source, Function function) { + List disjuncts = splitDisjunctions(source); + if (disjuncts.size() == 1) { + return Collections.singletonList(function.apply(disjuncts.get(0))); + } + return disjuncts.stream().map(function).collect(Collectors.toList()); + } + + // Separate out disjunctions into individual sources + // Clauses that have a minExtent of 1 are 
grouped together and treated as a single + // source, as any overlapping intervals of length 1 can be treated as identical, + // and we know that all combinatorial sources have a minExtent > 1 + private static List splitDisjunctions(IntervalsSource source) { + List singletons = new ArrayList<>(); + List nonSingletons = new ArrayList<>(); + for (IntervalsSource disj : source.pullUpDisjunctions()) { + if (disj.minExtent() == 1) { + singletons.add(disj); + } + else { + nonSingletons.add(disj); + } + } + List split = new ArrayList<>(); + if (singletons.size() > 0) { + split.add(Intervals.or(singletons.toArray(new IntervalsSource[0]))); + } + split.addAll(nonSingletons); + return split; + } + +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ExtendedIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ExtendedIntervalsSource.java index 41a8829b947..0ea60d6b6ca 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ExtendedIntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/ExtendedIntervalsSource.java @@ -18,7 +18,10 @@ package org.apache.lucene.search.intervals; import java.io.IOException; +import java.util.Collection; +import java.util.Collections; import java.util.Objects; +import java.util.stream.Collectors; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.MatchesIterator; @@ -69,6 +72,15 @@ class ExtendedIntervalsSource extends IntervalsSource { return minExtent; } + @Override + public Collection pullUpDisjunctions() { + Collection inner = source.pullUpDisjunctions(); + if (inner.size() == 0) { + return Collections.singleton(this); + } + return inner.stream().map(s -> new ExtendedIntervalsSource(s, before, after)).collect(Collectors.toSet()); + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/FilteredIntervalsSource.java 
b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/FilteredIntervalsSource.java index b3341297c94..0bcd8f4820f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/FilteredIntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/FilteredIntervalsSource.java @@ -18,7 +18,10 @@ package org.apache.lucene.search.intervals; import java.io.IOException; +import java.util.Collection; +import java.util.Collections; import java.util.Objects; +import java.util.stream.Collectors; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.MatchesIterator; @@ -29,15 +32,58 @@ import org.apache.lucene.search.QueryVisitor; */ public abstract class FilteredIntervalsSource extends IntervalsSource { + public static IntervalsSource maxGaps(IntervalsSource in, int maxGaps) { + return Intervals.or(in.pullUpDisjunctions().stream().map(s -> new MaxGaps(s, maxGaps)).collect(Collectors.toList())); + } + + private static class MaxGaps extends FilteredIntervalsSource { + + private final int maxGaps; + + MaxGaps(IntervalsSource in, int maxGaps) { + super("MAXGAPS/" + maxGaps, in); + this.maxGaps = maxGaps; + } + + @Override + protected boolean accept(IntervalIterator it) { + return it.gaps() <= maxGaps; + } + } + + public static IntervalsSource maxWidth(IntervalsSource in, int maxWidth) { + return new MaxWidth(in, maxWidth); + } + + private static class MaxWidth extends FilteredIntervalsSource { + + private final int maxWidth; + + MaxWidth(IntervalsSource in, int maxWidth) { + super("MAXWIDTH/" + maxWidth, in); + this.maxWidth = maxWidth; + } + + @Override + protected boolean accept(IntervalIterator it) { + return (it.end() - it.start()) + 1 <= maxWidth; + } + + @Override + public Collection pullUpDisjunctions() { + return Disjunctions.pullUp(in, s -> new MaxWidth(s, maxWidth)); + } + } + private final String name; - private final IntervalsSource in; + protected final IntervalsSource in; /** * Create a new 
FilteredIntervalsSource * @param name the name of the filter * @param in the source to filter */ - public FilteredIntervalsSource(String name, IntervalsSource in) { + private FilteredIntervalsSource(String name, IntervalsSource in) { this.name = name; this.in = in; } @@ -81,6 +127,11 @@ public abstract class FilteredIntervalsSource extends IntervalsSource { return in.minExtent(); } + @Override + public Collection pullUpDisjunctions() { + return Collections.singletonList(this); + } + @Override public void visit(String field, QueryVisitor visitor) { in.visit(field, visitor); @@ -89,7 +140,7 @@ public abstract class FilteredIntervalsSource extends IntervalsSource { @Override public boolean equals(Object o) { if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; + if (o == null || o instanceof FilteredIntervalsSource == false) return false; FilteredIntervalsSource that = (FilteredIntervalsSource) o; return Objects.equals(name, that.name) && Objects.equals(in, that.in); diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/FilteringIntervalIterator.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/FilteringIntervalIterator.java new file mode 100644 index 00000000000..b552848179a --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/FilteringIntervalIterator.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.intervals; + +import java.io.IOException; +import java.util.Arrays; + +abstract class FilteringIntervalIterator extends ConjunctionIntervalIterator { + + final IntervalIterator a; + final IntervalIterator b; + + boolean bpos; + + protected FilteringIntervalIterator(IntervalIterator a, IntervalIterator b) { + super(Arrays.asList(a, b)); + this.a = a; + this.b = b; + } + + @Override + public int start() { + if (bpos == false) { + return NO_MORE_INTERVALS; + } + return a.start(); + } + + @Override + public int end() { + if (bpos == false) { + return NO_MORE_INTERVALS; + } + return a.end(); + } + + @Override + public int gaps() { + return a.gaps(); + } + + @Override + protected void reset() throws IOException { + bpos = b.nextInterval() != NO_MORE_INTERVALS; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/FixedFieldIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/FixedFieldIntervalsSource.java index ab24ee359e0..f9e0c51944c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/FixedFieldIntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/FixedFieldIntervalsSource.java @@ -18,7 +18,10 @@ package org.apache.lucene.search.intervals; import java.io.IOException; +import java.util.Collection; +import java.util.Collections; import java.util.Objects; +import java.util.stream.Collectors; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.MatchesIterator; @@ -54,6 +57,15 @@ class 
FixedFieldIntervalsSource extends IntervalsSource { return source.minExtent(); } + @Override + public Collection pullUpDisjunctions() { + Collection inner = source.pullUpDisjunctions(); + if (inner.size() == 1) { + return Collections.singleton(this); + } + return inner.stream().map(s -> new FixedFieldIntervalsSource(field, s)).collect(Collectors.toSet()); + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFilter.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFilter.java index 46c030217e9..dcb59bccc12 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFilter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFilter.java @@ -85,7 +85,7 @@ public abstract class IntervalFilter extends IntervalIterator { do { next = in.nextInterval(); } - while (accept() == false && next != IntervalIterator.NO_MORE_INTERVALS); + while (next != IntervalIterator.NO_MORE_INTERVALS && accept() == false); return next; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFunction.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFunction.java deleted file mode 100644 index 862e9a86a56..00000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalFunction.java +++ /dev/null @@ -1,501 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.search.intervals; - -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import org.apache.lucene.util.PriorityQueue; - -/** - * Combine a list of {@link IntervalIterator}s into another - */ -abstract class IntervalFunction { - - @Override - public abstract int hashCode(); - - @Override - public abstract boolean equals(Object obj); - - @Override - public abstract String toString(); - - /** - * Combine the iterators into another iterator - */ - public abstract IntervalIterator apply(List iterators); - - static final IntervalFunction BLOCK = new SingletonFunction("BLOCK") { - @Override - public IntervalIterator apply(List iterators) { - return new BlockIntervalIterator(iterators); - } - }; - - private static class BlockIntervalIterator extends ConjunctionIntervalIterator { - - int start = -1, end = -1; - - BlockIntervalIterator(List subIterators) { - super(subIterators); - } - - @Override - public int start() { - return start; - } - - @Override - public int end() { - return end; - } - - @Override - public int gaps() { - return 0; - } - - @Override - public int nextInterval() throws IOException { - if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) - return start = end = IntervalIterator.NO_MORE_INTERVALS; - int i = 1; - while (i < subIterators.size()) { - while (subIterators.get(i).start() <= subIterators.get(i - 1).end()) { - if (subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) - return start = end = IntervalIterator.NO_MORE_INTERVALS; - } 
- if (subIterators.get(i).start() == subIterators.get(i - 1).end() + 1) { - i = i + 1; - } - else { - if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) - return start = end = IntervalIterator.NO_MORE_INTERVALS; - i = 1; - } - } - start = subIterators.get(0).start(); - end = subIterators.get(subIterators.size() - 1).end(); - return start; - } - - @Override - protected void reset() { - start = end = -1; - } - } - - /** - * Return an iterator over intervals where the subiterators appear in a given order - */ - static final IntervalFunction ORDERED = new SingletonFunction("ORDERED") { - @Override - public IntervalIterator apply(List intervalIterators) { - return new OrderedIntervalIterator(intervalIterators); - } - }; - - private static class OrderedIntervalIterator extends ConjunctionIntervalIterator { - - int start = -1, end = -1, i; - int firstEnd; - - private OrderedIntervalIterator(List subIntervals) { - super(subIntervals); - } - - @Override - public int start() { - return start; - } - - @Override - public int end() { - return end; - } - - @Override - public int nextInterval() throws IOException { - start = end = IntervalIterator.NO_MORE_INTERVALS; - int b = Integer.MAX_VALUE; - i = 1; - while (true) { - while (true) { - if (subIterators.get(i - 1).end() >= b) - return start; - if (i == subIterators.size() || subIterators.get(i).start() > subIterators.get(i - 1).end()) - break; - do { - if (subIterators.get(i).end() >= b || subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) - return start; - } - while (subIterators.get(i).start() <= subIterators.get(i - 1).end()); - i++; - } - start = subIterators.get(0).start(); - if (start == NO_MORE_INTERVALS) { - return end = NO_MORE_INTERVALS; - } - firstEnd = subIterators.get(0).end(); - end = subIterators.get(subIterators.size() - 1).end(); - b = subIterators.get(subIterators.size() - 1).start(); - i = 1; - if (subIterators.get(0).nextInterval() == 
IntervalIterator.NO_MORE_INTERVALS) - return start; - } - } - - @Override - public int gaps() { - int gaps = subIterators.get(1).start() - firstEnd - 1; - for (int i = 2; i < subIterators.size(); i++) { - gaps += (subIterators.get(i).start() - subIterators.get(i - 1).end() - 1); - } - return gaps; - } - - @Override - protected void reset() throws IOException { - subIterators.get(0).nextInterval(); - i = 1; - start = end = firstEnd = -1; - } - } - - /** - * Return an iterator over intervals where the subiterators appear in any order - */ - static final IntervalFunction UNORDERED = new SingletonFunction("UNORDERED") { - @Override - public IntervalIterator apply(List intervalIterators) { - return new UnorderedIntervalIterator(intervalIterators, true); - } - }; - - /** - * Return an iterator over intervals where the subiterators appear in any order, and do not overlap - */ - static final IntervalFunction UNORDERED_NO_OVERLAP = new SingletonFunction("UNORDERED_NO_OVERLAP") { - @Override - public IntervalIterator apply(List iterators) { - return new UnorderedIntervalIterator(iterators, false); - } - }; - - private static class UnorderedIntervalIterator extends ConjunctionIntervalIterator { - - private final PriorityQueue queue; - private final IntervalIterator[] subIterators; - private final int[] innerPositions; - private final boolean allowOverlaps; - - int start = -1, end = -1, firstEnd, queueEnd; - - UnorderedIntervalIterator(List subIterators, boolean allowOverlaps) { - super(subIterators); - this.queue = new PriorityQueue(subIterators.size()) { - @Override - protected boolean lessThan(IntervalIterator a, IntervalIterator b) { - return a.start() < b.start() || (a.start() == b.start() && a.end() >= b.end()); - } - }; - this.subIterators = new IntervalIterator[subIterators.size()]; - this.innerPositions = new int[subIterators.size() * 2]; - this.allowOverlaps = allowOverlaps; - - for (int i = 0; i < subIterators.size(); i++) { - this.subIterators[i] = 
subIterators.get(i); - } - } - - @Override - public int start() { - return start; - } - - @Override - public int end() { - return end; - } - - void updateRightExtreme(IntervalIterator it) { - int itEnd = it.end(); - if (itEnd > queueEnd) { - queueEnd = itEnd; - } - } - - @Override - public int nextInterval() throws IOException { - // first, find a matching interval - while (this.queue.size() == subIterators.length && queue.top().start() == start) { - IntervalIterator it = queue.pop(); - if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { - if (allowOverlaps == false) { - while (hasOverlaps(it)) { - if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) - return start = end = IntervalIterator.NO_MORE_INTERVALS; - } - } - queue.add(it); - updateRightExtreme(it); - } - } - if (this.queue.size() < subIterators.length) - return start = end = IntervalIterator.NO_MORE_INTERVALS; - // then, minimize it - do { - start = queue.top().start(); - firstEnd = queue.top().end(); - end = queueEnd; - if (queue.top().end() == end) - return start; - IntervalIterator it = queue.pop(); - if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { - if (allowOverlaps == false) { - while (hasOverlaps(it)) { - if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { - return start; - } - } - } - queue.add(it); - updateRightExtreme(it); - } - } while (this.queue.size() == subIterators.length && end == queueEnd); - return start; - } - - @Override - public int gaps() { - for (int i = 0; i < subIterators.length; i++) { - if (subIterators[i].end() > end) { - innerPositions[i * 2] = start; - innerPositions[i * 2 + 1] = firstEnd; - } - else { - innerPositions[i * 2] = subIterators[i].start(); - innerPositions[i * 2 + 1] = subIterators[i].end(); - } - } - Arrays.sort(innerPositions); - int gaps = 0; - for (int i = 1; i < subIterators.length; i++) { - gaps += (innerPositions[i * 2] - innerPositions[i * 2 - 1] - 1); - } - return gaps; - } - - 
@Override - protected void reset() throws IOException { - queueEnd = start = end = -1; - this.queue.clear(); - loop: for (IntervalIterator it : subIterators) { - if (it.nextInterval() == NO_MORE_INTERVALS) { - break; - } - if (allowOverlaps == false) { - while (hasOverlaps(it)) { - if (it.nextInterval() == NO_MORE_INTERVALS) { - break loop; - } - } - } - queue.add(it); - updateRightExtreme(it); - } - } - - private boolean hasOverlaps(IntervalIterator candidate) { - for (IntervalIterator it : queue) { - if (it.start() < candidate.start()) { - if (it.end() >= candidate.start()) { - return true; - } - continue; - } - if (it.start() == candidate.start()) { - return true; - } - if (it.start() <= candidate.end()) { - return true; - } - } - return false; - } - - } - - /** - * Returns an interval over iterators where the first iterator contains intervals from the second - */ - static final IntervalFunction CONTAINING = new SingletonFunction("CONTAINING") { - @Override - public IntervalIterator apply(List iterators) { - if (iterators.size() != 2) - throw new IllegalStateException("CONTAINING function requires two iterators"); - IntervalIterator a = iterators.get(0); - IntervalIterator b = iterators.get(1); - return new FilteringIntervalIterator(a, b) { - @Override - public int nextInterval() throws IOException { - if (bpos == false) - return IntervalIterator.NO_MORE_INTERVALS; - while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { - while (b.start() < a.start() && b.end() < a.end()) { - if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) - return IntervalIterator.NO_MORE_INTERVALS; - } - if (a.start() <= b.start() && a.end() >= b.end()) - return a.start(); - } - return IntervalIterator.NO_MORE_INTERVALS; - } - }; - } - }; - - /** - * Return an iterator over intervals where the first iterator is contained by intervals from the second - */ - static final IntervalFunction CONTAINED_BY = new SingletonFunction("CONTAINED_BY") { - @Override - public 
IntervalIterator apply(List iterators) { - if (iterators.size() != 2) - throw new IllegalStateException("CONTAINED_BY function requires two iterators"); - IntervalIterator a = iterators.get(0); - IntervalIterator b = iterators.get(1); - return new FilteringIntervalIterator(a, b) { - @Override - public int nextInterval() throws IOException { - if (bpos == false) - return IntervalIterator.NO_MORE_INTERVALS; - while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { - while (b.end() < a.end()) { - if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { - bpos = false; - return IntervalIterator.NO_MORE_INTERVALS; - } - } - if (b.start() <= a.start()) - return a.start(); - } - bpos = false; - return IntervalIterator.NO_MORE_INTERVALS; - } - }; - } - }; - - static final IntervalFunction OVERLAPPING = new SingletonFunction("OVERLAPPING") { - @Override - public IntervalIterator apply(List iterators) { - if (iterators.size() != 2) - throw new IllegalStateException("OVERLAPPING function requires two iterators"); - IntervalIterator a = iterators.get(0); - IntervalIterator b = iterators.get(1); - return new FilteringIntervalIterator(a, b) { - @Override - public int nextInterval() throws IOException { - if (bpos == false) - return IntervalIterator.NO_MORE_INTERVALS; - while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { - while (b.end() < a.start()) { - if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { - bpos = false; - return IntervalIterator.NO_MORE_INTERVALS; - } - } - if (b.start() <= a.end()) - return a.start(); - } - bpos = false; - return IntervalIterator.NO_MORE_INTERVALS; - } - }; - } - }; - - private static abstract class FilteringIntervalIterator extends ConjunctionIntervalIterator { - - final IntervalIterator a; - final IntervalIterator b; - - boolean bpos; - - protected FilteringIntervalIterator(IntervalIterator a, IntervalIterator b) { - super(Arrays.asList(a, b)); - this.a = a; - this.b = b; - } - - @Override - public int 
start() { - if (bpos == false) { - return NO_MORE_INTERVALS; - } - return a.start(); - } - - @Override - public int end() { - if (bpos == false) { - return NO_MORE_INTERVALS; - } - return a.end(); - } - - @Override - public int gaps() { - return a.gaps(); - } - - @Override - protected void reset() throws IOException { - bpos = b.nextInterval() != NO_MORE_INTERVALS; - } - } - - private static abstract class SingletonFunction extends IntervalFunction { - - private final String name; - - protected SingletonFunction(String name) { - this.name = name; - } - - @Override - public int hashCode() { - return System.identityHashCode(this); - } - - @Override - public boolean equals(Object obj) { - return obj == this; - } - - @Override - public String toString() { - return name; - } - - } - -} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java index cdb6105172a..591fbb178aa 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/Intervals.java @@ -18,6 +18,7 @@ package org.apache.lucene.search.intervals; import java.util.Arrays; +import java.util.List; import java.util.function.Predicate; import org.apache.lucene.index.Term; @@ -32,6 +33,15 @@ import org.apache.lucene.util.automaton.CompiledAutomaton; * These sources implement minimum-interval algorithms taken from the paper * * Efficient Optimally Lazy Algorithms for Minimal-Interval Semantics + * + * By default, sources that are sensitive to internal gaps (e.g. PHRASE and MAXGAPS) will + * rewrite their sub-sources so that disjunctions of different lengths are pulled up + * to the top of the interval tree. For example, PHRASE(or(PHRASE("a", "b", "c"), "b"), "c") + * will automatically rewrite itself to OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c")) + * to ensure that documents containing "b c" are matched. 
This can lead to less efficient + * queries, as more terms need to be loaded (for example, the "c" iterator above is loaded + * twice), so if you care more about speed than about accuracy you can use the + * {@link #or(boolean, IntervalsSource...)} factory method to prevent rewriting. */ public final class Intervals { @@ -87,19 +97,54 @@ public final class Intervals { * Return an {@link IntervalsSource} exposing intervals for a phrase consisting of a list of IntervalsSources */ public static IntervalsSource phrase(IntervalsSource... subSources) { - if (subSources.length == 1) { - return subSources[0]; - } - return new ConjunctionIntervalsSource(Arrays.asList(subSources), IntervalFunction.BLOCK); + return BlockIntervalsSource.build(Arrays.asList(subSources)); + } + + /** + * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources + * + * Automatically rewrites if wrapped by an interval source that is sensitive to + * internal gaps + */ + public static IntervalsSource or(IntervalsSource... subSources) { + return or(true, Arrays.asList(subSources)); + } + + /** + * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources + * + * @param rewrite if {@code false}, do not rewrite intervals that are sensitive to + * internal gaps; this may run more efficiently, but can miss valid + * hits due to minimization + * @param subSources the sources to combine + */ + public static IntervalsSource or(boolean rewrite, IntervalsSource... subSources) { + return or(rewrite, Arrays.asList(subSources)); } /** * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources */ - public static IntervalsSource or(IntervalsSource... 
subSources) { - if (subSources.length == 1) - return subSources[0]; - return new DisjunctionIntervalsSource(Arrays.asList(subSources)); + public static IntervalsSource or(List subSources) { + return or(true, subSources); + } + + /** + * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources + * + * @param rewrite if {@code false}, do not rewrite intervals that are sensitive to + * internal gaps; this may run more efficiently, but can miss valid + * hits due to minimization + * @param subSources the sources to combine + */ + public static IntervalsSource or(boolean rewrite, List subSources) { + if (subSources.size() == 1) { + return subSources.get(0); + } + if (rewrite) { + return new DisjunctionIntervalsSource(subSources); + } + return new NoRewriteDisjunctionIntervalsSource(subSources); } /** @@ -130,12 +175,7 @@ public final class Intervals { * @param subSource the sub-source to filter */ public static IntervalsSource maxwidth(int width, IntervalsSource subSource) { - return new FilteredIntervalsSource("MAXWIDTH/" + width, subSource) { - @Override - protected boolean accept(IntervalIterator it) { - return (it.end() - it.start()) + 1 <= width; - } - }; + return FilteredIntervalsSource.maxWidth(subSource, width); } /** @@ -144,12 +184,7 @@ public final class Intervals { * @param subSource the sub-source to filter */ public static IntervalsSource maxgaps(int gaps, IntervalsSource subSource) { - return new FilteredIntervalsSource("MAXGAPS/" + gaps, subSource) { - @Override - protected boolean accept(IntervalIterator it) { - return it.gaps() <= gaps; - } - }; + return FilteredIntervalsSource.maxGaps(subSource, gaps); } /** @@ -181,10 +216,7 @@ public final class Intervals { * @param subSources an ordered set of {@link IntervalsSource} objects */ public static IntervalsSource ordered(IntervalsSource... 
subSources) { - if (subSources.length == 1) { - return subSources[0]; - } - return new MinimizingConjunctionIntervalsSource(Arrays.asList(subSources), IntervalFunction.ORDERED); + return OrderedIntervalsSource.build(Arrays.asList(subSources)); } /** @@ -207,11 +239,7 @@ public final class Intervals { * @param allowOverlaps whether or not the sources should be allowed to overlap in a hit */ public static IntervalsSource unordered(boolean allowOverlaps, IntervalsSource... subSources) { - if (subSources.length == 1) { - return subSources[0]; - } - return new MinimizingConjunctionIntervalsSource(Arrays.asList(subSources), - allowOverlaps ? IntervalFunction.UNORDERED : IntervalFunction.UNORDERED_NO_OVERLAP); + return UnorderedIntervalsSource.build(Arrays.asList(subSources), allowOverlaps); } /** @@ -234,7 +262,7 @@ public final class Intervals { * @param subtrahend the {@link IntervalsSource} to filter by */ public static IntervalsSource nonOverlapping(IntervalsSource minuend, IntervalsSource subtrahend) { - return new DifferenceIntervalsSource(minuend, subtrahend, DifferenceIntervalFunction.NON_OVERLAPPING); + return new NonOverlappingIntervalsSource(minuend, subtrahend); } /** @@ -243,7 +271,7 @@ public final class Intervals { * @param reference the source to filter by */ public static IntervalsSource overlapping(IntervalsSource source, IntervalsSource reference) { - return new FilteringConjunctionIntervalsSource(source, reference, IntervalFunction.OVERLAPPING); + return new OverlappingIntervalsSource(source, reference); } /** @@ -258,8 +286,7 @@ public final class Intervals { * @param subtrahend the {@link IntervalsSource} to filter by */ public static IntervalsSource notWithin(IntervalsSource minuend, int positions, IntervalsSource subtrahend) { - return new DifferenceIntervalsSource(minuend, Intervals.extend(subtrahend, positions, positions), - DifferenceIntervalFunction.NON_OVERLAPPING); + return new NonOverlappingIntervalsSource(minuend, 
Intervals.extend(subtrahend, positions, positions)); } /** @@ -283,7 +310,7 @@ public final class Intervals { * @param subtrahend the {@link IntervalsSource} to filter by */ public static IntervalsSource notContaining(IntervalsSource minuend, IntervalsSource subtrahend) { - return new DifferenceIntervalsSource(minuend, subtrahend, DifferenceIntervalFunction.NOT_CONTAINING); + return NotContainingIntervalsSource.build(minuend, subtrahend); } /** @@ -296,7 +323,7 @@ public final class Intervals { * @param small the {@link IntervalsSource} to filter by */ public static IntervalsSource containing(IntervalsSource big, IntervalsSource small) { - return new FilteringConjunctionIntervalsSource(big, small, IntervalFunction.CONTAINING); + return ContainingIntervalsSource.build(big, small); } /** @@ -309,7 +336,7 @@ public final class Intervals { * @param big the {@link IntervalsSource} to filter by */ public static IntervalsSource notContainedBy(IntervalsSource small, IntervalsSource big) { - return new DifferenceIntervalsSource(small, big, DifferenceIntervalFunction.NOT_CONTAINED_BY); + return NotContainedByIntervalsSource.build(small, big); } /** @@ -321,7 +348,7 @@ public final class Intervals { * @param big the {@link IntervalsSource} to filter by */ public static IntervalsSource containedBy(IntervalsSource small, IntervalsSource big) { - return new FilteringConjunctionIntervalsSource(small, big, IntervalFunction.CONTAINED_BY); + return ContainedByIntervalsSource.build(small, big); } /** @@ -335,18 +362,16 @@ public final class Intervals { * Returns intervals from the source that appear before intervals from the reference */ public static IntervalsSource before(IntervalsSource source, IntervalsSource reference) { - return new FilteringConjunctionIntervalsSource(source, - Intervals.extend(new OffsetIntervalsSource(reference, true), Integer.MAX_VALUE, 0), - IntervalFunction.CONTAINED_BY); + return ContainedByIntervalsSource.build(source, + Intervals.extend(new 
OffsetIntervalsSource(reference, true), Integer.MAX_VALUE, 0)); } /** * Returns intervals from the source that appear after intervals from the reference */ public static IntervalsSource after(IntervalsSource source, IntervalsSource reference) { - return new FilteringConjunctionIntervalsSource(source, - Intervals.extend(new OffsetIntervalsSource(reference, false), 0, Integer.MAX_VALUE), - IntervalFunction.CONTAINED_BY); + return ContainedByIntervalsSource.build(source, + Intervals.extend(new OffsetIntervalsSource(reference, false), 0, Integer.MAX_VALUE)); } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalsSource.java index 1c49d248fa2..caf8fae2a99 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/IntervalsSource.java @@ -18,6 +18,7 @@ package org.apache.lucene.search.intervals; import java.io.IOException; +import java.util.Collection; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.MatchesIterator; @@ -64,6 +65,13 @@ public abstract class IntervalsSource { */ public abstract int minExtent(); + /** + * Expert: return the set of disjunctions that make up this IntervalsSource + * + * Most implementations can return {@code Collections.singleton(this)} + */ + public abstract Collection pullUpDisjunctions(); + @Override public abstract int hashCode(); diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MinimizingConjunctionIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MinimizingConjunctionIntervalsSource.java deleted file mode 100644 index 6e6f563b3dd..00000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MinimizingConjunctionIntervalsSource.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under 
one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.search.intervals; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.stream.Collectors; - -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.MatchesIterator; -import org.apache.lucene.search.MatchesUtils; -import org.apache.lucene.search.Query; - -/** - * A ConjunctionIntervalsSource that attempts to minimize its internal intervals by - * eagerly advancing its first subinterval - * - * Uses caching to expose matches after its first subinterval has been moved on - */ -class MinimizingConjunctionIntervalsSource extends ConjunctionIntervalsSource { - - MinimizingConjunctionIntervalsSource(List subSources, IntervalFunction function) { - super(subSources, function); - } - - @Override - public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { - List subs = new ArrayList<>(); - for (IntervalsSource source : subSources) { - MatchesIterator mi = source.matches(field, ctx, doc); - if (mi == null) { - return null; - } - subs.add(new CachingMatchesIterator(mi)); - } - IntervalIterator it = function.apply(subs.stream().map(m -> IntervalMatches.wrapMatches(m, 
doc)).collect(Collectors.toList())); - if (it.advance(doc) != doc) { - return null; - } - if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { - return null; - } - return new ConjunctionMatchesIterator(it, subs); - } - - private static class ConjunctionMatchesIterator implements IntervalMatchesIterator { - - final IntervalIterator iterator; - final List subs; - boolean cached = true; - - private ConjunctionMatchesIterator(IntervalIterator iterator, List subs) { - this.iterator = iterator; - this.subs = subs; - } - - @Override - public boolean next() throws IOException { - if (cached) { - cached = false; - return true; - } - return iterator.nextInterval() != IntervalIterator.NO_MORE_INTERVALS; - } - - @Override - public int startPosition() { - return iterator.start(); - } - - @Override - public int endPosition() { - return iterator.end(); - } - - @Override - public int startOffset() throws IOException { - int start = Integer.MAX_VALUE; - int endPos = endPosition(); - for (CachingMatchesIterator s : subs) { - start = Math.min(start, s.startOffset(endPos)); - } - return start; - } - - @Override - public int endOffset() throws IOException { - int end = 0; - int endPos = endPosition(); - for (CachingMatchesIterator s : subs) { - end = Math.max(end, s.endOffset(endPos)); - } - return end; - } - - @Override - public int gaps() { - return iterator.gaps(); - } - - @Override - public MatchesIterator getSubMatches() throws IOException { - List mis = new ArrayList<>(); - int endPos = endPosition(); - for (CachingMatchesIterator s : subs) { - mis.add(s.getSubMatches(endPos)); - } - return MatchesUtils.disjunction(mis); - } - - @Override - public Query getQuery() { - return null; - } - } - -} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MinimizingConjunctionMatchesIterator.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MinimizingConjunctionMatchesIterator.java new file mode 100644 index 00000000000..2460be8d659 --- 
/dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MinimizingConjunctionMatchesIterator.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.intervals; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.MatchesUtils; +import org.apache.lucene.search.Query; + +class MinimizingConjunctionMatchesIterator implements IntervalMatchesIterator { + + final IntervalIterator iterator; + private final List subs = new ArrayList<>(); + private boolean cached = true; + + MinimizingConjunctionMatchesIterator(IntervalIterator iterator, List subs) { + this.iterator = iterator; + for (MatchesIterator mi : subs) { + assert mi instanceof CachingMatchesIterator; + this.subs.add((CachingMatchesIterator)mi); + } + } + + @Override + public boolean next() throws IOException { + if (cached) { + cached = false; + return true; + } + return iterator.nextInterval() != IntervalIterator.NO_MORE_INTERVALS; + } + + @Override + public int startPosition() { + return iterator.start(); + } + + @Override + public int endPosition() { + return iterator.end(); + } + 
+ @Override + public int startOffset() throws IOException { + int start = Integer.MAX_VALUE; + int endPos = endPosition(); + for (CachingMatchesIterator s : subs) { + start = Math.min(start, s.startOffset(endPos)); + } + return start; + } + + @Override + public int endOffset() throws IOException { + int end = 0; + int endPos = endPosition(); + for (CachingMatchesIterator s : subs) { + end = Math.max(end, s.endOffset(endPos)); + } + return end; + } + + @Override + public int gaps() { + return iterator.gaps(); + } + + @Override + public MatchesIterator getSubMatches() throws IOException { + List mis = new ArrayList<>(); + int endPos = endPosition(); + for (CachingMatchesIterator s : subs) { + mis.add(s.getSubMatches(endPos)); + } + return MatchesUtils.disjunction(mis); + } + + @Override + public Query getQuery() { + return null; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MinimumShouldMatchIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MinimumShouldMatchIntervalsSource.java index a06c47839b5..6f5a28d06f8 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MinimumShouldMatchIntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MinimumShouldMatchIntervalsSource.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.IdentityHashMap; import java.util.List; import java.util.Map; @@ -107,6 +108,11 @@ class MinimumShouldMatchIntervalsSource extends IntervalsSource { return minExtent; } + @Override + public Collection pullUpDisjunctions() { + return Collections.singleton(this); + } + @Override public String toString() { return "AtLeast(" diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MultiTermIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MultiTermIntervalsSource.java 
index 1ee52d4c3a0..4b1d23328f3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MultiTermIntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/MultiTermIntervalsSource.java @@ -19,6 +19,8 @@ package org.apache.lucene.search.intervals; import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; import java.util.List; import java.util.Objects; @@ -97,6 +99,11 @@ class MultiTermIntervalsSource extends IntervalsSource { return 1; } + @Override + public Collection pullUpDisjunctions() { + return Collections.singleton(this); + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/FilteringConjunctionIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/NoRewriteDisjunctionIntervalsSource.java similarity index 62% rename from lucene/sandbox/src/java/org/apache/lucene/search/intervals/FilteringConjunctionIntervalsSource.java rename to lucene/sandbox/src/java/org/apache/lucene/search/intervals/NoRewriteDisjunctionIntervalsSource.java index cc029821e27..e273f19913c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/FilteringConjunctionIntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/NoRewriteDisjunctionIntervalsSource.java @@ -17,23 +17,17 @@ package org.apache.lucene.search.intervals; -import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; -/** - * An intervals source that combines two other sources, requiring both of them to - * be present in order to match, but using the minExtent of one of them - */ -class FilteringConjunctionIntervalsSource extends ConjunctionIntervalsSource { +class NoRewriteDisjunctionIntervalsSource extends DisjunctionIntervalsSource { - private final IntervalsSource source; - - FilteringConjunctionIntervalsSource(IntervalsSource 
source, IntervalsSource filter, IntervalFunction function) { - super(Arrays.asList(source, filter), function); - this.source = source; + public NoRewriteDisjunctionIntervalsSource(Collection subSources) { + super(subSources); } @Override - public int minExtent() { - return source.minExtent(); + public Collection pullUpDisjunctions() { + return Collections.singletonList(this); } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/NonOverlappingIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/NonOverlappingIntervalsSource.java new file mode 100644 index 00000000000..43585321d7e --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/NonOverlappingIntervalsSource.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.search.intervals; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.Objects; + +class NonOverlappingIntervalsSource extends DifferenceIntervalsSource { + + NonOverlappingIntervalsSource(IntervalsSource minuend, IntervalsSource subtrahend) { + super(minuend, subtrahend); + } + + @Override + protected IntervalIterator combine(IntervalIterator minuend, IntervalIterator subtrahend) { + return new NonOverlappingIterator(minuend, subtrahend); + } + + @Override + public Collection pullUpDisjunctions() { + return Collections.singletonList(this); + } + + @Override + public int hashCode() { + return Objects.hash(minuend, subtrahend); + } + + @Override + public boolean equals(Object other) { + if (other instanceof NonOverlappingIntervalsSource == false) return false; + NonOverlappingIntervalsSource o = (NonOverlappingIntervalsSource) other; + return Objects.equals(this.minuend, o.minuend) && Objects.equals(this.subtrahend, o.subtrahend); + } + + @Override + public String toString() { + return "NON_OVERLAPPING(" + minuend + "," + subtrahend + ")"; + } + + private static class NonOverlappingIterator extends RelativeIterator { + + private NonOverlappingIterator(IntervalIterator minuend, IntervalIterator subtrahend) { + super(minuend, subtrahend); + } + + @Override + public int nextInterval() throws IOException { + if (bpos == false) + return a.nextInterval(); + while (a.nextInterval() != NO_MORE_INTERVALS) { + while (b.end() < a.start()) { + if (b.nextInterval() == NO_MORE_INTERVALS) { + bpos = false; + return a.start(); + } + } + if (b.start() > a.end()) + return a.start(); + } + return NO_MORE_INTERVALS; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/NotContainedByIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/NotContainedByIntervalsSource.java new file mode 100644 index 00000000000..cb4beef2aa7 --- /dev/null +++ 
b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/NotContainedByIntervalsSource.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.intervals; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.Objects; + +class NotContainedByIntervalsSource extends DifferenceIntervalsSource { + + static IntervalsSource build(IntervalsSource minuend, IntervalsSource subtrahend) { + return Intervals.or(Disjunctions.pullUp(subtrahend, s -> new NotContainedByIntervalsSource(minuend, s))); + } + + private NotContainedByIntervalsSource(IntervalsSource minuend, IntervalsSource subtrahend) { + super(minuend, subtrahend); + } + + @Override + protected IntervalIterator combine(IntervalIterator minuend, IntervalIterator subtrahend) { + return new NotContainedByIterator(minuend, subtrahend); + } + + @Override + public Collection pullUpDisjunctions() { + return Collections.singletonList(this); + } + + @Override + public int hashCode() { + return Objects.hash(minuend, subtrahend); + } + + @Override + public boolean equals(Object other) { + if (other instanceof NotContainedByIntervalsSource == false) return false; + 
NotContainedByIntervalsSource o = (NotContainedByIntervalsSource) other; + return Objects.equals(this.minuend, o.minuend) && Objects.equals(this.subtrahend, o.subtrahend); + } + + @Override + public String toString() { + return "NOT_CONTAINED_BY(" + minuend + "," + subtrahend + ")"; + } + + private static class NotContainedByIterator extends RelativeIterator { + + NotContainedByIterator(IntervalIterator a, IntervalIterator b) { + super(a, b); + } + + @Override + public int nextInterval() throws IOException { + if (bpos == false) + return a.nextInterval(); + while (a.nextInterval() != NO_MORE_INTERVALS) { + while (b.end() < a.end()) { + if (b.nextInterval() == NO_MORE_INTERVALS) + return a.start(); + } + if (a.start() < b.start()) + return a.start(); + } + return NO_MORE_INTERVALS; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/NotContainingIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/NotContainingIntervalsSource.java new file mode 100644 index 00000000000..769cfe7e4f3 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/NotContainingIntervalsSource.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.search.intervals; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.Objects; + +class NotContainingIntervalsSource extends DifferenceIntervalsSource { + + static IntervalsSource build(IntervalsSource minuend, IntervalsSource subtrahend) { + return Intervals.or(Disjunctions.pullUp(minuend, s -> new NotContainingIntervalsSource(s, subtrahend))); + } + + private NotContainingIntervalsSource(IntervalsSource minuend, IntervalsSource subtrahend) { + super(minuend, subtrahend); + } + + @Override + protected IntervalIterator combine(IntervalIterator minuend, IntervalIterator subtrahend) { + return new NotContainingIterator(minuend, subtrahend); + } + + @Override + public Collection pullUpDisjunctions() { + return Collections.singletonList(this); + } + + @Override + public int hashCode() { + return Objects.hash(minuend, subtrahend); + } + + @Override + public boolean equals(Object other) { + if (other instanceof NotContainingIntervalsSource == false) return false; + NotContainingIntervalsSource o = (NotContainingIntervalsSource) other; + return Objects.equals(this.minuend, o.minuend) && Objects.equals(this.subtrahend, o.subtrahend); + } + + @Override + public String toString() { + return "NOT_CONTAINING(" + minuend + "," + subtrahend + ")"; + } + + private static class NotContainingIterator extends RelativeIterator { + + private NotContainingIterator(IntervalIterator minuend, IntervalIterator subtrahend) { + super(minuend, subtrahend); + } + + @Override + public int nextInterval() throws IOException { + if (bpos == false) + return a.nextInterval(); + while (a.nextInterval() != NO_MORE_INTERVALS) { + while (b.start() < a.start() && b.end() < a.end()) { + if (b.nextInterval() == NO_MORE_INTERVALS) { + bpos = false; + return a.start(); + } + } + if (b.start() > a.end()) + return a.start(); + } + return NO_MORE_INTERVALS; + } + + } +} diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/OffsetIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/OffsetIntervalsSource.java index 078c64b0270..b10a71fafc4 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/OffsetIntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/OffsetIntervalsSource.java @@ -18,6 +18,8 @@ package org.apache.lucene.search.intervals; import java.io.IOException; +import java.util.Collection; +import java.util.Collections; import java.util.Objects; import org.apache.lucene.index.LeafReaderContext; @@ -152,6 +154,11 @@ class OffsetIntervalsSource extends IntervalsSource { return 1; } + @Override + public Collection pullUpDisjunctions() { + return Collections.singleton(this); + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/OrderedIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/OrderedIntervalsSource.java new file mode 100644 index 00000000000..5569983f164 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/OrderedIntervalsSource.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.intervals; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; + +class OrderedIntervalsSource extends ConjunctionIntervalsSource { + + static IntervalsSource build(List sources) { + if (sources.size() == 1) { + return sources.get(0); + } + return new OrderedIntervalsSource(flatten(sources)); + } + + private static List flatten(List sources) { + List flattened = new ArrayList<>(); + for (IntervalsSource s : sources) { + if (s instanceof OrderedIntervalsSource) { + flattened.addAll(((OrderedIntervalsSource)s).subSources); + } + else { + flattened.add(s); + } + } + return flattened; + } + + private OrderedIntervalsSource(List sources) { + super(sources, true); + } + + @Override + protected IntervalIterator combine(List iterators) { + return new OrderedIntervalIterator(iterators); + } + + @Override + public int minExtent() { + int minExtent = 0; + for (IntervalsSource subSource : subSources) { + minExtent += subSource.minExtent(); + } + return minExtent; + } + + @Override + public Collection pullUpDisjunctions() { + return Disjunctions.pullUp(subSources, OrderedIntervalsSource::new); + } + + @Override + public int hashCode() { + return Objects.hashCode(subSources); + } + + @Override + public boolean equals(Object other) { + if (other instanceof OrderedIntervalsSource == false) return false; + OrderedIntervalsSource s = (OrderedIntervalsSource) other; + return Objects.equals(subSources, s.subSources); + } + + @Override + public String toString() { + return "ORDERED(" + subSources.stream().map(IntervalsSource::toString).collect(Collectors.joining(",")) + ")"; + } + + private static class OrderedIntervalIterator extends ConjunctionIntervalIterator { + + int start = -1, end = -1, i; + int firstEnd; + + 
private OrderedIntervalIterator(List subIntervals) { + super(subIntervals); + } + + @Override + public int start() { + return start; + } + + @Override + public int end() { + return end; + } + + @Override + public int nextInterval() throws IOException { + start = end = IntervalIterator.NO_MORE_INTERVALS; + int b = Integer.MAX_VALUE; + i = 1; + while (true) { + while (true) { + if (subIterators.get(i - 1).end() >= b) + return start; + if (i == subIterators.size() || subIterators.get(i).start() > subIterators.get(i - 1).end()) + break; + do { + if (subIterators.get(i).end() >= b || subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) + return start; + } + while (subIterators.get(i).start() <= subIterators.get(i - 1).end()); + i++; + } + start = subIterators.get(0).start(); + if (start == NO_MORE_INTERVALS) { + return end = NO_MORE_INTERVALS; + } + firstEnd = subIterators.get(0).end(); + end = subIterators.get(subIterators.size() - 1).end(); + b = subIterators.get(subIterators.size() - 1).start(); + i = 1; + if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) + return start; + } + } + + @Override + public int gaps() { + int gaps = subIterators.get(1).start() - firstEnd - 1; + for (int i = 2; i < subIterators.size(); i++) { + gaps += (subIterators.get(i).start() - subIterators.get(i - 1).end() - 1); + } + return gaps; + } + + @Override + protected void reset() throws IOException { + subIterators.get(0).nextInterval(); + i = 1; + start = end = firstEnd = -1; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/OverlappingIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/OverlappingIntervalsSource.java new file mode 100644 index 00000000000..0ad74dc68a6 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/OverlappingIntervalsSource.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.intervals; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Objects; + +class OverlappingIntervalsSource extends ConjunctionIntervalsSource { + + private final IntervalsSource source; + private final IntervalsSource reference; + + OverlappingIntervalsSource(IntervalsSource source, IntervalsSource reference) { + super(Arrays.asList(source, reference), false); + this.source = source; + this.reference = reference; + } + + @Override + protected IntervalIterator combine(List iterators) { + assert iterators.size() == 2; + IntervalIterator a = iterators.get(0); + IntervalIterator b = iterators.get(1); + return new FilteringIntervalIterator(a, b) { + @Override + public int nextInterval() throws IOException { + if (bpos == false) + return IntervalIterator.NO_MORE_INTERVALS; + while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { + while (b.end() < a.start()) { + if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { + bpos = false; + return IntervalIterator.NO_MORE_INTERVALS; + } + } + if (b.start() <= a.end()) + return a.start(); + } + bpos = false; + return IntervalIterator.NO_MORE_INTERVALS; + } + }; + } + + @Override + 
public int minExtent() { + return source.minExtent(); + } + + @Override + public Collection pullUpDisjunctions() { + return Disjunctions.pullUp(Arrays.asList(source, reference), ss -> new OverlappingIntervalsSource(ss.get(0), ss.get(1))); + } + + @Override + public int hashCode() { + return Objects.hash(this.subSources); + } + + @Override + public boolean equals(Object other) { + if (other instanceof OverlappingIntervalsSource == false) return false; + OverlappingIntervalsSource o = (OverlappingIntervalsSource) other; + return Objects.equals(this.subSources, o.subSources); + } + + @Override + public String toString() { + return "OVERLAPPING(" + source + "," + reference + ")"; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/PayloadFilteredTermIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/PayloadFilteredTermIntervalsSource.java index 4b4b1230d1c..4e747ff96cd 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/PayloadFilteredTermIntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/PayloadFilteredTermIntervalsSource.java @@ -18,6 +18,8 @@ package org.apache.lucene.search.intervals; import java.io.IOException; +import java.util.Collection; +import java.util.Collections; import java.util.Objects; import java.util.function.Predicate; @@ -224,6 +226,11 @@ class PayloadFilteredTermIntervalsSource extends IntervalsSource { return 1; } + @Override + public Collection pullUpDisjunctions() { + return Collections.singleton(this); + } + @Override public int hashCode() { return Objects.hash(term); diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/RelativeIterator.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/RelativeIterator.java new file mode 100644 index 00000000000..ece14eb3c07 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/RelativeIterator.java @@ -0,0 +1,83 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.intervals; + +import java.io.IOException; + +abstract class RelativeIterator extends IntervalIterator { + + final IntervalIterator a; + final IntervalIterator b; + + boolean bpos; + + RelativeIterator(IntervalIterator a, IntervalIterator b) { + this.a = a; + this.b = b; + } + + @Override + public int docID() { + return a.docID(); + } + + @Override + public int nextDoc() throws IOException { + int doc = a.nextDoc(); + reset(); + return doc; + } + + @Override + public int advance(int target) throws IOException { + int doc = a.advance(target); + reset(); + return doc; + } + + @Override + public long cost() { + return a.cost(); + } + + protected void reset() throws IOException { + int doc = a.docID(); + bpos = b.docID() == doc || + (b.docID() < doc && b.advance(doc) == doc); + } + + @Override + public int start() { + return a.start(); + } + + @Override + public int end() { + return a.end(); + } + + @Override + public int gaps() { + return a.gaps(); + } + + @Override + public float matchCost() { + return a.matchCost() + b.matchCost(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/TermIntervalsSource.java 
b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/TermIntervalsSource.java index 3d6fe20b325..e85e0f7ba7d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/TermIntervalsSource.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/TermIntervalsSource.java @@ -18,6 +18,8 @@ package org.apache.lucene.search.intervals; import java.io.IOException; +import java.util.Collection; +import java.util.Collections; import java.util.Objects; import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat; @@ -208,6 +210,11 @@ class TermIntervalsSource extends IntervalsSource { return 1; } + @Override + public Collection pullUpDisjunctions() { + return Collections.singleton(this); + } + @Override public int hashCode() { return Objects.hash(term); diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/UnorderedIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/UnorderedIntervalsSource.java new file mode 100644 index 00000000000..adf9cc06f1d --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/UnorderedIntervalsSource.java @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.search.intervals; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; + +import org.apache.lucene.util.PriorityQueue; + +/** Conjunction over several interval sources whose iterator reports spans running from the smallest start to the largest end among the current intervals of all sub-iterators. When allowOverlaps is false, candidate intervals that overlap an already-queued one are skipped. NOTE(review): the raw List/Collection/PriorityQueue types in this class look like generic type arguments that were lost when this patch text was extracted; confirm against the upstream file. */ class UnorderedIntervalsSource extends ConjunctionIntervalsSource { + + /** Returns the only source unchanged when the list has exactly one element; otherwise wraps the flattened list in a new UnorderedIntervalsSource. */ static IntervalsSource build(List sources, boolean allowOverlaps) { + if (sources.size() == 1) { + return sources.get(0); + } + return new UnorderedIntervalsSource(flatten(sources, allowOverlaps), allowOverlaps); + } + + /** Inlines the sub-sources of every nested UnorderedIntervalsSource that has the same allowOverlaps setting; all other sources are copied as they are. Only one level is inlined here. */ private static List flatten(List sources, boolean allowOverlaps) { + List flattened = new ArrayList<>(); + for (IntervalsSource s : sources) { + if (s instanceof UnorderedIntervalsSource && ((UnorderedIntervalsSource)s).allowOverlaps == allowOverlaps) { + flattened.addAll(((UnorderedIntervalsSource)s).subSources); + } + else { + flattened.add(s); + } + } + return flattened; + } + + /* false makes the iterator skip candidate intervals that overlap one already in its queue */ private final boolean allowOverlaps; + + private UnorderedIntervalsSource(List sources, boolean allowOverlaps) { + super(sources, true); + this.allowOverlaps = allowOverlaps; + } + + /** Wraps the given sub-iterators in an UnorderedIntervalIterator that carries this source's allowOverlaps flag. */ @Override + protected IntervalIterator combine(List iterators) { + return new UnorderedIntervalIterator(iterators, allowOverlaps); + } + + /** Returns the sum of the minExtent() values of all sub-sources. */ @Override + public int minExtent() { + int minExtent = 0; + for (IntervalsSource subSource : subSources) { + minExtent += subSource.minExtent(); + } + return minExtent; + } + + /** Delegates to Disjunctions.pullUp, handing it a factory that builds an UnorderedIntervalsSource with the same allowOverlaps flag. */ @Override + public Collection pullUpDisjunctions() { + return Disjunctions.pullUp(subSources, ss -> new UnorderedIntervalsSource(ss, allowOverlaps)); + } + + @Override + public int hashCode() { + return Objects.hash(this.subSources, this.allowOverlaps); + } + + /** Two instances are equal when both their sub-source lists and their allowOverlaps flags are equal. */ @Override + public boolean equals(Object other) { + if (other instanceof UnorderedIntervalsSource == false) return false; + UnorderedIntervalsSource o = (UnorderedIntervalsSource) other; + return Objects.equals(this.subSources, o.subSources) && +
Objects.equals(this.allowOverlaps, o.allowOverlaps); + } + + /** Renders as UNORDERED(...) or, when allowOverlaps is false, UNORDERED_NO_OVERLAPS(...), with the sub-sources' strings joined by commas. */ @Override + public String toString() { + return (allowOverlaps ? "UNORDERED(" : "UNORDERED_NO_OVERLAPS(") + + subSources.stream().map(IntervalsSource::toString).collect(Collectors.joining(",")) + ")"; + } + + /** Iterator that holds the sub-iterators in a priority queue and reports spans running from the queue head's start to the largest end recorded in queueEnd. */ private static class UnorderedIntervalIterator extends ConjunctionIntervalIterator { + + private final PriorityQueue queue; + private final IntervalIterator[] subIterators; + /* scratch array for gaps(): one start/end pair per sub-iterator */ private final int[] innerPositions; + private final boolean allowOverlaps; + + /* start/end: bounds of the interval last reported (-1 after reset, NO_MORE_INTERVALS once exhausted); firstEnd: end of the queue head at the moment that interval was captured; queueEnd: largest end seen by updateRightExtreme since the last reset. */ int start = -1, end = -1, firstEnd, queueEnd; + + UnorderedIntervalIterator(List subIterators, boolean allowOverlaps) { + super(subIterators); + this.queue = new PriorityQueue(subIterators.size()) { + /* Smaller start sorts first; on equal starts the interval whose end is larger or equal sorts first. */ @Override + protected boolean lessThan(IntervalIterator a, IntervalIterator b) { + return a.start() < b.start() || (a.start() == b.start() && a.end() >= b.end()); + } + }; + this.subIterators = new IntervalIterator[subIterators.size()]; + this.innerPositions = new int[subIterators.size() * 2]; + this.allowOverlaps = allowOverlaps; + + for (int i = 0; i < subIterators.size(); i++) { + this.subIterators[i] = subIterators.get(i); + } + } + + @Override + public int start() { + return start; + } + + @Override + public int end() { + return end; + } + + /** Raises queueEnd to the end of the given iterator's current interval when that end is larger. */ void updateRightExtreme(IntervalIterator it) { + int itEnd = it.end(); + if (itEnd > queueEnd) { + queueEnd = itEnd; + } + } + + /** Advances to the next interval and returns its start. Returns NO_MORE_INTERVALS when the queue no longer holds every sub-iterator, i.e. after one of them has run out. */ @Override + public int nextInterval() throws IOException { + /* Phase 1: pop and advance each queue head whose interval starts at the previously reported start; an iterator that runs out is not put back, which leaves the queue short. */ // first, find a matching interval + while (this.queue.size() == subIterators.length && queue.top().start() == start) { + IntervalIterator it = queue.pop(); + if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { + if (allowOverlaps == false) { + while (hasOverlaps(it)) { + if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) + return start = end = IntervalIterator.NO_MORE_INTERVALS; + } + } + queue.add(it); + updateRightExtreme(it); + } + } + if (this.queue.size() < subIterators.length) +
return start = end = IntervalIterator.NO_MORE_INTERVALS; + // then, minimize it + /* Phase 2: record the span from the head's start to queueEnd. If the head itself ends at queueEnd the span is returned as is; otherwise the head is advanced and the loop repeats for as long as that leaves queueEnd unchanged and the queue full. If the head runs out, the span already recorded is returned and the next call reports NO_MORE_INTERVALS. */ do { + start = queue.top().start(); + firstEnd = queue.top().end(); + end = queueEnd; + if (queue.top().end() == end) + return start; + IntervalIterator it = queue.pop(); + if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) { + if (allowOverlaps == false) { + while (hasOverlaps(it)) { + if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { + return start; + } + } + } + queue.add(it); + updateRightExtreme(it); + } + } while (this.queue.size() == subIterators.length && end == queueEnd); + return start; + } + + /** Sums the gaps inside the current span. A sub-iterator whose end already lies beyond the reported end is represented by (start, firstEnd) instead of its own current position. The collected positions are sorted, the sorted array is read as consecutive pairs, and the distance from each pair's end to the next pair's start, minus one, is added up. */ @Override + public int gaps() { + for (int i = 0; i < subIterators.length; i++) { + if (subIterators[i].end() > end) { + innerPositions[i * 2] = start; + innerPositions[i * 2 + 1] = firstEnd; + } + else { + innerPositions[i * 2] = subIterators[i].start(); + innerPositions[i * 2 + 1] = subIterators[i].end(); + } + } + Arrays.sort(innerPositions); + int gaps = 0; + for (int i = 1; i < subIterators.length; i++) { + gaps += (innerPositions[i * 2] - innerPositions[i * 2 - 1] - 1); + } + return gaps; + } + + /** Clears the queue and the start/end/queueEnd state, then loads the first interval of each sub-iterator in order (skipping overlapping ones when overlaps are disallowed). Loading stops at the first sub-iterator that runs out, which leaves the queue short so that nextInterval() reports NO_MORE_INTERVALS. */ @Override + protected void reset() throws IOException { + queueEnd = start = end = -1; + this.queue.clear(); + loop: for (IntervalIterator it : subIterators) { + if (it.nextInterval() == NO_MORE_INTERVALS) { + break; + } + if (allowOverlaps == false) { + while (hasOverlaps(it)) { + if (it.nextInterval() == NO_MORE_INTERVALS) { + break loop; + } + } + } + queue.add(it); + updateRightExtreme(it); + } + } + + /** Returns true when the candidate's current interval overlaps the current interval of any queued iterator. A queued interval that starts earlier overlaps if it ends at or after the candidate's start; one that starts at the same position always counts as overlapping; one that starts later overlaps if it starts no later than the candidate's end. */ private boolean hasOverlaps(IntervalIterator candidate) { + for (IntervalIterator it : queue) { + if (it.start() < candidate.start()) { + if (it.end() >= candidate.start()) { + return true; + } + continue; + } + if (it.start() == candidate.start()) { + return true; + } + if (it.start() <= candidate.end()) { + return true; + } + } + return false; + } + + } +} diff --git
a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestDisjunctionRewrites.java b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestDisjunctionRewrites.java new file mode 100644 index 00000000000..3301a45040a --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestDisjunctionRewrites.java @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.search.intervals; + +import org.apache.lucene.util.LuceneTestCase; + +import static org.apache.lucene.search.intervals.Intervals.*; + +/** Verifies through equals() that sources built with the Intervals factory methods come out in the rewritten form given in each test's comment, with nested disjunctions moved to the top level. */ public class TestDisjunctionRewrites extends LuceneTestCase { + + public void testDisjunctionSuffix() { + + // BLOCK(a,or(b, BLOCK(b, c))) => or(BLOCK(a, b), BLOCK(a, b, c)) + IntervalsSource actual = Intervals.phrase( + Intervals.term("a"), + Intervals.or(Intervals.term("b"), Intervals.phrase("b", "c"))); + IntervalsSource expected = Intervals.or( + Intervals.phrase("a", "b"), + Intervals.phrase("a", "b", "c")); + assertEquals(expected, actual); + + } + + public void testPhraseDisjunctionWithDifferentLengthClauses() { + + // BLOCK(a, or(b, BLOCK(b, c)), d) => or(BLOCK(a, b, d), BLOCK(a, b, c, d)) + + IntervalsSource actual = Intervals.phrase( + Intervals.term("a"), + Intervals.or(Intervals.term("b"), Intervals.phrase(Intervals.term("b"), Intervals.term("c"))), + Intervals.term("d")); + + IntervalsSource expected = Intervals.or( + Intervals.phrase("a", "b", "d"), + Intervals.phrase("a", "b", "c", "d") + ); + + assertEquals(expected, actual); + + } + + public void testPhraseDisjunctionWithNestedDifferentLengthClauses() { + + // BLOCK(a, or(ORDERED(or(b, c), d), b, p, q), f, g) + // => or(BLOCK(a, or(b, p, q), f, g), BLOCK(a, ORDERED(or(b, c), d), f, g)) + + IntervalsSource expected = Intervals.or( + Intervals.phrase( + Intervals.term("a"), + Intervals.or(Intervals.term("b"), Intervals.term("p"), Intervals.term("q")), + Intervals.term("f"), + Intervals.term("g")), + Intervals.phrase( + Intervals.term("a"), + Intervals.ordered(Intervals.or(Intervals.term("b"), Intervals.term("c")), Intervals.term("d")), + Intervals.term("f"), + Intervals.term("g") + ) + ); + + IntervalsSource actual = Intervals.phrase( + Intervals.term("a"), + Intervals.or( + Intervals.ordered(Intervals.or(Intervals.term("b"), Intervals.term("c")), Intervals.term("d")), + Intervals.term("b"), + Intervals.term("p"), +
Intervals.term("q") + ), + Intervals.term("f"), + Intervals.term("g") + ); + + assertEquals(expected, actual); + } + + public void testDisjunctionRewritePreservesFilters() { + + // BLOCK(a, MAXGAPS/3(OR(BLOCK(a, b), BLOCK(c, d))), c) + // => or(BLOCK(a, MAXGAPS/3(BLOCK(a, b)), c), BLOCK(a, MAXGAPS/3(BLOCK(c, d)), c)) + + IntervalsSource actual = Intervals.phrase( + Intervals.term("a"), + Intervals.maxgaps(3, Intervals.or( + Intervals.phrase("a", "b"), + Intervals.phrase("c", "d") + )), + Intervals.term("c") + ); + + IntervalsSource expected = Intervals.or( + Intervals.phrase( + Intervals.term("a"), + Intervals.maxgaps(3, Intervals.phrase("a", "b")), + Intervals.term("c") + ), + Intervals.phrase( + Intervals.term("a"), + Intervals.maxgaps(3, Intervals.phrase("c", "d")), + Intervals.term("c") + )); + + assertEquals(expected, actual); + + } + + public void testNestedMaxGaps() { + // MAXGAPS/3(ORDERED(MAXGAPS/4(ORDERED(a, or(b, BLOCK(c, d)))), e)) + // => or(MAXGAPS/3(ORDERED(MAXGAPS/4(ORDERED(a, b)), e)), MAXGAPS/3(ORDERED(MAXGAPS/4(ORDERED(a, BLOCK(c, d))), e))) + IntervalsSource actual = maxgaps(3, ordered( + maxgaps(4, ordered(term("a"), or(term("b"), phrase("c", "d")))), term("e"))); + IntervalsSource expected = or( + maxgaps(3, ordered(maxgaps(4, ordered(term("a"), term("b"))), term("e"))), + maxgaps(3, ordered(maxgaps(4, ordered(term("a"), phrase("c", "d"))), term("e"))) + ); + assertEquals(expected, actual); + } + + public void testNestedMaxWidth() { + // maxwidth does not automatically pull up disjunctions at construction time, so we need to check + // that it does the right thing if wrapped by something that does need exact internal accounting + // PHRASE(a, MAXWIDTH(4, OR(ORDERED(b, c), ORDERED(d, e))), f) + // => or(PHRASE(a, MAXWIDTH(4, ORDERED(b, c)), f), PHRASE(a, MAXWIDTH(4, ORDERED(d, e)), f)) + IntervalsSource actual = phrase(term("a"), maxwidth(4, or(ordered(term("b"), term("c")), ordered(term("d"), term("e")))), term("f")); + IntervalsSource expected
= or( + phrase(term("a"), maxwidth(4, ordered(term("b"), term("c"))), term("f")), + phrase(term("a"), maxwidth(4, ordered(term("d"), term("e"))), term("f")) + ); + assertEquals(expected, actual); + } + + public void testNestedFixField() { + // PHRASE(a, FIXFIELD(field, or(PHRASE(a, b), b)), c) + // => or(PHRASE(a, FIXFIELD(field, PHRASE(a, b)), c), PHRASE(a, FIXFIELD(field, b), c)) + IntervalsSource actual = phrase(term("a"), fixField("field", or(phrase("a", "b"), term("b"))), term("c")); + IntervalsSource expected = or( + phrase(term("a"), fixField("field", phrase("a", "b")), term("c")), + phrase(term("a"), fixField("field", term("b")), term("c")) + ); + assertEquals(expected, actual); + } + + public void testContainedBy() { + // the 'big' interval should not be minimized, the 'small' one should be + // CONTAINED_BY(or("s", BLOCK("s", "t")), MAXGAPS/4(or(ORDERED("a", "b"), ORDERED("c", "d")))) + // => or(CONTAINED_BY(or("s", BLOCK("s", "t")), MAXGAPS/4(ORDERED("a", "b"))), + // CONTAINED_BY(or("s", BLOCK("s", "t")), MAXGAPS/4(ORDERED("c", "d")))) + IntervalsSource actual = containedBy(or(term("s"), phrase("s", "t")), + maxgaps(4, or(ordered(term("a"), term("b")), ordered(term("c"), term("d"))))); + IntervalsSource expected = or( + containedBy(or(term("s"), phrase("s", "t")), maxgaps(4, ordered(term("a"), term("b")))), + containedBy(or(term("s"), phrase("s", "t")), maxgaps(4, ordered(term("c"), term("d")))) + ); + assertEquals(expected, actual); + } + + public void testContaining() { + // the 'big' interval should not be minimized, the 'small' one should be + // CONTAINING(MAXGAPS/4(or(ORDERED("a", "b"), ORDERED("c", "d"))), or("s", BLOCK("s", "t"))) + // => or(CONTAINING(MAXGAPS/4(ORDERED("a", "b")), or("s", BLOCK("s", "t"))), + // CONTAINING(MAXGAPS/4(ORDERED("c", "d")), or("s", BLOCK("s", "t")))) + IntervalsSource actual = containing(maxgaps(4, or(ordered(term("a"), term("b")), ordered(term("c"), term("d")))), + or(term("s"), phrase("s", "t"))); + IntervalsSource expected =
or( + containing(maxgaps(4, ordered(term("a"), term("b"))), or(term("s"), phrase("s", "t"))), + containing(maxgaps(4, ordered(term("c"), term("d"))), or(term("s"), phrase("s", "t"))) + ); + assertEquals(expected, actual); + } + + public void testNotContainedBy() { + // the 'big' interval should not be minimized, the 'small' one should be + // NOT_CONTAINED_BY(or(BLOCK("a", "b"), "a"), or(BLOCK("c", "d"), "d")) + // => or(NOT_CONTAINED_BY(or(BLOCK("a", "b"), "a"), BLOCK("c", "d")), NOT_CONTAINED_BY(or(BLOCK("a", "b"), "a"), "d")) + IntervalsSource actual = notContainedBy(or(phrase("a", "b"), term("a")), or(phrase("c", "d"), term("d"))); + IntervalsSource expected = or( + notContainedBy(or(phrase("a", "b"), term("a")), phrase("c", "d")), + notContainedBy(or(phrase("a", "b"), term("a")), term("d")) + ); + assertEquals(expected, actual); + } + + public void testNotContaining() { + // the 'big' interval should not be minimized, the 'small' one should be + // NOT_CONTAINING(or(BLOCK("a", "b"), "a"), or(BLOCK("c", "d"), "d")) + // => or(NOT_CONTAINING(BLOCK("a", "b"), or(BLOCK("c", "d"), "d")), NOT_CONTAINING("a", or(BLOCK("c", "d"), "d"))) + IntervalsSource actual = notContaining(or(phrase("a", "b"), term("a")), or(phrase("c", "d"), term("d"))); + IntervalsSource expected = or( + notContaining(phrase("a", "b"), or(phrase("c", "d"), term("d"))), + notContaining(term("a"), or(phrase("c", "d"), term("d"))) + ); + assertEquals(expected, actual); + } + + public void testBlockedRewrites() { + /* The or(false, ...) form must NOT be merged into the enclosing phrase; the boolean presumably switches disjunction rewriting off for that clause - confirm against Intervals.or. */ IntervalsSource actual = phrase(term("a"), or(false, phrase("b", "c"), term("c"))); + IntervalsSource ifRewritten = or(phrase("a", "b", "c"), phrase("a", "c")); + assertNotEquals(ifRewritten, actual); + } + +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervalQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervalQuery.java index 9cea616b3a7..a8998ef4648 100644 ---
a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervalQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervalQuery.java @@ -31,7 +31,6 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; -import org.junit.Ignore; public class TestIntervalQuery extends LuceneTestCase { @@ -71,7 +70,8 @@ public class TestIntervalQuery extends LuceneTestCase { "w2 w1", "w2 w1 w3 w2 w4", "coordinate genome mapping research", - "coordinate genome research" + "coordinate genome research", + "greater new york" }; private void checkHits(Query query, int[] results) throws IOException { @@ -166,10 +166,6 @@ public class TestIntervalQuery extends LuceneTestCase { checkHits(q, new int[]{}); } - // The Vigna paper doesn't deal with prefix disjunctions. For now, we keep the same - // logic as detailed in the paper, but we may want to address it in future so that tests - // like the one below will pass - @Ignore public void testNestedOr() throws IOException { Query q = new IntervalQuery(field, Intervals.phrase( Intervals.term("coordinate"), @@ -178,6 +174,39 @@ public class TestIntervalQuery extends LuceneTestCase { checkHits(q, new int[]{ 6, 7 }); } + /* Phrase whose middle clause is a disjunction of a bare term and an extended term; expects the same hits as testNestedOr. Intervals.extend(..., 1, 0) presumably widens the interval by one position in front - confirm. */ public void testNestedOrWithGaps() throws IOException { + Query q = new IntervalQuery(field, Intervals.phrase( + Intervals.term("coordinate"), + Intervals.or(Intervals.term("genome"), Intervals.extend(Intervals.term("mapping"), 1, 0)), + Intervals.term("research"))); + checkHits(q, new int[]{ 6, 7 }); + } + + /* Same phrase shape, with the disjunction wrapped in a notContaining filter. */ public void testNestedOrWithinDifference() throws IOException { + Query q = new IntervalQuery(field, Intervals.phrase( + Intervals.term("coordinate"), + Intervals.notContaining( + Intervals.or(Intervals.phrase("genome", "mapping"), Intervals.term("genome")), + Intervals.term("wibble")), + Intervals.term("research"))); + checkHits(q, new int[]{ 6, 7 }); + } + + public void
testNestedOrWithinConjunctionFilter() throws IOException { + Query q = new IntervalQuery(field, Intervals.phrase( + Intervals.term("coordinate"), + Intervals.containing( + Intervals.or(Intervals.phrase("genome", "mapping"), Intervals.term("genome")), + Intervals.term("genome")), + Intervals.term("research"))); + checkHits(q, new int[]{ 6, 7 }); + + /* Second case: the expected hit is presumably the "greater new york" document added to the test data by this patch - confirm the index. */ q = new IntervalQuery(field, Intervals.phrase( + Intervals.term("greater"), + Intervals.or(Intervals.phrase("new", "york"), Intervals.term("york")))); + checkHits(q, new int[]{ 8 }); + } + public void testUnordered() throws IOException { Query q = new IntervalQuery(field, Intervals.unordered( @@ -191,6 +220,21 @@ public class TestIntervalQuery extends LuceneTestCase { checkHits(q, new int[]{3}); } + /* A disjunction nested inside an unordered source that is itself wrapped in a maxgaps(1, ...) filter. */ public void testNestedOrInUnorderedMaxGaps() throws IOException { + Query q = new IntervalQuery(field, Intervals.maxgaps(1, Intervals.unordered( + Intervals.or(Intervals.term("coordinate"), Intervals.phrase("coordinate", "genome")), + Intervals.term("research")) + )); + checkHits(q, new int[]{ 6, 7 }); + } + + /* A disjunction used as the second argument of containedBy. */ public void testNestedOrInContainedBy() throws IOException { + Query q = new IntervalQuery(field, Intervals.containedBy( + Intervals.term("genome"), + Intervals.or(Intervals.term("coordinate"), Intervals.ordered(Intervals.term("coordinate"), Intervals.term("research"))))); + checkHits(q, new int[]{ 6, 7 }); + } + public void testDefinedGaps() throws IOException { Query q = new IntervalQuery(field, Intervals.phrase(Intervals.term("w1"), Intervals.extend(Intervals.term("w2"), 1, 0))); diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java index c8452a8b67f..ea80d5ffbbf 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java +++ b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java @@ -118,12 +118,12 @@ public class TestIntervals extends
LuceneTestCase { if (i >= expected[id].length) { fail("Unexpected match in doc " + id + ": " + intervals); } - assertEquals("Wrong start value in doc " + id, expected[id][i], pos); + assertEquals(source + ": wrong start value in doc " + id, expected[id][i], pos); assertEquals("start() != pos returned from nextInterval()", expected[id][i], intervals.start()); assertEquals("Wrong end value in doc " + id, expected[id][i + 1], intervals.end()); i += 2; } - assertEquals("Wrong number of endpoints in doc " + id, expected[id].length, i); + assertEquals(source + ": wrong number of endpoints in doc " + id, expected[id].length, i); assertEquals(IntervalIterator.NO_MORE_INTERVALS, intervals.start()); assertEquals(IntervalIterator.NO_MORE_INTERVALS, intervals.end()); if (i > 0) @@ -762,4 +762,22 @@ public class TestIntervals extends LuceneTestCase { assertMatch(mi, 17, 17, 97, 100); } + public void testWrappedFilters() throws IOException { + /* A maxgaps-filtered disjunction nested inside an outer disjunction. Each row of the expected array holds start/end pairs for one document: the checking loop compares expected[id][i] with the start and expected[id][i + 1] with the end, stepping i by two. */ + IntervalsSource source = Intervals.or( + Intervals.term("nine"), + Intervals.maxgaps(1, Intervals.or( + Intervals.ordered(Intervals.term("pease"), Intervals.term("hot")), + Intervals.ordered(Intervals.term("pease"), Intervals.term("cold"))))); + checkIntervals(source, "field1", 3, new int[][]{ + {}, + { 0, 2, 3, 5, 11, 11, 28, 28 }, + { 0, 2, 3, 5 }, + {}, + { 0, 2, 3, 5, 11, 11 }, + {} + }); + + } + } diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestSimplifications.java b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestSimplifications.java index 3c7293b5c21..7f64cd4f832 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestSimplifications.java +++ b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestSimplifications.java @@ -22,6 +22,7 @@ import org.apache.lucene.util.LuceneTestCase; public class TestSimplifications extends LuceneTestCase { public void testStringPhrases() { + // BLOCK(term) => term IntervalsSource actual = Intervals.phrase("term");
assertEquals(Intervals.term("term"), actual); } @@ -32,18 +33,45 @@ public class TestSimplifications extends LuceneTestCase { } public void testOrdered() { + // ORDERED(term) => term IntervalsSource actual = Intervals.ordered(Intervals.term("term")); assertEquals(Intervals.term("term"), actual); } public void testUnordered() { + // UNORDERED(term) => term IntervalsSource actual = Intervals.unordered(Intervals.term("term")); assertEquals(Intervals.term("term"), actual); } public void testUnorderedOverlaps() { - IntervalsSource actual = Intervals.unordered(true, Intervals.term("term")); + // UNORDERED_NO_OVERLAPS(term) => term (the boolean is presumably allowOverlaps, so false selects the no-overlaps variant; confirm against Intervals.unordered) + IntervalsSource actual = Intervals.unordered(false, Intervals.term("term")); assertEquals(Intervals.term("term"), actual); } + public void testDisjunctionRemovesDuplicates() { + // or(a, b, a) => or(a, b) + IntervalsSource actual = Intervals.or(Intervals.term("a"), Intervals.term("b"), Intervals.term("a")); + assertEquals(Intervals.or(Intervals.term("a"), Intervals.term("b")), actual); + } + + public void testPhraseSimplification() { + // BLOCK(BLOCK(a, b), c) => BLOCK(a, b, c) + IntervalsSource actual = Intervals.phrase(Intervals.phrase(Intervals.term("a"), Intervals.term("b")), Intervals.term("c")); + assertEquals(Intervals.phrase(Intervals.term("a"), Intervals.term("b"), Intervals.term("c")), actual); + + // BLOCK(a, BLOCK(b, BLOCK(c, d))) => BLOCK(a, b, c, d) + actual = Intervals.phrase(Intervals.term("a"), Intervals.phrase(Intervals.term("b"), + Intervals.phrase(Intervals.term("c"), Intervals.term("d")))); + assertEquals(Intervals.phrase(Intervals.term("a"), Intervals.term("b"), Intervals.term("c"), Intervals.term("d")), actual); + } + + public void testDisjunctionSimplification() { + // or(a, or(b, or(c, d))) => or(a, b, c, d) + IntervalsSource actual = Intervals.or(Intervals.term("a"), Intervals.or(Intervals.term("b"), + Intervals.or(Intervals.term("c"), Intervals.term("d")))); + assertEquals(Intervals.or(Intervals.term("a"),
Intervals.term("b"), Intervals.term("c"), Intervals.term("d")), actual); + } + }