LUCENE-8477: Automatically rewrite disjunctions when internal gaps matter (#620)

We have a number of IntervalsSource implementations where automatic minimization of
disjunctions can lead to surprising results:

* PHRASE queries can miss matches because a longer matching sub-source is minimized
  away, leaving a gap
* MAXGAPS queries can miss matches for the same reason
* CONTAINING, NOT_CONTAINING, CONTAINED_BY and NOT_CONTAINED_BY queries
  can miss matches if the 'big' interval gets minimized

The proper way to deal with this is to rewrite the queries by pulling disjunctions to the top
of the query tree, so that PHRASE("a", OR(PHRASE("b", "c"), "c")) is rewritten to
OR(PHRASE("a", "b", "c"), PHRASE("a", "c")). To be able to do this generally, we need to
add a new pullUpDisjunctions() method to IntervalsSource that performs this rewriting
for each source that it would apply to.

Because these rewritten queries will in general be less efficient due to the duplication of
effort (e.g. the rewritten PHRASE query above pulls 5 term iterators rather than 4 in the
original), we also add an option to Intervals.or() that prevents this rewriting, so that
consumers can choose speed over accuracy if it suits their use case.
This commit is contained in:
Alan Woodward 2019-03-27 11:23:43 +00:00 committed by GitHub
parent e7939d5907
commit f1782d0dd1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
35 changed files with 1965 additions and 1013 deletions

View File

@ -0,0 +1,142 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
/**
 * A conjunction whose sub-sources must match as one contiguous block: each
 * sub-interval has to start exactly one position after the previous one ends.
 */
class BlockIntervalsSource extends ConjunctionIntervalsSource {

  /**
   * Builds a block source over the given sub-sources.
   *
   * A single sub-source is returned unchanged. Otherwise any disjunctions inside
   * the sub-sources are pulled up and the resulting blocks are combined with
   * {@code Intervals.or}.
   */
  static IntervalsSource build(List<IntervalsSource> subSources) {
    if (subSources.size() != 1) {
      return Intervals.or(Disjunctions.pullUp(subSources, BlockIntervalsSource::new));
    }
    return subSources.get(0);
  }

  /** Inlines the sub-sources of nested blocks so that BLOCK(a, BLOCK(b, c)) becomes BLOCK(a, b, c). */
  private static List<IntervalsSource> flatten(List<IntervalsSource> sources) {
    List<IntervalsSource> result = new ArrayList<>();
    for (IntervalsSource source : sources) {
      if (!(source instanceof BlockIntervalsSource)) {
        result.add(source);
        continue;
      }
      result.addAll(((BlockIntervalsSource) source).subSources);
    }
    return result;
  }

  private BlockIntervalsSource(List<IntervalsSource> sources) {
    // 'true' is the isMinimizing flag of ConjunctionIntervalsSource
    super(flatten(sources), true);
  }

  @Override
  protected IntervalIterator combine(List<IntervalIterator> iterators) {
    return new BlockIntervalIterator(iterators);
  }

  /** The block is at least as wide as the sum of its parts. */
  @Override
  public int minExtent() {
    return subSources.stream().mapToInt(IntervalsSource::minExtent).sum();
  }

  @Override
  public Collection<IntervalsSource> pullUpDisjunctions() {
    // Disjunctions already pulled up in build()
    return Collections.singletonList(this);
  }

  @Override
  public int hashCode() {
    return Objects.hash(subSources);
  }

  @Override
  public boolean equals(Object other) {
    if (!(other instanceof BlockIntervalsSource)) {
      return false;
    }
    return Objects.equals(this.subSources, ((BlockIntervalsSource) other).subSources);
  }

  @Override
  public String toString() {
    return subSources.stream()
        .map(IntervalsSource::toString)
        .collect(Collectors.joining(",", "BLOCK(", ")"));
  }

  /** Iterates over runs of directly adjacent intervals, one taken from each sub-iterator in order. */
  private static class BlockIntervalIterator extends ConjunctionIntervalIterator {

    int start = -1, end = -1;

    BlockIntervalIterator(List<IntervalIterator> subIterators) {
      super(subIterators);
    }

    @Override
    public int start() {
      return start;
    }

    @Override
    public int end() {
      return end;
    }

    @Override
    public int gaps() {
      // adjacent intervals leave no gaps by construction
      return 0;
    }

    @Override
    public int nextInterval() throws IOException {
      final IntervalIterator head = subIterators.get(0);
      if (head.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
        return exhausted();
      }
      final int count = subIterators.size();
      int idx = 1;
      while (idx < count) {
        final IntervalIterator previous = subIterators.get(idx - 1);
        final IntervalIterator current = subIterators.get(idx);
        // skip intervals that start at or before the end of the preceding one
        while (current.start() <= previous.end()) {
          if (current.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
            return exhausted();
          }
        }
        if (current.start() == previous.end() + 1) {
          // directly adjacent: move on to the next sub-iterator
          idx++;
          continue;
        }
        // there is a hole in the block: restart from the next head interval
        if (head.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
          return exhausted();
        }
        idx = 1;
      }
      start = head.start();
      end = subIterators.get(count - 1).end();
      return start;
    }

    /** Marks both bounds as exhausted and returns the sentinel. */
    private int exhausted() {
      return start = end = IntervalIterator.NO_MORE_INTERVALS;
    }

    @Override
    protected void reset() {
      start = end = -1;
    }
  }
}

View File

@ -20,7 +20,6 @@ package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.lucene.index.LeafReaderContext;
@ -31,28 +30,15 @@ import org.apache.lucene.search.MatchesUtils;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
class ConjunctionIntervalsSource extends IntervalsSource {
abstract class ConjunctionIntervalsSource extends IntervalsSource {
protected final List<IntervalsSource> subSources;
protected final IntervalFunction function;
protected final boolean isMinimizing;
ConjunctionIntervalsSource(List<IntervalsSource> subSources, IntervalFunction function) {
protected ConjunctionIntervalsSource(List<IntervalsSource> subSources, boolean isMinimizing) {
assert subSources.size() > 1;
this.subSources = subSources;
this.function = function;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
ConjunctionIntervalsSource that = (ConjunctionIntervalsSource) o;
return Objects.equals(subSources, that.subSources) &&
Objects.equals(function, that.function);
}
@Override
public String toString() {
return function + subSources.stream().map(Object::toString).collect(Collectors.joining(",", "(", ")"));
this.isMinimizing = isMinimizing;
}
@Override
@ -65,16 +51,7 @@ class ConjunctionIntervalsSource extends IntervalsSource {
}
@Override
public int minExtent() {
int minExtent = 0;
for (IntervalsSource source : subSources) {
minExtent += source.minExtent();
}
return minExtent;
}
@Override
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
public final IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
List<IntervalIterator> subIntervals = new ArrayList<>();
for (IntervalsSource source : subSources) {
IntervalIterator it = source.intervals(field, ctx);
@ -82,32 +59,32 @@ class ConjunctionIntervalsSource extends IntervalsSource {
return null;
subIntervals.add(it);
}
return function.apply(subIntervals);
return combine(subIntervals);
}
protected abstract IntervalIterator combine(List<IntervalIterator> iterators);
@Override
public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
public final MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
List<MatchesIterator> subs = new ArrayList<>();
for (IntervalsSource source : subSources) {
MatchesIterator mi = source.matches(field, ctx, doc);
if (mi == null) {
return null;
}
if (isMinimizing) {
mi = new CachingMatchesIterator(mi);
}
subs.add(mi);
}
IntervalIterator it = function.apply(subs.stream().map(m -> IntervalMatches.wrapMatches(m, doc)).collect(Collectors.toList()));
IntervalIterator it = combine(subs.stream().map(m -> IntervalMatches.wrapMatches(m, doc)).collect(Collectors.toList()));
if (it.advance(doc) != doc) {
return null;
}
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
return null;
}
return new ConjunctionMatchesIterator(it, subs);
}
@Override
public int hashCode() {
return Objects.hash(subSources, function);
return isMinimizing ? new MinimizingConjunctionMatchesIterator(it, subs) : new ConjunctionMatchesIterator(it, subs);
}
private static class ConjunctionMatchesIterator implements MatchesIterator {

View File

@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
/**
 * Yields intervals from a 'small' source that lie inside an interval from a
 * 'big' source.
 */
class ContainedByIntervalsSource extends ConjunctionIntervalsSource {

  private final IntervalsSource small;
  private final IntervalsSource big;

  /**
   * Builds the source, pulling up any disjunctions found in {@code big} so that
   * each alternative is checked on its own.
   */
  static IntervalsSource build(IntervalsSource small, IntervalsSource big) {
    return Intervals.or(Disjunctions.pullUp(big, s -> new ContainedByIntervalsSource(small, s)));
  }

  private ContainedByIntervalsSource(IntervalsSource small, IntervalsSource big) {
    // 'false' is the isMinimizing flag of ConjunctionIntervalsSource
    super(Arrays.asList(small, big), false);
    this.small = small;
    this.big = big;
  }

  @Override
  protected IntervalIterator combine(List<IntervalIterator> iterators) {
    assert iterators.size() == 2;
    final IntervalIterator inner = iterators.get(0);
    final IntervalIterator outer = iterators.get(1);
    return new FilteringIntervalIterator(inner, outer) {
      @Override
      public int nextInterval() throws IOException {
        if (!bpos) {
          return IntervalIterator.NO_MORE_INTERVALS;
        }
        while (inner.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
          // move the outer iterator forward until it ends at or after the inner interval
          while (outer.end() < inner.end()) {
            if (outer.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
              bpos = false;
              return IntervalIterator.NO_MORE_INTERVALS;
            }
          }
          if (outer.start() <= inner.start()) {
            return inner.start();
          }
        }
        bpos = false;
        return IntervalIterator.NO_MORE_INTERVALS;
      }
    };
  }

  @Override
  public int minExtent() {
    return small.minExtent();
  }

  @Override
  public Collection<IntervalsSource> pullUpDisjunctions() {
    return Disjunctions.pullUp(big, s -> new ContainedByIntervalsSource(small, s));
  }

  @Override
  public int hashCode() {
    return Objects.hashCode(subSources);
  }

  @Override
  public boolean equals(Object other) {
    if (!(other instanceof ContainedByIntervalsSource)) {
      return false;
    }
    final ContainedByIntervalsSource that = (ContainedByIntervalsSource) other;
    return Objects.equals(this.subSources, that.subSources);
  }

  @Override
  public String toString() {
    return "CONTAINED_BY(" + small + "," + big + ")";
  }
}

View File

@ -0,0 +1,90 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
/**
 * Yields intervals from a 'big' source that contain at least one interval from
 * a 'small' source.
 */
class ContainingIntervalsSource extends ConjunctionIntervalsSource {

  private final IntervalsSource big;
  private final IntervalsSource small;

  /**
   * Builds the source, pulling up any disjunctions found in {@code big} so that
   * each alternative is checked on its own.
   *
   * @param big   the source whose intervals are returned
   * @param small the source whose intervals must be contained
   */
  static IntervalsSource build(IntervalsSource big, IntervalsSource small) {
    return Intervals.or(Disjunctions.pullUp(big, s -> new ContainingIntervalsSource(s, small)));
  }

  private ContainingIntervalsSource(IntervalsSource big, IntervalsSource small) {
    // 'false' is the isMinimizing flag of ConjunctionIntervalsSource
    super(Arrays.asList(big, small), false);
    this.big = big;
    this.small = small;
  }

  @Override
  protected IntervalIterator combine(List<IntervalIterator> iterators) {
    assert iterators.size() == 2;
    IntervalIterator a = iterators.get(0);
    IntervalIterator b = iterators.get(1);
    return new FilteringIntervalIterator(a, b) {
      @Override
      public int nextInterval() throws IOException {
        if (bpos == false) {
          return IntervalIterator.NO_MORE_INTERVALS;
        }
        while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
          // skip small intervals that both start and end before the current big one
          while (b.start() < a.start() && b.end() < a.end()) {
            if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
              // Clear bpos so that start()/end() report NO_MORE_INTERVALS and
              // later calls return immediately, as ContainedByIntervalsSource does.
              bpos = false;
              return IntervalIterator.NO_MORE_INTERVALS;
            }
          }
          if (a.start() <= b.start() && a.end() >= b.end()) {
            return a.start();
          }
        }
        // the big iterator is exhausted: mark this iterator as finished too
        bpos = false;
        return IntervalIterator.NO_MORE_INTERVALS;
      }
    };
  }

  @Override
  public int minExtent() {
    return big.minExtent();
  }

  @Override
  public Collection<IntervalsSource> pullUpDisjunctions() {
    return Disjunctions.pullUp(big, s -> new ContainingIntervalsSource(s, small));
  }

  @Override
  public int hashCode() {
    return Objects.hash(this.subSources);
  }

  @Override
  public boolean equals(Object other) {
    if (other instanceof ContainingIntervalsSource == false) {
      return false;
    }
    ContainingIntervalsSource o = (ContainingIntervalsSource) other;
    return Objects.equals(this.subSources, o.subSources);
  }

  @Override
  public String toString() {
    return "CONTAINING(" + big + "," + small + ")";
  }
}

View File

@ -1,235 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
/**
* A function that takes two interval iterators and combines them to produce a third,
* generally by computing a difference interval between them
*/
abstract class DifferenceIntervalFunction {
// Subclasses must supply their own identity: hashCode, equals and toString are
// re-declared abstract so the Object defaults cannot be inherited silently.
@Override
public abstract int hashCode();
@Override
public abstract boolean equals(Object obj);
@Override
public abstract String toString();
/**
* Combine two interval iterators into a third
*
* @param minuend the iterator whose intervals are filtered
* @param subtrahend the iterator whose intervals are used to reject minuend intervals
* @return an iterator over the intervals of the minuend that survive the filter
*/
public abstract IntervalIterator apply(IntervalIterator minuend, IntervalIterator subtrahend);
/**
* Filters the minuend iterator so that only intervals that do not overlap intervals from the
* subtrahend iterator are returned
*/
static final DifferenceIntervalFunction NON_OVERLAPPING = new SingletonFunction("NON_OVERLAPPING") {
@Override
public IntervalIterator apply(IntervalIterator minuend, IntervalIterator subtrahend) {
return new NonOverlappingIterator(minuend, subtrahend);
}
};
/**
* Filters the minuend iterator so that only intervals that do not contain intervals from the
* subtrahend iterator are returned
*/
static final DifferenceIntervalFunction NOT_CONTAINING = new SingletonFunction("NOT_CONTAINING") {
@Override
public IntervalIterator apply(IntervalIterator minuend, IntervalIterator subtrahend) {
return new NotContainingIterator(minuend, subtrahend);
}
};
/**
* Filters the minuend iterator so that only intervals that are not contained by intervals from
* the subtrahend iterator are returned
*/
static final DifferenceIntervalFunction NOT_CONTAINED_BY = new SingletonFunction("NOT_CONTAINED_BY") {
@Override
public IntervalIterator apply(IntervalIterator minuend, IntervalIterator subtrahend) {
return new NotContainedByIterator(minuend, subtrahend);
}
};
/**
* Base class for the difference iterators. Document iteration, cost and the reported
* interval (start, end, gaps) are all delegated to iterator a; iterator b is only
* consulted by the subclasses' nextInterval() implementations.
*/
private static abstract class RelativeIterator extends IntervalIterator {
final IntervalIterator a;
final IntervalIterator b;
// true when b is positioned on the same document as a (set by reset())
boolean bpos;
RelativeIterator(IntervalIterator a, IntervalIterator b) {
this.a = a;
this.b = b;
}
@Override
public int docID() {
return a.docID();
}
@Override
public int nextDoc() throws IOException {
int doc = a.nextDoc();
// re-align b with the new document
reset();
return doc;
}
@Override
public int advance(int target) throws IOException {
int doc = a.advance(target);
// re-align b with the new document
reset();
return doc;
}
@Override
public long cost() {
return a.cost();
}
/**
* Tries to position b on the document a is currently on and records in bpos
* whether that succeeded. b is only advanced when it is behind a.
*/
protected void reset() throws IOException {
int doc = a.docID();
bpos = b.docID() == doc ||
(b.docID() < doc && b.advance(doc) == doc);
}
@Override
public int start() {
return a.start();
}
@Override
public int end() {
return a.end();
}
@Override
public int gaps() {
return a.gaps();
}
@Override
public float matchCost() {
return a.matchCost() + b.matchCost();
}
}
/**
* Returns the intervals of the minuend that are not overlapped by an interval of the subtrahend.
*/
private static class NonOverlappingIterator extends RelativeIterator {
private NonOverlappingIterator(IntervalIterator minuend, IntervalIterator subtrahend) {
super(minuend, subtrahend);
}
@Override
public int nextInterval() throws IOException {
// b is not on this document (or has run out): every interval of a passes
if (bpos == false)
return a.nextInterval();
while (a.nextInterval() != NO_MORE_INTERVALS) {
// skip b intervals that end before the current a interval starts
while (b.end() < a.start()) {
if (b.nextInterval() == NO_MORE_INTERVALS) {
// b ran out: accept this interval and stop consulting b from now on
bpos = false;
return a.start();
}
}
// accept a only when b starts after a has ended
if (b.start() > a.end())
return a.start();
}
return NO_MORE_INTERVALS;
}
}
/**
* Iterator backing NOT_CONTAINING.
*/
private static class NotContainingIterator extends RelativeIterator {
private NotContainingIterator(IntervalIterator minuend, IntervalIterator subtrahend) {
super(minuend, subtrahend);
}
@Override
public int nextInterval() throws IOException {
// b is not on this document (or has run out): every interval of a passes
if (bpos == false)
return a.nextInterval();
while (a.nextInterval() != NO_MORE_INTERVALS) {
// skip b intervals that both start and end before the current a interval does
while (b.start() < a.start() && b.end() < a.end()) {
if (b.nextInterval() == NO_MORE_INTERVALS) {
// b ran out: accept this interval and stop consulting b from now on
bpos = false;
return a.start();
}
}
// accept a only when b starts after a has ended
if (b.start() > a.end())
return a.start();
}
return NO_MORE_INTERVALS;
}
}
/**
* Iterator backing NOT_CONTAINED_BY.
*/
private static class NotContainedByIterator extends RelativeIterator {
NotContainedByIterator(IntervalIterator a, IntervalIterator b) {
super(a, b);
}
@Override
public int nextInterval() throws IOException {
// b is not on this document: every interval of a passes
if (bpos == false)
return a.nextInterval();
while (a.nextInterval() != NO_MORE_INTERVALS) {
// skip b intervals that end before the current a interval ends
while (b.end() < a.end()) {
// NOTE(review): unlike the two sibling iterators, bpos is not cleared here
// when b runs out - confirm whether that is intentional
if (b.nextInterval() == NO_MORE_INTERVALS)
return a.start();
}
// accept a when it starts before b does, i.e. b cannot contain it
if (a.start() < b.start())
return a.start();
}
return NO_MORE_INTERVALS;
}
}
/**
* Base class for the constant functions above: equality and hash code are
* identity-based, and toString() returns the name given at construction.
*/
private static abstract class SingletonFunction extends DifferenceIntervalFunction {
private final String name;
SingletonFunction(String name) {
this.name = name;
}
@Override
public int hashCode() {
return System.identityHashCode(this);
}
@Override
public boolean equals(Object obj) {
return obj == this;
}
@Override
public String toString() {
return name;
}
}
}

View File

@ -18,38 +18,37 @@
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.QueryVisitor;
class DifferenceIntervalsSource extends IntervalsSource {
abstract class DifferenceIntervalsSource extends IntervalsSource {
private final IntervalsSource minuend;
private final IntervalsSource subtrahend;
private final DifferenceIntervalFunction function;
final IntervalsSource minuend;
final IntervalsSource subtrahend;
DifferenceIntervalsSource(IntervalsSource minuend, IntervalsSource subtrahend, DifferenceIntervalFunction function) {
DifferenceIntervalsSource(IntervalsSource minuend, IntervalsSource subtrahend) {
this.minuend = minuend;
this.subtrahend = subtrahend;
this.function = function;
}
protected abstract IntervalIterator combine(IntervalIterator minuend, IntervalIterator subtrahend);
@Override
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
public final IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
IntervalIterator minIt = minuend.intervals(field, ctx);
if (minIt == null)
return null;
IntervalIterator subIt = subtrahend.intervals(field, ctx);
if (subIt == null)
return minIt;
return function.apply(minIt, subIt);
return combine(minIt, subIt);
}
@Override
public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
public final MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
MatchesIterator minIt = minuend.matches(field, ctx, doc);
if (minIt == null) {
return null;
@ -58,30 +57,10 @@ class DifferenceIntervalsSource extends IntervalsSource {
if (subIt == null) {
return minIt;
}
IntervalIterator difference = function.apply(IntervalMatches.wrapMatches(minIt, doc), IntervalMatches.wrapMatches(subIt, doc));
IntervalIterator difference = combine(IntervalMatches.wrapMatches(minIt, doc), IntervalMatches.wrapMatches(subIt, doc));
return IntervalMatches.asMatches(difference, minIt, doc);
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
DifferenceIntervalsSource that = (DifferenceIntervalsSource) o;
return Objects.equals(minuend, that.minuend) &&
Objects.equals(subtrahend, that.subtrahend) &&
Objects.equals(function, that.function);
}
@Override
public int hashCode() {
return Objects.hash(minuend, subtrahend, function);
}
@Override
public String toString() {
return function + "(" + minuend + ", " + subtrahend + ")";
}
@Override
public void visit(String field, QueryVisitor visitor) {
IntervalQuery q = new IntervalQuery(field, this);

View File

@ -19,8 +19,11 @@ package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.index.LeafReaderContext;
@ -34,10 +37,23 @@ import org.apache.lucene.util.PriorityQueue;
class DisjunctionIntervalsSource extends IntervalsSource {
final List<IntervalsSource> subSources;
final Collection<IntervalsSource> subSources;
public DisjunctionIntervalsSource(List<IntervalsSource> subSources) {
this.subSources = subSources;
public DisjunctionIntervalsSource(Collection<IntervalsSource> subSources) {
this.subSources = simplify(subSources);
}
/**
 * Collapses nested disjunctions and removes duplicates: a sub-source that is
 * itself a disjunction contributes its pulled-up alternatives; any other
 * sub-source is kept as is.
 */
private static Collection<IntervalsSource> simplify(Collection<IntervalsSource> sources) {
  final Set<IntervalsSource> unique = new HashSet<>();
  for (IntervalsSource candidate : sources) {
    if (!(candidate instanceof DisjunctionIntervalsSource)) {
      unique.add(candidate);
      continue;
    }
    unique.addAll(candidate.pullUpDisjunctions());
  }
  return unique;
}
@Override
@ -95,13 +111,18 @@ class DisjunctionIntervalsSource extends IntervalsSource {
@Override
public int minExtent() {
int minExtent = subSources.get(0).minExtent();
for (int i = 1; i < subSources.size(); i++) {
minExtent = Math.min(minExtent, subSources.get(i).minExtent());
int minExtent = Integer.MAX_VALUE;
for (IntervalsSource subSource : subSources) {
minExtent = Math.min(minExtent, subSource.minExtent());
}
return minExtent;
}
/**
* A disjunction's alternatives are simply its sub-sources; the collection held by
* this source is returned directly, not a copy.
*/
@Override
public Collection<IntervalsSource> pullUpDisjunctions() {
return subSources;
}
static class DisjunctionIntervalIterator extends IntervalIterator {
final DocIdSetIterator approximation;

View File

@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.lucene.search.BooleanQuery;
/**
 * Static helpers that rewrite interval sources by pulling disjunctions to the
 * top of the source tree.
 */
final class Disjunctions {

  /** Static utility holder; not meant to be instantiated. */
  private Disjunctions() {}

  /**
   * Given a list of sources that contain disjunctions, and a combiner function,
   * pulls the disjunctions to the top of the source tree,
   * eg FUNC(a, b, OR(c, "d e")) => [FUNC(a, b, c), FUNC(a, b, "d e")]
   *
   * @param sources  the sub-sources of the combining source, in order
   * @param function builds the combining source from one concrete list of sub-sources
   * @return one combined source per combination of disjuncts
   * @throws IllegalArgumentException if expanding a disjunction would produce more
   *         combinations than {@code BooleanQuery.getMaxClauseCount()}
   */
  public static List<IntervalsSource> pullUp(List<IntervalsSource> sources,
                                             Function<List<IntervalsSource>, IntervalsSource> function) {

    List<List<IntervalsSource>> rewritten = new ArrayList<>();
    rewritten.add(new ArrayList<>());
    for (IntervalsSource source : sources) {
      List<IntervalsSource> disjuncts = splitDisjunctions(source);
      if (disjuncts.size() == 1) {
        rewritten.forEach(l -> l.add(disjuncts.get(0)));
      }
      else {
        // multiply as long so that a huge expansion cannot overflow int and slip past the limit
        if ((long) rewritten.size() * disjuncts.size() > BooleanQuery.getMaxClauseCount()) {
          throw new IllegalArgumentException("Too many disjunctions to expand");
        }
        List<List<IntervalsSource>> toAdd = new ArrayList<>();
        for (IntervalsSource disj : disjuncts) {
          // clone the rewritten list, then append the disjunct
          for (List<IntervalsSource> subList : rewritten) {
            List<IntervalsSource> l = new ArrayList<>(subList);
            l.add(disj);
            toAdd.add(l);
          }
        }
        rewritten = toAdd;
      }
    }
    if (rewritten.size() == 1) {
      return Collections.singletonList(function.apply(rewritten.get(0)));
    }
    return rewritten.stream().map(function).collect(Collectors.toList());
  }

  /**
   * Given a source containing disjunctions, and a mapping function,
   * pulls the disjunctions to the top of the source tree.
   *
   * @param source   the source to split into its disjuncts
   * @param function wraps each individual disjunct
   * @return one wrapped source per disjunct
   */
  public static List<IntervalsSource> pullUp(IntervalsSource source, Function<IntervalsSource, IntervalsSource> function) {
    List<IntervalsSource> disjuncts = splitDisjunctions(source);
    if (disjuncts.size() == 1) {
      return Collections.singletonList(function.apply(disjuncts.get(0)));
    }
    return disjuncts.stream().map(function).collect(Collectors.toList());
  }

  /**
   * Separate out disjunctions into individual sources.
   *
   * Clauses that have a minExtent of 1 are grouped together and treated as a single
   * source, as any overlapping intervals of length 1 can be treated as identical,
   * and we know that all combinatorial sources have a minExtent > 1.
   * The grouped singletons, if any, come first in the returned list.
   */
  private static List<IntervalsSource> splitDisjunctions(IntervalsSource source) {
    List<IntervalsSource> singletons = new ArrayList<>();
    List<IntervalsSource> nonSingletons = new ArrayList<>();
    for (IntervalsSource disj : source.pullUpDisjunctions()) {
      if (disj.minExtent() == 1) {
        singletons.add(disj);
      }
      else {
        nonSingletons.add(disj);
      }
    }
    List<IntervalsSource> split = new ArrayList<>();
    if (singletons.size() > 0) {
      split.add(Intervals.or(singletons.toArray(new IntervalsSource[0])));
    }
    split.addAll(nonSingletons);
    return split;
  }
}

View File

@ -18,7 +18,10 @@
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.MatchesIterator;
@ -69,6 +72,15 @@ class ExtendedIntervalsSource extends IntervalsSource {
return minExtent;
}
/**
 * Pulls disjunctions up through this source: each alternative of the wrapped
 * source is wrapped again with the same {@code before}/{@code after} values.
 *
 * When the wrapped source has at most one alternative there is nothing to
 * distribute over, so this source is returned as is, without building a new
 * equivalent wrapper.
 */
@Override
public Collection<IntervalsSource> pullUpDisjunctions() {
  Collection<IntervalsSource> inner = source.pullUpDisjunctions();
  if (inner.size() <= 1) {
    return Collections.singleton(this);
  }
  return inner.stream().map(s -> new ExtendedIntervalsSource(s, before, after)).collect(Collectors.toSet());
}
@Override
public boolean equals(Object o) {
if (this == o) return true;

View File

@ -18,7 +18,10 @@
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.MatchesIterator;
@ -29,15 +32,58 @@ import org.apache.lucene.search.QueryVisitor;
*/
public abstract class FilteredIntervalsSource extends IntervalsSource {
/**
 * Creates a source that only accepts intervals of {@code in} with at most
 * {@code maxGaps} gaps. Disjunctions in {@code in} are pulled up first, so
 * each alternative gets its own filter and the results are OR-ed together.
 */
public static IntervalsSource maxGaps(IntervalsSource in, int maxGaps) {
  return Intervals.or(
      in.pullUpDisjunctions()
          .stream()
          .map(alternative -> new MaxGaps(alternative, maxGaps))
          .collect(Collectors.toList()));
}
/**
* Filter that accepts only intervals whose gaps() value does not exceed maxGaps.
*/
private static class MaxGaps extends FilteredIntervalsSource {
private final int maxGaps;
MaxGaps(IntervalsSource in, int maxGaps) {
// the limit is part of the filter name handed to the superclass
super("MAXGAPS/" + maxGaps, in);
this.maxGaps = maxGaps;
}
@Override
protected boolean accept(IntervalIterator it) {
return it.gaps() <= maxGaps;
}
}
/**
* Creates a source that only accepts intervals of the given source that are at most
* maxWidth positions wide.
*/
public static IntervalsSource maxWidth(IntervalsSource in, int maxWidth) {
return new MaxWidth(in, maxWidth);
}
/**
 * Filter that accepts only intervals spanning at most {@code maxWidth}
 * positions, counting both the start and the end position.
 */
private static class MaxWidth extends FilteredIntervalsSource {

  private final int maxWidth;

  MaxWidth(IntervalsSource in, int maxWidth) {
    // the limit is part of the filter name handed to the superclass
    super("MAXWIDTH/" + maxWidth, in);
    this.maxWidth = maxWidth;
  }

  @Override
  protected boolean accept(IntervalIterator it) {
    final int width = (it.end() - it.start()) + 1;
    return width <= maxWidth;
  }

  /** Distributes the width filter over the disjuncts of the wrapped source. */
  @Override
  public Collection<IntervalsSource> pullUpDisjunctions() {
    return Disjunctions.pullUp(in, disjunct -> new MaxWidth(disjunct, maxWidth));
  }
}
private final String name;
private final IntervalsSource in;
protected final IntervalsSource in;
/**
* Create a new FilteredIntervalsSource
* @param name the name of the filter
* @param in the source to filter
*/
public FilteredIntervalsSource(String name, IntervalsSource in) {
private FilteredIntervalsSource(String name, IntervalsSource in) {
this.name = name;
this.in = in;
}
@ -81,6 +127,11 @@ public abstract class FilteredIntervalsSource extends IntervalsSource {
return in.minExtent();
}
/**
* Default for filtered sources: the filter is treated as a single unit and no
* disjunctions are pulled out of it.
*/
@Override
public Collection<IntervalsSource> pullUpDisjunctions() {
return Collections.singletonList(this);
}
@Override
public void visit(String field, QueryVisitor visitor) {
in.visit(field, visitor);
@ -89,7 +140,7 @@ public abstract class FilteredIntervalsSource extends IntervalsSource {
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
if (o == null || o instanceof FilteredIntervalsSource == false) return false;
FilteredIntervalsSource that = (FilteredIntervalsSource) o;
return Objects.equals(name, that.name) &&
Objects.equals(in, that.in);

View File

@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Arrays;
/**
 * Base class for iterators built from two sub-iterators, {@code a} and {@code b}.
 *
 * Positions and gaps are read from {@code a}. The {@code bpos} flag is set by
 * {@link #reset()} to record whether {@code b} produced an interval; while it is
 * {@code false}, {@link #start()} and {@link #end()} report {@code NO_MORE_INTERVALS}.
 */
abstract class FilteringIntervalIterator extends ConjunctionIntervalIterator {

  final IntervalIterator a;
  final IntervalIterator b;

  boolean bpos;

  protected FilteringIntervalIterator(IntervalIterator a, IntervalIterator b) {
    super(Arrays.asList(a, b));
    this.a = a;
    this.b = b;
  }

  @Override
  public int start() {
    return bpos ? a.start() : NO_MORE_INTERVALS;
  }

  @Override
  public int end() {
    return bpos ? a.end() : NO_MORE_INTERVALS;
  }

  @Override
  public int gaps() {
    return a.gaps();
  }

  @Override
  protected void reset() throws IOException {
    // position b on its first interval and remember whether it had one
    final int firstOfB = b.nextInterval();
    bpos = firstOfB != NO_MORE_INTERVALS;
  }
}

View File

@ -18,7 +18,10 @@
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.MatchesIterator;
@ -54,6 +57,15 @@ class FixedFieldIntervalsSource extends IntervalsSource {
return source.minExtent();
}
/**
 * Pulls disjunctions out of the wrapped source. When the wrapped source reports
 * exactly one element this instance is returned unchanged; otherwise each
 * reported element is re-wrapped with this source's field.
 */
@Override
public Collection<IntervalsSource> pullUpDisjunctions() {
  final Collection<IntervalsSource> pulled = source.pullUpDisjunctions();
  if (pulled.size() != 1) {
    return pulled.stream()
        .map(sub -> new FixedFieldIntervalsSource(field, sub))
        .collect(Collectors.toSet());
  }
  return Collections.singleton(this);
}
@Override
public boolean equals(Object o) {
if (this == o) return true;

View File

@ -85,7 +85,7 @@ public abstract class IntervalFilter extends IntervalIterator {
do {
next = in.nextInterval();
}
while (accept() == false && next != IntervalIterator.NO_MORE_INTERVALS);
while (next != IntervalIterator.NO_MORE_INTERVALS && accept() == false);
return next;
}

View File

@ -1,501 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.util.PriorityQueue;
/**
* Combine a list of {@link IntervalIterator}s into another
*/
abstract class IntervalFunction {
@Override
public abstract int hashCode();
@Override
public abstract boolean equals(Object obj);
@Override
public abstract String toString();
/**
* Combine the iterators into another iterator
*/
public abstract IntervalIterator apply(List<IntervalIterator> iterators);
/**
 * Combines the sub-iterators with a {@link BlockIntervalIterator}
 */
static final IntervalFunction BLOCK = new SingletonFunction("BLOCK") {
  @Override
  public IntervalIterator apply(List<IntervalIterator> subs) {
    final IntervalIterator block = new BlockIntervalIterator(subs);
    return block;
  }
};
/**
 * Iterator over runs of sub-intervals that follow each other directly: each
 * sub-iterator must start exactly one position after the previous one ends.
 */
private static class BlockIntervalIterator extends ConjunctionIntervalIterator {
// bounds of the current block; -1 until the first call to nextInterval()
int start = -1, end = -1;
BlockIntervalIterator(List<IntervalIterator> subIterators) {
super(subIterators);
}
@Override
public int start() {
return start;
}
@Override
public int end() {
return end;
}
@Override
public int gaps() {
// always reports zero gaps
return 0;
}
@Override
public int nextInterval() throws IOException {
// advance the first sub-iterator; once it is exhausted there is nothing more to report
if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return start = end = IntervalIterator.NO_MORE_INTERVALS;
int i = 1;
while (i < subIterators.size()) {
// move sub-iterator i forward until it starts after its predecessor ends
while (subIterators.get(i).start() <= subIterators.get(i - 1).end()) {
if (subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return start = end = IntervalIterator.NO_MORE_INTERVALS;
}
if (subIterators.get(i).start() == subIterators.get(i - 1).end() + 1) {
// directly adjacent to its predecessor: check the next sub-iterator
i = i + 1;
}
else {
// a gap was found: advance the first sub-iterator and re-check from the second
if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return start = end = IntervalIterator.NO_MORE_INTERVALS;
i = 1;
}
}
// the block spans from the first sub-iterator's start to the last sub-iterator's end
start = subIterators.get(0).start();
end = subIterators.get(subIterators.size() - 1).end();
return start;
}
@Override
protected void reset() {
start = end = -1;
}
}
/**
 * Combines the sub-iterators with an {@link OrderedIntervalIterator}, which reports
 * intervals where the sub-iterators appear in the given order
 */
static final IntervalFunction ORDERED = new SingletonFunction("ORDERED") {
  @Override
  public IntervalIterator apply(List<IntervalIterator> subs) {
    final IntervalIterator ordered = new OrderedIntervalIterator(subs);
    return ordered;
  }
};
/**
 * Iterator over intervals in which each sub-iterator starts after the previous
 * sub-iterator ends.
 */
private static class OrderedIntervalIterator extends ConjunctionIntervalIterator {
// start/end: bounds of the last recorded interval; i: index of the sub-iterator being aligned
int start = -1, end = -1, i;
// end of the first sub-iterator at the time the interval was recorded; read by gaps()
int firstEnd;
private OrderedIntervalIterator(List<IntervalIterator> subIntervals) {
super(subIntervals);
}
@Override
public int start() {
return start;
}
@Override
public int end() {
return end;
}
@Override
public int nextInterval() throws IOException {
start = end = IntervalIterator.NO_MORE_INTERVALS;
// b is the start of the last sub-iterator in the most recently recorded candidate;
// it stays at MAX_VALUE until a candidate has been recorded
int b = Integer.MAX_VALUE;
i = 1;
while (true) {
while (true) {
// the previous sub-iterator has reached b: report whatever has been recorded so far
if (subIterators.get(i - 1).end() >= b)
return start;
// every sub-iterator is aligned, or sub-iterator i already starts after its predecessor
if (i == subIterators.size() || subIterators.get(i).start() > subIterators.get(i - 1).end())
break;
// advance sub-iterator i until it starts after its predecessor ends,
// stopping if it reaches b or runs out of intervals
do {
if (subIterators.get(i).end() >= b || subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return start;
}
while (subIterators.get(i).start() <= subIterators.get(i - 1).end());
i++;
}
// record the current alignment as the candidate interval
start = subIterators.get(0).start();
if (start == NO_MORE_INTERVALS) {
return end = NO_MORE_INTERVALS;
}
// keep the first sub-iterator's end now, because that sub-iterator is advanced below
firstEnd = subIterators.get(0).end();
end = subIterators.get(subIterators.size() - 1).end();
b = subIterators.get(subIterators.size() - 1).start();
i = 1;
// advance the first sub-iterator and loop again, bounded by b
// (presumably looking for a narrower interval - confirm against the referenced paper)
if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return start;
}
}
@Override
public int gaps() {
// gap between the recorded end of the first sub-iterator and the second sub-iterator
int gaps = subIterators.get(1).start() - firstEnd - 1;
// plus the gaps between each following pair of neighbouring sub-iterators
for (int i = 2; i < subIterators.size(); i++) {
gaps += (subIterators.get(i).start() - subIterators.get(i - 1).end() - 1);
}
return gaps;
}
@Override
protected void reset() throws IOException {
// position the first sub-iterator on its first interval
subIterators.get(0).nextInterval();
i = 1;
start = end = firstEnd = -1;
}
}
/**
 * Combines the sub-iterators with an {@link UnorderedIntervalIterator} that allows
 * overlaps, reporting intervals where the sub-iterators appear in any order
 */
static final IntervalFunction UNORDERED = new SingletonFunction("UNORDERED") {
  @Override
  public IntervalIterator apply(List<IntervalIterator> subs) {
    final boolean allowOverlaps = true;
    return new UnorderedIntervalIterator(subs, allowOverlaps);
  }
};
/**
 * Combines the sub-iterators with an {@link UnorderedIntervalIterator} that does not
 * allow overlaps, reporting intervals where the sub-iterators appear in any order
 */
static final IntervalFunction UNORDERED_NO_OVERLAP = new SingletonFunction("UNORDERED_NO_OVERLAP") {
  @Override
  public IntervalIterator apply(List<IntervalIterator> subs) {
    final boolean allowOverlaps = false;
    return new UnorderedIntervalIterator(subs, allowOverlaps);
  }
};
/**
 * Iterator over intervals that contain an interval from every sub-iterator, in any
 * order. When {@code allowOverlaps} is false, a sub-iterator is advanced past any
 * interval that overlaps the current interval of an iterator already in the queue.
 */
private static class UnorderedIntervalIterator extends ConjunctionIntervalIterator {
private final PriorityQueue<IntervalIterator> queue;
private final IntervalIterator[] subIterators;
// scratch array for gaps(): two slots (start, end) per sub-iterator
private final int[] innerPositions;
private final boolean allowOverlaps;
// start/end: bounds of the reported interval; firstEnd: end of the queue top when the
// interval was recorded; queueEnd: largest end seen among iterators added to the queue
int start = -1, end = -1, firstEnd, queueEnd;
UnorderedIntervalIterator(List<IntervalIterator> subIterators, boolean allowOverlaps) {
super(subIterators);
this.queue = new PriorityQueue<IntervalIterator>(subIterators.size()) {
@Override
protected boolean lessThan(IntervalIterator a, IntervalIterator b) {
// order by start; for equal starts the iterator with the greater-or-equal end sorts first
return a.start() < b.start() || (a.start() == b.start() && a.end() >= b.end());
}
};
this.subIterators = new IntervalIterator[subIterators.size()];
this.innerPositions = new int[subIterators.size() * 2];
this.allowOverlaps = allowOverlaps;
for (int i = 0; i < subIterators.size(); i++) {
this.subIterators[i] = subIterators.get(i);
}
}
@Override
public int start() {
return start;
}
@Override
public int end() {
return end;
}
// raises queueEnd to the given iterator's end if that end is larger
void updateRightExtreme(IntervalIterator it) {
int itEnd = it.end();
if (itEnd > queueEnd) {
queueEnd = itEnd;
}
}
@Override
public int nextInterval() throws IOException {
// first, find a matching interval
// (advance every queued iterator whose start equals the previously reported start)
while (this.queue.size() == subIterators.length && queue.top().start() == start) {
IntervalIterator it = queue.pop();
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
if (allowOverlaps == false) {
while (hasOverlaps(it)) {
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return start = end = IntervalIterator.NO_MORE_INTERVALS;
}
}
queue.add(it);
updateRightExtreme(it);
}
}
// an exhausted iterator is not re-added, so a short queue means no further matches
if (this.queue.size() < subIterators.length)
return start = end = IntervalIterator.NO_MORE_INTERVALS;
// then, minimize it
do {
start = queue.top().start();
firstEnd = queue.top().end();
end = queueEnd;
// the top iterator already reaches the right edge: report the interval as it stands
if (queue.top().end() == end)
return start;
IntervalIterator it = queue.pop();
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
if (allowOverlaps == false) {
while (hasOverlaps(it)) {
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
return start;
}
}
}
queue.add(it);
updateRightExtreme(it);
}
// keep going while every iterator is still queued and the right edge has not moved
} while (this.queue.size() == subIterators.length && end == queueEnd);
return start;
}
@Override
public int gaps() {
for (int i = 0; i < subIterators.length; i++) {
if (subIterators[i].end() > end) {
// this sub-iterator now ends beyond the reported interval (presumably because it was
// advanced during minimisation), so use the recorded start/firstEnd in its place
innerPositions[i * 2] = start;
innerPositions[i * 2 + 1] = firstEnd;
}
else {
innerPositions[i * 2] = subIterators[i].start();
innerPositions[i * 2 + 1] = subIterators[i].end();
}
}
// sort all recorded positions together, then sum the distance between each
// odd-indexed entry and the entry that follows it
Arrays.sort(innerPositions);
int gaps = 0;
for (int i = 1; i < subIterators.length; i++) {
gaps += (innerPositions[i * 2] - innerPositions[i * 2 - 1] - 1);
}
return gaps;
}
@Override
protected void reset() throws IOException {
queueEnd = start = end = -1;
this.queue.clear();
// position each sub-iterator on its first usable interval and queue it;
// stop filling the queue as soon as any sub-iterator runs out
loop: for (IntervalIterator it : subIterators) {
if (it.nextInterval() == NO_MORE_INTERVALS) {
break;
}
if (allowOverlaps == false) {
while (hasOverlaps(it)) {
if (it.nextInterval() == NO_MORE_INTERVALS) {
break loop;
}
}
}
queue.add(it);
updateRightExtreme(it);
}
}
// returns true if the candidate's current interval intersects the current interval
// of any iterator in the queue
private boolean hasOverlaps(IntervalIterator candidate) {
for (IntervalIterator it : queue) {
if (it.start() < candidate.start()) {
// queued interval starts first: overlap if it reaches the candidate's start
if (it.end() >= candidate.start()) {
return true;
}
continue;
}
if (it.start() == candidate.start()) {
return true;
}
// queued interval starts later: overlap if it starts no later than the candidate's end
if (it.start() <= candidate.end()) {
return true;
}
}
return false;
}
}
/**
 * Returns an iterator over intervals from the first iterator that contain an
 * interval from the second iterator.
 *
 * {@link #apply(List)} throws {@link IllegalStateException} unless it is given
 * exactly two iterators.
 */
static final IntervalFunction CONTAINING = new SingletonFunction("CONTAINING") {
  @Override
  public IntervalIterator apply(List<IntervalIterator> iterators) {
    if (iterators.size() != 2)
      throw new IllegalStateException("CONTAINING function requires two iterators");
    IntervalIterator a = iterators.get(0);
    IntervalIterator b = iterators.get(1);
    return new FilteringIntervalIterator(a, b) {
      @Override
      public int nextInterval() throws IOException {
        if (bpos == false)
          return IntervalIterator.NO_MORE_INTERVALS;
        while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
          // skip intervals of b that both start and end before the current interval of a
          while (b.start() < a.start() && b.end() < a.end()) {
            if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
              // mark the iterator exhausted so that start() and end() report
              // NO_MORE_INTERVALS, as CONTAINED_BY and OVERLAPPING already do
              bpos = false;
              return IntervalIterator.NO_MORE_INTERVALS;
            }
          }
          if (a.start() <= b.start() && a.end() >= b.end())
            return a.start();
        }
        // a is exhausted: mark this iterator exhausted as well
        bpos = false;
        return IntervalIterator.NO_MORE_INTERVALS;
      }
    };
  }
};
/**
 * Returns an iterator over intervals from the first iterator that are contained
 * by an interval from the second iterator.
 *
 * {@link #apply(List)} throws {@link IllegalStateException} unless it is given
 * exactly two iterators.
 */
static final IntervalFunction CONTAINED_BY = new SingletonFunction("CONTAINED_BY") {
  @Override
  public IntervalIterator apply(List<IntervalIterator> pair) {
    if (pair.size() != 2) {
      throw new IllegalStateException("CONTAINED_BY function requires two iterators");
    }
    return new FilteringIntervalIterator(pair.get(0), pair.get(1)) {

      /** Marks this iterator as finished and yields the end-of-intervals sentinel */
      private int exhausted() {
        bpos = false;
        return IntervalIterator.NO_MORE_INTERVALS;
      }

      @Override
      public int nextInterval() throws IOException {
        if (!bpos) {
          return IntervalIterator.NO_MORE_INTERVALS;
        }
        while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
          // bring b forward until it ends at or after the end of a
          while (b.end() < a.end()) {
            if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
              return exhausted();
            }
          }
          if (b.start() <= a.start()) {
            return a.start();
          }
        }
        return exhausted();
      }
    };
  }
};
/**
 * Returns an iterator over intervals from the first iterator that overlap an
 * interval from the second iterator.
 *
 * {@link #apply(List)} throws {@link IllegalStateException} unless it is given
 * exactly two iterators.
 */
static final IntervalFunction OVERLAPPING = new SingletonFunction("OVERLAPPING") {
  @Override
  public IntervalIterator apply(List<IntervalIterator> pair) {
    if (pair.size() != 2) {
      throw new IllegalStateException("OVERLAPPING function requires two iterators");
    }
    return new FilteringIntervalIterator(pair.get(0), pair.get(1)) {

      /** Marks this iterator as finished and yields the end-of-intervals sentinel */
      private int exhausted() {
        bpos = false;
        return IntervalIterator.NO_MORE_INTERVALS;
      }

      @Override
      public int nextInterval() throws IOException {
        if (!bpos) {
          return IntervalIterator.NO_MORE_INTERVALS;
        }
        while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
          // bring b forward until it ends at or after the start of a
          while (b.end() < a.start()) {
            if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
              return exhausted();
            }
          }
          if (b.start() <= a.end()) {
            return a.start();
          }
        }
        return exhausted();
      }
    };
  }
};
/**
 * Base class for iterators built from two sub-iterators, {@code a} and {@code b}.
 *
 * Positions and gaps are read from {@code a}. The {@code bpos} flag is set by
 * {@link #reset()} to record whether {@code b} produced an interval; while it is
 * {@code false}, {@link #start()} and {@link #end()} report {@code NO_MORE_INTERVALS}.
 */
private static abstract class FilteringIntervalIterator extends ConjunctionIntervalIterator {

  final IntervalIterator a;
  final IntervalIterator b;

  boolean bpos;

  protected FilteringIntervalIterator(IntervalIterator a, IntervalIterator b) {
    super(Arrays.asList(a, b));
    this.a = a;
    this.b = b;
  }

  @Override
  public int start() {
    return bpos ? a.start() : NO_MORE_INTERVALS;
  }

  @Override
  public int end() {
    return bpos ? a.end() : NO_MORE_INTERVALS;
  }

  @Override
  public int gaps() {
    return a.gaps();
  }

  @Override
  protected void reset() throws IOException {
    // position b on its first interval and remember whether it had one
    final int firstOfB = b.nextInterval();
    bpos = firstOfB != NO_MORE_INTERVALS;
  }
}
/**
 * An {@link IntervalFunction} with identity semantics: an instance is equal only
 * to itself, hashes by identity, and prints as the name it was created with.
 */
private static abstract class SingletonFunction extends IntervalFunction {

  private final String name;

  protected SingletonFunction(String name) {
    this.name = name;
  }

  @Override
  public String toString() {
    return name;
  }

  @Override
  public boolean equals(Object obj) {
    return this == obj;
  }

  @Override
  public int hashCode() {
    return System.identityHashCode(this);
  }
}
}

View File

@ -18,6 +18,7 @@
package org.apache.lucene.search.intervals;
import java.util.Arrays;
import java.util.List;
import java.util.function.Predicate;
import org.apache.lucene.index.Term;
@ -32,6 +33,15 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
* These sources implement minimum-interval algorithms taken from the paper
* <a href="http://vigna.di.unimi.it/ftp/papers/EfficientAlgorithmsMinimalIntervalSemantics.pdf">
* Efficient Optimally Lazy Algorithms for Minimal-Interval Semantics</a>
*
* By default, sources that are sensitive to internal gaps (e.g. PHRASE and MAXGAPS) will
* rewrite their sub-sources so that disjunctions of different lengths are pulled up
* to the top of the interval tree. For example, PHRASE(OR(PHRASE("a", "b", "c"), "b"), "c")
* will automatically rewrite itself to OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c"))
* to ensure that documents containing "b c" are matched. This can lead to less efficient
* queries, as more terms need to be loaded (for example, the "c" iterator above is loaded
* twice), so if you care more about speed than about accuracy you can use the
* {@link #or(boolean, IntervalsSource...)} factory method to prevent rewriting.
*/
public final class Intervals {
@ -87,19 +97,54 @@ public final class Intervals {
* Return an {@link IntervalsSource} exposing intervals for a phrase consisting of a list of IntervalsSources
*/
public static IntervalsSource phrase(IntervalsSource... subSources) {
if (subSources.length == 1) {
return subSources[0];
}
return new ConjunctionIntervalsSource(Arrays.asList(subSources), IntervalFunction.BLOCK);
return BlockIntervalsSource.build(Arrays.asList(subSources));
}
/**
 * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
 *
 * Equivalent to calling {@link #or(boolean, List)} with {@code rewrite} set to
 * {@code true}, so the disjunction is rewritten automatically when wrapped by an
 * interval source that is sensitive to internal gaps
 */
public static IntervalsSource or(IntervalsSource... subSources) {
  final List<IntervalsSource> sources = Arrays.asList(subSources);
  return or(true, sources);
}
/**
 * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
 *
 * @param rewrite    if {@code false}, do not rewrite intervals that are sensitive to
 *                   internal gaps; this may run more efficiently, but can miss valid
 *                   hits due to minimization
 * @param subSources the sources to combine
 */
public static IntervalsSource or(boolean rewrite, IntervalsSource... subSources) {
  final List<IntervalsSource> sources = Arrays.asList(subSources);
  return or(rewrite, sources);
}
/**
* Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
*/
public static IntervalsSource or(IntervalsSource... subSources) {
if (subSources.length == 1)
return subSources[0];
return new DisjunctionIntervalsSource(Arrays.asList(subSources));
public static IntervalsSource or(List<IntervalsSource> subSources) {
return or(true, subSources);
}
/**
 * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
 *
 * A list holding exactly one source is returned as that source, unwrapped.
 *
 * @param rewrite    if {@code false}, do not rewrite intervals that are sensitive to
 *                   internal gaps; this may run more efficiently, but can miss valid
 *                   hits due to minimization
 * @param subSources the sources to combine
 */
public static IntervalsSource or(boolean rewrite, List<IntervalsSource> subSources) {
  if (subSources.size() == 1) {
    return subSources.get(0);
  }
  return rewrite
      ? new DisjunctionIntervalsSource(subSources)
      : new NoRewriteDisjunctionIntervalsSource(subSources);
}
/**
@ -130,12 +175,7 @@ public final class Intervals {
* @param subSource the sub-source to filter
*/
public static IntervalsSource maxwidth(int width, IntervalsSource subSource) {
return new FilteredIntervalsSource("MAXWIDTH/" + width, subSource) {
@Override
protected boolean accept(IntervalIterator it) {
return (it.end() - it.start()) + 1 <= width;
}
};
return FilteredIntervalsSource.maxWidth(subSource, width);
}
/**
@ -144,12 +184,7 @@ public final class Intervals {
* @param subSource the sub-source to filter
*/
public static IntervalsSource maxgaps(int gaps, IntervalsSource subSource) {
return new FilteredIntervalsSource("MAXGAPS/" + gaps, subSource) {
@Override
protected boolean accept(IntervalIterator it) {
return it.gaps() <= gaps;
}
};
return FilteredIntervalsSource.maxGaps(subSource, gaps);
}
/**
@ -181,10 +216,7 @@ public final class Intervals {
* @param subSources an ordered set of {@link IntervalsSource} objects
*/
public static IntervalsSource ordered(IntervalsSource... subSources) {
if (subSources.length == 1) {
return subSources[0];
}
return new MinimizingConjunctionIntervalsSource(Arrays.asList(subSources), IntervalFunction.ORDERED);
return OrderedIntervalsSource.build(Arrays.asList(subSources));
}
/**
@ -207,11 +239,7 @@ public final class Intervals {
* @param allowOverlaps whether or not the sources should be allowed to overlap in a hit
*/
public static IntervalsSource unordered(boolean allowOverlaps, IntervalsSource... subSources) {
if (subSources.length == 1) {
return subSources[0];
}
return new MinimizingConjunctionIntervalsSource(Arrays.asList(subSources),
allowOverlaps ? IntervalFunction.UNORDERED : IntervalFunction.UNORDERED_NO_OVERLAP);
return UnorderedIntervalsSource.build(Arrays.asList(subSources), allowOverlaps);
}
/**
@ -234,7 +262,7 @@ public final class Intervals {
* @param subtrahend the {@link IntervalsSource} to filter by
*/
public static IntervalsSource nonOverlapping(IntervalsSource minuend, IntervalsSource subtrahend) {
return new DifferenceIntervalsSource(minuend, subtrahend, DifferenceIntervalFunction.NON_OVERLAPPING);
return new NonOverlappingIntervalsSource(minuend, subtrahend);
}
/**
@ -243,7 +271,7 @@ public final class Intervals {
* @param reference the source to filter by
*/
public static IntervalsSource overlapping(IntervalsSource source, IntervalsSource reference) {
return new FilteringConjunctionIntervalsSource(source, reference, IntervalFunction.OVERLAPPING);
return new OverlappingIntervalsSource(source, reference);
}
/**
@ -258,8 +286,7 @@ public final class Intervals {
* @param subtrahend the {@link IntervalsSource} to filter by
*/
public static IntervalsSource notWithin(IntervalsSource minuend, int positions, IntervalsSource subtrahend) {
return new DifferenceIntervalsSource(minuend, Intervals.extend(subtrahend, positions, positions),
DifferenceIntervalFunction.NON_OVERLAPPING);
return new NonOverlappingIntervalsSource(minuend, Intervals.extend(subtrahend, positions, positions));
}
/**
@ -283,7 +310,7 @@ public final class Intervals {
* @param subtrahend the {@link IntervalsSource} to filter by
*/
public static IntervalsSource notContaining(IntervalsSource minuend, IntervalsSource subtrahend) {
return new DifferenceIntervalsSource(minuend, subtrahend, DifferenceIntervalFunction.NOT_CONTAINING);
return NotContainingIntervalsSource.build(minuend, subtrahend);
}
/**
@ -296,7 +323,7 @@ public final class Intervals {
* @param small the {@link IntervalsSource} to filter by
*/
public static IntervalsSource containing(IntervalsSource big, IntervalsSource small) {
return new FilteringConjunctionIntervalsSource(big, small, IntervalFunction.CONTAINING);
return ContainingIntervalsSource.build(big, small);
}
/**
@ -309,7 +336,7 @@ public final class Intervals {
* @param big the {@link IntervalsSource} to filter by
*/
public static IntervalsSource notContainedBy(IntervalsSource small, IntervalsSource big) {
return new DifferenceIntervalsSource(small, big, DifferenceIntervalFunction.NOT_CONTAINED_BY);
return NotContainedByIntervalsSource.build(small, big);
}
/**
@ -321,7 +348,7 @@ public final class Intervals {
* @param big the {@link IntervalsSource} to filter by
*/
public static IntervalsSource containedBy(IntervalsSource small, IntervalsSource big) {
return new FilteringConjunctionIntervalsSource(small, big, IntervalFunction.CONTAINED_BY);
return ContainedByIntervalsSource.build(small, big);
}
/**
@ -335,18 +362,16 @@ public final class Intervals {
* Returns intervals from the source that appear before intervals from the reference
*/
public static IntervalsSource before(IntervalsSource source, IntervalsSource reference) {
return new FilteringConjunctionIntervalsSource(source,
Intervals.extend(new OffsetIntervalsSource(reference, true), Integer.MAX_VALUE, 0),
IntervalFunction.CONTAINED_BY);
return ContainedByIntervalsSource.build(source,
Intervals.extend(new OffsetIntervalsSource(reference, true), Integer.MAX_VALUE, 0));
}
/**
* Returns intervals from the source that appear after intervals from the reference
*/
public static IntervalsSource after(IntervalsSource source, IntervalsSource reference) {
return new FilteringConjunctionIntervalsSource(source,
Intervals.extend(new OffsetIntervalsSource(reference, false), 0, Integer.MAX_VALUE),
IntervalFunction.CONTAINED_BY);
return ContainedByIntervalsSource.build(source,
Intervals.extend(new OffsetIntervalsSource(reference, false), 0, Integer.MAX_VALUE));
}
}

View File

@ -18,6 +18,7 @@
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.MatchesIterator;
@ -64,6 +65,13 @@ public abstract class IntervalsSource {
*/
public abstract int minExtent();
/**
* Expert: return the set of disjunctions that make up this IntervalsSource
*
* Most implementations can return {@code Collections.singleton(this)}
*/
public abstract Collection<IntervalsSource> pullUpDisjunctions();
@Override
public abstract int hashCode();

View File

@ -1,133 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.MatchesUtils;
import org.apache.lucene.search.Query;
/**
 * A ConjunctionIntervalsSource that attempts to minimize its internal intervals by
 * eagerly advancing its first subinterval
 *
 * Each sub-source's matches are wrapped in a {@link CachingMatchesIterator} so that
 * offsets and sub-matches can still be exposed after the first subinterval has moved on
 */
class MinimizingConjunctionIntervalsSource extends ConjunctionIntervalsSource {

  MinimizingConjunctionIntervalsSource(List<IntervalsSource> subSources, IntervalFunction function) {
    super(subSources, function);
  }

  /**
   * Builds a matches iterator for the given document, or returns {@code null} when
   * any sub-source has no matches, the combined iterator cannot be positioned on
   * {@code doc}, or it has no interval there.
   */
  @Override
  public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
    final List<CachingMatchesIterator> cachedSubs = new ArrayList<>();
    for (IntervalsSource sub : subSources) {
      final MatchesIterator subMatches = sub.matches(field, ctx, doc);
      if (subMatches == null) {
        return null;
      }
      cachedSubs.add(new CachingMatchesIterator(subMatches));
    }

    final List<IntervalIterator> wrapped = new ArrayList<>(cachedSubs.size());
    for (CachingMatchesIterator cached : cachedSubs) {
      wrapped.add(IntervalMatches.wrapMatches(cached, doc));
    }

    final IntervalIterator combined = function.apply(wrapped);
    if (combined.advance(doc) != doc
        || combined.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
      return null;
    }
    return new ConjunctionMatchesIterator(combined, cachedSubs);
  }

  /**
   * Exposes the combined iterator's intervals as matches. The first call to
   * {@link #next()} returns {@code true} without advancing the iterator.
   */
  private static class ConjunctionMatchesIterator implements IntervalMatchesIterator {

    final IntervalIterator iterator;
    final List<CachingMatchesIterator> subs;
    boolean cached = true;

    private ConjunctionMatchesIterator(IntervalIterator iterator, List<CachingMatchesIterator> subs) {
      this.iterator = iterator;
      this.subs = subs;
    }

    @Override
    public boolean next() throws IOException {
      if (!cached) {
        return iterator.nextInterval() != IntervalIterator.NO_MORE_INTERVALS;
      }
      cached = false;
      return true;
    }

    @Override
    public int startPosition() {
      return iterator.start();
    }

    @Override
    public int endPosition() {
      return iterator.end();
    }

    @Override
    public int startOffset() throws IOException {
      // smallest start offset reported by any sub-iterator for the current end position
      int earliest = Integer.MAX_VALUE;
      final int endPos = endPosition();
      for (CachingMatchesIterator sub : subs) {
        final int candidate = sub.startOffset(endPos);
        if (candidate < earliest) {
          earliest = candidate;
        }
      }
      return earliest;
    }

    @Override
    public int endOffset() throws IOException {
      // largest end offset reported by any sub-iterator for the current end position
      int latest = 0;
      final int endPos = endPosition();
      for (CachingMatchesIterator sub : subs) {
        final int candidate = sub.endOffset(endPos);
        if (candidate > latest) {
          latest = candidate;
        }
      }
      return latest;
    }

    @Override
    public int gaps() {
      return iterator.gaps();
    }

    @Override
    public MatchesIterator getSubMatches() throws IOException {
      final int endPos = endPosition();
      final List<MatchesIterator> collected = new ArrayList<>();
      for (CachingMatchesIterator sub : subs) {
        collected.add(sub.getSubMatches(endPos));
      }
      return MatchesUtils.disjunction(collected);
    }

    @Override
    public Query getQuery() {
      return null;
    }
  }
}

View File

@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.MatchesUtils;
import org.apache.lucene.search.Query;
/**
 * Exposes the intervals of an {@link IntervalIterator} as matches, resolving offsets
 * and sub-matches through {@link CachingMatchesIterator} sub-iterators.
 *
 * The first call to {@link #next()} returns {@code true} without advancing the
 * underlying iterator.
 */
class MinimizingConjunctionMatchesIterator implements IntervalMatchesIterator {

  final IntervalIterator iterator;
  private final List<CachingMatchesIterator> subs = new ArrayList<>();
  private boolean cached = true;

  /**
   * @param iterator the combined interval iterator to expose
   * @param subs     the sub-iterators; each is cast to {@link CachingMatchesIterator}
   */
  MinimizingConjunctionMatchesIterator(IntervalIterator iterator, List<MatchesIterator> subs) {
    this.iterator = iterator;
    for (MatchesIterator sub : subs) {
      assert sub instanceof CachingMatchesIterator;
      final CachingMatchesIterator caching = (CachingMatchesIterator) sub;
      this.subs.add(caching);
    }
  }

  @Override
  public boolean next() throws IOException {
    if (!cached) {
      return iterator.nextInterval() != IntervalIterator.NO_MORE_INTERVALS;
    }
    cached = false;
    return true;
  }

  @Override
  public int startPosition() {
    return iterator.start();
  }

  @Override
  public int endPosition() {
    return iterator.end();
  }

  @Override
  public int startOffset() throws IOException {
    // smallest start offset reported by any sub-iterator for the current end position
    int earliest = Integer.MAX_VALUE;
    final int endPos = endPosition();
    for (CachingMatchesIterator sub : subs) {
      final int candidate = sub.startOffset(endPos);
      if (candidate < earliest) {
        earliest = candidate;
      }
    }
    return earliest;
  }

  @Override
  public int endOffset() throws IOException {
    // largest end offset reported by any sub-iterator for the current end position
    int latest = 0;
    final int endPos = endPosition();
    for (CachingMatchesIterator sub : subs) {
      final int candidate = sub.endOffset(endPos);
      if (candidate > latest) {
        latest = candidate;
      }
    }
    return latest;
  }

  @Override
  public int gaps() {
    return iterator.gaps();
  }

  @Override
  public MatchesIterator getSubMatches() throws IOException {
    final int endPos = endPosition();
    final List<MatchesIterator> collected = new ArrayList<>();
    for (CachingMatchesIterator sub : subs) {
      collected.add(sub.getSubMatches(endPos));
    }
    return MatchesUtils.disjunction(collected);
  }

  @Override
  public Query getQuery() {
    return null;
  }
}

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
@ -107,6 +108,11 @@ class MinimumShouldMatchIntervalsSource extends IntervalsSource {
return minExtent;
}
/**
 * Reports this source as a single element: the result is a one-element set
 * holding this instance.
 */
@Override
public Collection<IntervalsSource> pullUpDisjunctions() {
  final Collection<IntervalsSource> self = Collections.singleton(this);
  return self;
}
@Override
public String toString() {
return "AtLeast("

View File

@ -19,6 +19,8 @@ package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
@ -97,6 +99,11 @@ class MultiTermIntervalsSource extends IntervalsSource {
return 1;
}
/**
 * Reports this source as a single element: the result is a one-element set
 * holding this instance.
 */
@Override
public Collection<IntervalsSource> pullUpDisjunctions() {
  final Collection<IntervalsSource> self = Collections.singleton(this);
  return self;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;

View File

@ -17,23 +17,17 @@
package org.apache.lucene.search.intervals;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
/**
* An intervals source that combines two other sources, requiring both of them to
* be present in order to match, but using the minExtent of one of them
*/
class FilteringConjunctionIntervalsSource extends ConjunctionIntervalsSource {
class NoRewriteDisjunctionIntervalsSource extends DisjunctionIntervalsSource {
private final IntervalsSource source;
FilteringConjunctionIntervalsSource(IntervalsSource source, IntervalsSource filter, IntervalFunction function) {
super(Arrays.asList(source, filter), function);
this.source = source;
public NoRewriteDisjunctionIntervalsSource(Collection<IntervalsSource> subSources) {
super(subSources);
}
@Override
public int minExtent() {
return source.minExtent();
public Collection<IntervalsSource> pullUpDisjunctions() {
return Collections.singletonList(this);
}
}

View File

@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
/**
 * A difference source whose iterator emits intervals of the minuend that do not
 * overlap the subtrahend's intervals.
 */
class NonOverlappingIntervalsSource extends DifferenceIntervalsSource {

  NonOverlappingIntervalsSource(IntervalsSource minuend, IntervalsSource subtrahend) {
    super(minuend, subtrahend);
  }

  /** Pairs the two iterators in a {@link NonOverlappingIterator}. */
  @Override
  protected IntervalIterator combine(IntervalIterator minuend, IntervalIterator subtrahend) {
    return new NonOverlappingIterator(minuend, subtrahend);
  }

  /** Returns this source as the only element; nothing is rewritten here. */
  @Override
  public Collection<IntervalsSource> pullUpDisjunctions() {
    return Collections.singletonList(this);
  }

  @Override
  public int hashCode() {
    return Objects.hash(minuend, subtrahend);
  }

  /** Two instances are equal when both their minuends and their subtrahends are equal. */
  @Override
  public boolean equals(Object other) {
    if (!(other instanceof NonOverlappingIntervalsSource)) {
      return false;
    }
    final NonOverlappingIntervalsSource that = (NonOverlappingIntervalsSource) other;
    final boolean sameMinuend = Objects.equals(this.minuend, that.minuend);
    return sameMinuend && Objects.equals(this.subtrahend, that.subtrahend);
  }

  @Override
  public String toString() {
    return "NON_OVERLAPPING(" + minuend + "," + subtrahend + ")";
  }

  /** Iterates intervals of {@code a}, dropping those that overlap an interval of {@code b}. */
  private static class NonOverlappingIterator extends RelativeIterator {

    private NonOverlappingIterator(IntervalIterator minuend, IntervalIterator subtrahend) {
      super(minuend, subtrahend);
    }

    @Override
    public int nextInterval() throws IOException {
      if (!bpos) {
        // b is not (or no longer) in play for this document: pass a through untouched
        return a.nextInterval();
      }
      for (;;) {
        if (a.nextInterval() == NO_MORE_INTERVALS) {
          return NO_MORE_INTERVALS;
        }
        // move b forward past every interval that finishes before the current a begins
        while (b.end() < a.start()) {
          if (b.nextInterval() == NO_MORE_INTERVALS) {
            bpos = false;
            return a.start();
          }
        }
        // b now ends at or after a's start; a survives only if b also starts after a ends
        if (b.start() > a.end()) {
          return a.start();
        }
      }
    }
  }
}

View File

@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
/**
 * A difference source whose iterator emits intervals of the minuend that are not
 * contained by an interval of the subtrahend.
 *
 * Instances are created through {@link #build(IntervalsSource, IntervalsSource)}, which
 * applies {@code Disjunctions.pullUp} to the subtrahend and combines the resulting
 * sources with {@code Intervals.or}.
 */
class NotContainedByIntervalsSource extends DifferenceIntervalsSource {

  /**
   * Builds the source for {@code minuend} not contained by {@code subtrahend}.
   *
   * @param minuend    the source whose intervals are reported
   * @param subtrahend the source whose intervals filter the minuend; passed through
   *                   {@code Disjunctions.pullUp}, one new source being created per result
   * @return the {@code Intervals.or} of the per-subtrahend sources
   */
  static IntervalsSource build(IntervalsSource minuend, IntervalsSource subtrahend) {
    return Intervals.or(Disjunctions.pullUp(subtrahend, s -> new NotContainedByIntervalsSource(minuend, s)));
  }

  private NotContainedByIntervalsSource(IntervalsSource minuend, IntervalsSource subtrahend) {
    super(minuend, subtrahend);
  }

  /** Pairs the two iterators in a {@link NotContainedByIterator}. */
  @Override
  protected IntervalIterator combine(IntervalIterator minuend, IntervalIterator subtrahend) {
    return new NotContainedByIterator(minuend, subtrahend);
  }

  /** Returns this source as the only element; any rewriting already happened in {@code build}. */
  @Override
  public Collection<IntervalsSource> pullUpDisjunctions() {
    return Collections.singletonList(this);
  }

  @Override
  public int hashCode() {
    return Objects.hash(minuend, subtrahend);
  }

  /** Two instances are equal when both their minuends and their subtrahends are equal. */
  @Override
  public boolean equals(Object other) {
    if (other instanceof NotContainedByIntervalsSource == false) return false;
    NotContainedByIntervalsSource o = (NotContainedByIntervalsSource) other;
    return Objects.equals(this.minuend, o.minuend) && Objects.equals(this.subtrahend, o.subtrahend);
  }

  @Override
  public String toString() {
    return "NOT_CONTAINED_BY(" + minuend + "," + subtrahend + ")";
  }

  /** Iterates intervals of {@code a}, dropping those that lie inside an interval of {@code b}. */
  private static class NotContainedByIterator extends RelativeIterator {

    NotContainedByIterator(IntervalIterator a, IntervalIterator b) {
      super(a, b);
    }

    @Override
    public int nextInterval() throws IOException {
      if (bpos == false) {
        // b is not (or no longer) in play for this document: pass a through untouched
        return a.nextInterval();
      }
      while (a.nextInterval() != NO_MORE_INTERVALS) {
        // skip b intervals that end before a ends; they cannot contain a
        while (b.end() < a.end()) {
          if (b.nextInterval() == NO_MORE_INTERVALS) {
            // b is exhausted: remember that, so later calls take the fast path above
            // instead of probing an exhausted iterator again
            bpos = false;
            return a.start();
          }
        }
        // b ends at or after a's end; a is not contained if it starts before b does
        if (a.start() < b.start()) {
          return a.start();
        }
      }
      return NO_MORE_INTERVALS;
    }
  }
}

View File

@ -0,0 +1,86 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
/**
 * A difference source rendered as {@code NOT_CONTAINING(minuend,subtrahend)}; its
 * iterator filters the minuend's intervals against those of the subtrahend.
 *
 * Instances are created through {@link #build(IntervalsSource, IntervalsSource)}, which
 * applies {@code Disjunctions.pullUp} to the minuend and combines the resulting
 * sources with {@code Intervals.or}.
 */
class NotContainingIntervalsSource extends DifferenceIntervalsSource {

  /**
   * Builds the source for {@code minuend} not containing {@code subtrahend}.
   *
   * @param minuend    the source whose intervals are reported; passed through
   *                   {@code Disjunctions.pullUp}, one new source being created per result
   * @param subtrahend the source whose intervals filter the minuend
   * @return the {@code Intervals.or} of the per-minuend sources
   */
  static IntervalsSource build(IntervalsSource minuend, IntervalsSource subtrahend) {
    return Intervals.or(
        Disjunctions.pullUp(minuend, part -> new NotContainingIntervalsSource(part, subtrahend)));
  }

  private NotContainingIntervalsSource(IntervalsSource minuend, IntervalsSource subtrahend) {
    super(minuend, subtrahend);
  }

  /** Pairs the two iterators in a {@link NotContainingIterator}. */
  @Override
  protected IntervalIterator combine(IntervalIterator minuend, IntervalIterator subtrahend) {
    return new NotContainingIterator(minuend, subtrahend);
  }

  /** Returns this source as the only element; any rewriting already happened in {@code build}. */
  @Override
  public Collection<IntervalsSource> pullUpDisjunctions() {
    return Collections.singletonList(this);
  }

  @Override
  public int hashCode() {
    return Objects.hash(minuend, subtrahend);
  }

  /** Two instances are equal when both their minuends and their subtrahends are equal. */
  @Override
  public boolean equals(Object other) {
    if (!(other instanceof NotContainingIntervalsSource)) {
      return false;
    }
    final NotContainingIntervalsSource that = (NotContainingIntervalsSource) other;
    final boolean sameMinuend = Objects.equals(this.minuend, that.minuend);
    return sameMinuend && Objects.equals(this.subtrahend, that.subtrahend);
  }

  @Override
  public String toString() {
    return "NOT_CONTAINING(" + minuend + "," + subtrahend + ")";
  }

  /** Iterator over {@code a} that consults {@code b} to decide which intervals to emit. */
  private static class NotContainingIterator extends RelativeIterator {

    private NotContainingIterator(IntervalIterator minuend, IntervalIterator subtrahend) {
      super(minuend, subtrahend);
    }

    @Override
    public int nextInterval() throws IOException {
      if (!bpos) {
        // b is not (or no longer) in play for this document: pass a through untouched
        return a.nextInterval();
      }
      for (;;) {
        if (a.nextInterval() == NO_MORE_INTERVALS) {
          return NO_MORE_INTERVALS;
        }
        // skip b intervals that both start before a starts and end before a ends
        while (b.start() < a.start() && b.end() < a.end()) {
          if (b.nextInterval() == NO_MORE_INTERVALS) {
            bpos = false;
            return a.start();
          }
        }
        // emit a only when the current b lies entirely after it
        if (b.start() > a.end()) {
          return a.start();
        }
      }
    }
  }
}

View File

@ -18,6 +18,8 @@
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
import org.apache.lucene.index.LeafReaderContext;
@ -152,6 +154,11 @@ class OffsetIntervalsSource extends IntervalsSource {
return 1;
}
/**
 * Returns a one-element collection holding this source; it is handed back
 * as-is rather than being rewritten.
 */
@Override
public Collection<IntervalsSource> pullUpDisjunctions() {
  final Collection<IntervalsSource> onlyThis = Collections.singleton(this);
  return onlyThis;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;

View File

@ -0,0 +1,155 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
/**
 * A conjunction source whose iterator requires each sub-source's interval to start
 * after the previous sub-source's interval has ended, in the order the sub-sources
 * are listed. Rendered by toString() as ORDERED(...).
 *
 * Instances are obtained through build(), which also flattens directly nested
 * OrderedIntervalsSource children into a single list.
 */
class OrderedIntervalsSource extends ConjunctionIntervalsSource {
/**
 * Creates an ordered source over the given sub-sources. A single-element list is
 * returned as that element itself; otherwise nested ordered sources are flattened.
 */
static IntervalsSource build(List<IntervalsSource> sources) {
if (sources.size() == 1) {
return sources.get(0);
}
return new OrderedIntervalsSource(flatten(sources));
}
// Replaces every OrderedIntervalsSource found directly in the list by its own
// sub-sources, preserving order; all other sources are copied unchanged.
private static List<IntervalsSource> flatten(List<IntervalsSource> sources) {
List<IntervalsSource> flattened = new ArrayList<>();
for (IntervalsSource s : sources) {
if (s instanceof OrderedIntervalsSource) {
flattened.addAll(((OrderedIntervalsSource)s).subSources);
}
else {
flattened.add(s);
}
}
return flattened;
}
// NOTE(review): the meaning of the boolean passed to the superclass is defined by
// ConjunctionIntervalsSource, which is not visible here.
private OrderedIntervalsSource(List<IntervalsSource> sources) {
super(sources, true);
}
@Override
protected IntervalIterator combine(List<IntervalIterator> iterators) {
return new OrderedIntervalIterator(iterators);
}
/** Returns the sum of the minExtent() values of all sub-sources. */
@Override
public int minExtent() {
int minExtent = 0;
for (IntervalsSource subSource : subSources) {
minExtent += subSource.minExtent();
}
return minExtent;
}
/**
 * Delegates to Disjunctions.pullUp over the sub-sources, using the private
 * constructor to rebuild an ordered source from each list it is handed.
 */
@Override
public Collection<IntervalsSource> pullUpDisjunctions() {
return Disjunctions.pullUp(subSources, OrderedIntervalsSource::new);
}
@Override
public int hashCode() {
return Objects.hashCode(subSources);
}
/** Equal to another OrderedIntervalsSource with an equal list of sub-sources. */
@Override
public boolean equals(Object other) {
if (other instanceof OrderedIntervalsSource == false) return false;
OrderedIntervalsSource s = (OrderedIntervalsSource) other;
return Objects.equals(subSources, s.subSources);
}
@Override
public String toString() {
return "ORDERED(" + subSources.stream().map(IntervalsSource::toString).collect(Collectors.joining(",")) + ")";
}
/**
 * Iterator that lines the sub-iterators up in order and then narrows the match by
 * advancing the first sub-iterator for as long as a tighter candidate exists.
 */
private static class OrderedIntervalIterator extends ConjunctionIntervalIterator {
// start/end: bounds of the interval last returned; i: index of the sub-iterator
// currently being aligned against its predecessor
int start = -1, end = -1, i;
// end of the first sub-iterator's interval at the time the candidate was recorded;
// kept because that sub-iterator is advanced afterwards (see nextInterval)
int firstEnd;
private OrderedIntervalIterator(List<IntervalIterator> subIntervals) {
super(subIntervals);
}
@Override
public int start() {
return start;
}
@Override
public int end() {
return end;
}
@Override
public int nextInterval() throws IOException {
start = end = IntervalIterator.NO_MORE_INTERVALS;
// b is the start of the last sub-iterator in the most recently recorded candidate;
// it stays at Integer.MAX_VALUE until a first candidate has been recorded
int b = Integer.MAX_VALUE;
i = 1;
while (true) {
while (true) {
// the previous sub-iterator has reached b: return whatever has been recorded so far
if (subIterators.get(i - 1).end() >= b)
return start;
// all sub-iterators aligned, or sub-iterator i already starts after its predecessor ends
if (i == subIterators.size() || subIterators.get(i).start() > subIterators.get(i - 1).end())
break;
// advance sub-iterator i until it starts after its predecessor ends
do {
if (subIterators.get(i).end() >= b || subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return start;
}
while (subIterators.get(i).start() <= subIterators.get(i - 1).end());
i++;
}
// record the current alignment as the candidate interval
start = subIterators.get(0).start();
if (start == NO_MORE_INTERVALS) {
return end = NO_MORE_INTERVALS;
}
firstEnd = subIterators.get(0).end();
end = subIterators.get(subIterators.size() - 1).end();
b = subIterators.get(subIterators.size() - 1).start();
i = 1;
// move the first sub-iterator on and loop to look for a narrower candidate
if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return start;
}
}
/**
 * Sums the positions between consecutive sub-intervals. The first gap is measured
 * from firstEnd rather than from sub-iterator 0, which nextInterval() has already
 * advanced past the recorded candidate.
 */
@Override
public int gaps() {
int gaps = subIterators.get(1).start() - firstEnd - 1;
for (int i = 2; i < subIterators.size(); i++) {
gaps += (subIterators.get(i).start() - subIterators.get(i - 1).end() - 1);
}
return gaps;
}
// Moves the first sub-iterator to its first interval and clears the recorded state.
@Override
protected void reset() throws IOException {
subIterators.get(0).nextInterval();
i = 1;
start = end = firstEnd = -1;
}
}
}

View File

@ -0,0 +1,89 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
/**
 * A conjunction of a {@code source} and a {@code reference}: intervals of the source
 * are reported only when they overlap an interval of the reference.
 */
class OverlappingIntervalsSource extends ConjunctionIntervalsSource {

  private final IntervalsSource source;
  private final IntervalsSource reference;

  OverlappingIntervalsSource(IntervalsSource source, IntervalsSource reference) {
    super(Arrays.asList(source, reference), false);
    this.source = source;
    this.reference = reference;
  }

  /**
   * Builds the filtering iterator from exactly two iterators: the first supplies the
   * reported intervals, the second the intervals they must overlap.
   */
  @Override
  protected IntervalIterator combine(List<IntervalIterator> iterators) {
    assert iterators.size() == 2;
    final IntervalIterator a = iterators.get(0);
    final IntervalIterator b = iterators.get(1);
    return new FilteringIntervalIterator(a, b) {
      @Override
      public int nextInterval() throws IOException {
        if (bpos) {
          while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
            // discard b intervals that finish before the current a begins
            while (b.end() < a.start()) {
              if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
                // without any b left nothing further can overlap
                bpos = false;
                return IntervalIterator.NO_MORE_INTERVALS;
              }
            }
            // b ends at or after a's start; they overlap if b also starts by a's end
            if (b.start() <= a.end()) {
              return a.start();
            }
          }
          bpos = false;
        }
        return IntervalIterator.NO_MORE_INTERVALS;
      }
    };
  }

  /** The minimum extent is that of the source alone. */
  @Override
  public int minExtent() {
    return source.minExtent();
  }

  /**
   * Delegates to {@code Disjunctions.pullUp} over (source, reference), rebuilding an
   * overlapping source from each two-element list it is handed.
   */
  @Override
  public Collection<IntervalsSource> pullUpDisjunctions() {
    final List<IntervalsSource> pair = Arrays.asList(source, reference);
    return Disjunctions.pullUp(pair, ss -> new OverlappingIntervalsSource(ss.get(0), ss.get(1)));
  }

  @Override
  public int hashCode() {
    return Objects.hash(this.subSources);
  }

  /** Equal to another OverlappingIntervalsSource with an equal list of sub-sources. */
  @Override
  public boolean equals(Object other) {
    if (!(other instanceof OverlappingIntervalsSource)) {
      return false;
    }
    final OverlappingIntervalsSource that = (OverlappingIntervalsSource) other;
    return Objects.equals(this.subSources, that.subSources);
  }

  @Override
  public String toString() {
    return "OVERLAPPING(" + source + "," + reference + ")";
  }
}

View File

@ -18,6 +18,8 @@
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
import java.util.function.Predicate;
@ -224,6 +226,11 @@ class PayloadFilteredTermIntervalsSource extends IntervalsSource {
return 1;
}
/**
 * Yields this source alone: the result is a singleton collection and no
 * rewriting takes place.
 */
@Override
public Collection<IntervalsSource> pullUpDisjunctions() {
  final IntervalsSource self = this;
  return Collections.singleton(self);
}
@Override
public int hashCode() {
return Objects.hash(term);

View File

@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
/**
 * Base class for iterators that report the intervals of one iterator ({@code a})
 * filtered by the intervals of another ({@code b}). Document iteration, interval
 * bounds, gaps and cost are all taken from {@code a}; subclasses provide
 * {@code nextInterval()}.
 */
abstract class RelativeIterator extends IntervalIterator {

  final IntervalIterator a;
  final IntervalIterator b;

  /** Set by {@link #reset()}: true when {@code b} sits on the same document as {@code a}. */
  boolean bpos;

  RelativeIterator(IntervalIterator a, IntervalIterator b) {
    this.a = a;
    this.b = b;
  }

  @Override
  public int docID() {
    return a.docID();
  }

  /** Advances {@code a} to its next document, then re-synchronises {@code b}. */
  @Override
  public int nextDoc() throws IOException {
    final int landed = a.nextDoc();
    reset();
    return landed;
  }

  /** Advances {@code a} to {@code target}, then re-synchronises {@code b}. */
  @Override
  public int advance(int target) throws IOException {
    final int landed = a.advance(target);
    reset();
    return landed;
  }

  @Override
  public long cost() {
    return a.cost();
  }

  /**
   * Brings {@code b} up to the document {@code a} is on (advancing it only when it
   * is behind) and records in {@link #bpos} whether it landed on that same document.
   */
  protected void reset() throws IOException {
    final int target = a.docID();
    int other = b.docID();
    if (other < target) {
      other = b.advance(target);
    }
    bpos = (other == target);
  }

  @Override
  public int start() {
    return a.start();
  }

  @Override
  public int end() {
    return a.end();
  }

  @Override
  public int gaps() {
    return a.gaps();
  }

  /** The combined match cost of both wrapped iterators. */
  @Override
  public float matchCost() {
    final float costOfA = a.matchCost();
    final float costOfB = b.matchCost();
    return costOfA + costOfB;
  }
}

View File

@ -18,6 +18,8 @@
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat;
@ -208,6 +210,11 @@ class TermIntervalsSource extends IntervalsSource {
return 1;
}
/**
 * Returns a singleton collection containing this source; the source itself
 * is left untouched.
 */
@Override
public Collection<IntervalsSource> pullUpDisjunctions() {
  final Collection<IntervalsSource> justThis = Collections.singleton(this);
  return justThis;
}
@Override
public int hashCode() {
return Objects.hash(term);

View File

@ -0,0 +1,240 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.lucene.util.PriorityQueue;
/**
 * A conjunction source that matches its sub-sources in any order. When
 * allowOverlaps is false the iterator additionally skips sub-intervals that
 * overlap the current interval of another sub-iterator. Rendered by toString()
 * as UNORDERED(...) or UNORDERED_NO_OVERLAPS(...).
 *
 * Instances are obtained through build(), which also flattens directly nested
 * UnorderedIntervalsSource children that use the same allowOverlaps setting.
 */
class UnorderedIntervalsSource extends ConjunctionIntervalsSource {
/**
 * Creates an unordered source over the given sub-sources. A single-element list
 * is returned as that element itself; otherwise compatible nested sources are
 * flattened first.
 */
static IntervalsSource build(List<IntervalsSource> sources, boolean allowOverlaps) {
if (sources.size() == 1) {
return sources.get(0);
}
return new UnorderedIntervalsSource(flatten(sources, allowOverlaps), allowOverlaps);
}
// Replaces each UnorderedIntervalsSource in the list that has the same allowOverlaps
// value by its own sub-sources; every other source is copied unchanged.
private static List<IntervalsSource> flatten(List<IntervalsSource> sources, boolean allowOverlaps) {
List<IntervalsSource> flattened = new ArrayList<>();
for (IntervalsSource s : sources) {
if (s instanceof UnorderedIntervalsSource && ((UnorderedIntervalsSource)s).allowOverlaps == allowOverlaps) {
flattened.addAll(((UnorderedIntervalsSource)s).subSources);
}
else {
flattened.add(s);
}
}
return flattened;
}
private final boolean allowOverlaps;
// NOTE(review): the meaning of the boolean passed to the superclass is defined by
// ConjunctionIntervalsSource, which is not visible here.
private UnorderedIntervalsSource(List<IntervalsSource> sources, boolean allowOverlaps) {
super(sources, true);
this.allowOverlaps = allowOverlaps;
}
@Override
protected IntervalIterator combine(List<IntervalIterator> iterators) {
return new UnorderedIntervalIterator(iterators, allowOverlaps);
}
/** Returns the sum of the minExtent() values of all sub-sources. */
@Override
public int minExtent() {
int minExtent = 0;
for (IntervalsSource subSource : subSources) {
minExtent += subSource.minExtent();
}
return minExtent;
}
/**
 * Delegates to Disjunctions.pullUp over the sub-sources, rebuilding an unordered
 * source with the same allowOverlaps setting from each list it is handed.
 */
@Override
public Collection<IntervalsSource> pullUpDisjunctions() {
return Disjunctions.pullUp(subSources, ss -> new UnorderedIntervalsSource(ss, allowOverlaps));
}
@Override
public int hashCode() {
return Objects.hash(this.subSources, this.allowOverlaps);
}
/** Equal to another UnorderedIntervalsSource with equal sub-sources and allowOverlaps. */
@Override
public boolean equals(Object other) {
if (other instanceof UnorderedIntervalsSource == false) return false;
UnorderedIntervalsSource o = (UnorderedIntervalsSource) other;
return Objects.equals(this.subSources, o.subSources) &&
Objects.equals(this.allowOverlaps, o.allowOverlaps);
}
@Override
public String toString() {
return (allowOverlaps ? "UNORDERED(" : "UNORDERED_NO_OVERLAPS(") +
subSources.stream().map(IntervalsSource::toString).collect(Collectors.joining(",")) + ")";
}
/**
 * Iterator that keeps the sub-iterators in a priority queue keyed on their current
 * interval, and narrows each match by advancing the queue's top element.
 */
private static class UnorderedIntervalIterator extends ConjunctionIntervalIterator {
private final PriorityQueue<IntervalIterator> queue;
private final IntervalIterator[] subIterators;
// scratch space for gaps(): two slots (start, end) per sub-iterator
private final int[] innerPositions;
private final boolean allowOverlaps;
// start/end: bounds of the interval last returned; firstEnd: end of the queue's top
// interval when that candidate was recorded; queueEnd: largest end seen among the
// intervals added to the queue since the last reset()
int start = -1, end = -1, firstEnd, queueEnd;
UnorderedIntervalIterator(List<IntervalIterator> subIterators, boolean allowOverlaps) {
super(subIterators);
this.queue = new PriorityQueue<IntervalIterator>(subIterators.size()) {
// orders by ascending start; among equal starts, an interval whose end is greater
// than or equal to the other's is treated as the lesser element
@Override
protected boolean lessThan(IntervalIterator a, IntervalIterator b) {
return a.start() < b.start() || (a.start() == b.start() && a.end() >= b.end());
}
};
this.subIterators = new IntervalIterator[subIterators.size()];
this.innerPositions = new int[subIterators.size() * 2];
this.allowOverlaps = allowOverlaps;
for (int i = 0; i < subIterators.size(); i++) {
this.subIterators[i] = subIterators.get(i);
}
}
@Override
public int start() {
return start;
}
@Override
public int end() {
return end;
}
// Raises queueEnd to the given iterator's current end if that end is larger.
void updateRightExtreme(IntervalIterator it) {
int itEnd = it.end();
if (itEnd > queueEnd) {
queueEnd = itEnd;
}
}
@Override
public int nextInterval() throws IOException {
// first, find a matching interval
// (advance the top iterator while it still starts where the last returned interval did)
while (this.queue.size() == subIterators.length && queue.top().start() == start) {
IntervalIterator it = queue.pop();
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
if (allowOverlaps == false) {
// keep advancing until the interval no longer overlaps anything still queued
while (hasOverlaps(it)) {
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return start = end = IntervalIterator.NO_MORE_INTERVALS;
}
}
queue.add(it);
updateRightExtreme(it);
}
// an iterator that is exhausted here is not re-added, leaving the queue short
}
// a short queue means some sub-iterator has no interval to contribute
if (this.queue.size() < subIterators.length)
return start = end = IntervalIterator.NO_MORE_INTERVALS;
// then, minimize it
do {
// record the candidate spanning from the top's start to the right extreme
start = queue.top().start();
firstEnd = queue.top().end();
end = queueEnd;
// the top interval itself reaches the candidate's end: return it as is
if (queue.top().end() == end)
return start;
IntervalIterator it = queue.pop();
if (it != null && it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
if (allowOverlaps == false) {
while (hasOverlaps(it)) {
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
return start;
}
}
}
queue.add(it);
updateRightExtreme(it);
}
// keep narrowing only while every sub-iterator is queued and the right extreme is unchanged
} while (this.queue.size() == subIterators.length && end == queueEnd);
return start;
}
/**
 * Collects a (start, end) pair per sub-iterator, sorts all collected positions,
 * and sums the distances between each sorted pair's start and the preceding
 * pair's end. A sub-iterator whose current end lies beyond the reported end
 * contributes (start, firstEnd) instead of its own current position.
 */
@Override
public int gaps() {
for (int i = 0; i < subIterators.length; i++) {
if (subIterators[i].end() > end) {
// NOTE(review): presumably the sub-iterator that nextInterval() advanced past the
// reported interval while minimizing; confirm
innerPositions[i * 2] = start;
innerPositions[i * 2 + 1] = firstEnd;
}
else {
innerPositions[i * 2] = subIterators[i].start();
innerPositions[i * 2 + 1] = subIterators[i].end();
}
}
Arrays.sort(innerPositions);
int gaps = 0;
for (int i = 1; i < subIterators.length; i++) {
gaps += (innerPositions[i * 2] - innerPositions[i * 2 - 1] - 1);
}
return gaps;
}
// Clears the recorded state and refills the queue with each sub-iterator's first
// usable interval. Filling stops at the first sub-iterator that has none, which
// leaves the queue short so that nextInterval() reports NO_MORE_INTERVALS.
@Override
protected void reset() throws IOException {
queueEnd = start = end = -1;
this.queue.clear();
loop: for (IntervalIterator it : subIterators) {
if (it.nextInterval() == NO_MORE_INTERVALS) {
break;
}
if (allowOverlaps == false) {
while (hasOverlaps(it)) {
if (it.nextInterval() == NO_MORE_INTERVALS) {
break loop;
}
}
}
queue.add(it);
updateRightExtreme(it);
}
}
// Returns true if the candidate's current interval shares at least one position
// with the current interval of any iterator in the queue.
private boolean hasOverlaps(IntervalIterator candidate) {
for (IntervalIterator it : queue) {
if (it.start() < candidate.start()) {
// queued interval starts earlier: overlaps only if it reaches the candidate's start
if (it.end() >= candidate.start()) {
return true;
}
continue;
}
if (it.start() == candidate.start()) {
return true;
}
// queued interval starts later: overlaps if it starts by the candidate's end
if (it.start() <= candidate.end()) {
return true;
}
}
return false;
}
}
}

View File

@ -0,0 +1,215 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.intervals;
import org.apache.lucene.util.LuceneTestCase;
import static org.apache.lucene.search.intervals.Intervals.*;
public class TestDisjunctionRewrites extends LuceneTestCase {
public void testDisjunctionSuffix() {
// BLOCK(a,or(b, BLOCK(b, c))) => or(BLOCK(a, b), BLOCK(a, b, c))
IntervalsSource actual = Intervals.phrase(
Intervals.term("a"),
Intervals.or(Intervals.term("b"), Intervals.phrase("b", "c")));
IntervalsSource expected = Intervals.or(
Intervals.phrase("a", "b"),
Intervals.phrase("a", "b", "c"));
assertEquals(expected, actual);
}
public void testPhraseDisjunctionWithDifferentLengthClauses() {
// BLOCK(a, or(b, BLOCK(b, c)), d) => or(BLOCK(a, b, d), BLOCK(a, b, c, d))
IntervalsSource actual = Intervals.phrase(
Intervals.term("a"),
Intervals.or(Intervals.term("b"), Intervals.phrase(Intervals.term("b"), Intervals.term("c"))),
Intervals.term("d"));
IntervalsSource expected = Intervals.or(
Intervals.phrase("a", "b", "d"),
Intervals.phrase("a", "b", "c", "d")
);
assertEquals(expected, actual);
}
public void testPhraseDisjunctionWithNestedDifferentLengthClauses() {
// BLOCK(a, or(ORDERED(or(b, c), d), b, p, q), f, g)
// => or(BLOCK(a, or(b, p, q), f, g), BLOCK(a, ORDERED(or(b, c), d), f, g))
IntervalsSource expected = Intervals.or(
Intervals.phrase(
Intervals.term("a"),
Intervals.or(Intervals.term("b"), Intervals.term("p"), Intervals.term("q")),
Intervals.term("f"),
Intervals.term("g")),
Intervals.phrase(
Intervals.term("a"),
Intervals.ordered(Intervals.or(Intervals.term("b"), Intervals.term("c")), Intervals.term("d")),
Intervals.term("f"),
Intervals.term("g")
)
);
IntervalsSource actual = Intervals.phrase(
Intervals.term("a"),
Intervals.or(
Intervals.ordered(Intervals.or(Intervals.term("b"), Intervals.term("c")), Intervals.term("d")),
Intervals.term("b"),
Intervals.term("p"),
Intervals.term("q")
),
Intervals.term("f"),
Intervals.term("g")
);
assertEquals(expected, actual);
}
public void testDisjunctionRewritePreservesFilters() {
// BLOCK(a, MAXGAPS/3(OR(BLOCK(a, b), BLOCK(c, d))), c)
// => or(BLOCK(a, MAXGAPS/3(BLOCK(a, b)), c), BLOCK(a, MAXGAPS/3(BLOCK(c, d)), c))
IntervalsSource actual = Intervals.phrase(
Intervals.term("a"),
Intervals.maxgaps(3, Intervals.or(
Intervals.phrase("a", "b"),
Intervals.phrase("c", "d")
)),
Intervals.term("c")
);
IntervalsSource expected = Intervals.or(
Intervals.phrase(
Intervals.term("a"),
Intervals.maxgaps(3, Intervals.phrase("a", "b")),
Intervals.term("c")
),
Intervals.phrase(
Intervals.term("a"),
Intervals.maxgaps(3, Intervals.phrase("c", "d")),
Intervals.term("c")
));
assertEquals(expected, actual);
}
public void testNestedMaxGaps() {
// MAXGAPS/3(ORDERED(MAXGAPS/4(ORDERED(a, or(b, BLOCK(c, d)))), e)
// => or(MAXGAPS/3(ORDERED(MAXGAPS/4(ORDERED(a, b)), e)), MAXGAPS/3(ORDERED(MAXGAPS/4(ORDERED(a, BLOCK(c, d))), e)))
IntervalsSource actual = maxgaps(3, ordered(
maxgaps(4, ordered(term("a"), or(term("b"), phrase("c", "d")))), term("e")));
IntervalsSource expected = or(
maxgaps(3, ordered(maxgaps(4, ordered(term("a"), term("b"))), term("e"))),
maxgaps(3, ordered(maxgaps(4, ordered(term("a"), phrase("c", "d"))), term("e")))
);
assertEquals(expected, actual);
}
public void testNestedMaxWidth() {
// maxwidth does not automatically pull up disjunctions at construction time, so we need to check
// that it does the right thing if wrapped by something that does need exact internal accounting
// PHRASE(a, MAXWIDTH(4, OR(ORDERED(b, c), ORDERED(d, e))), f)
// => or(PHRASE(a, MAXWIDTH(4, ORDERED(b, c)), f), PHRASE(a, MAXWIDTH(4, ORDERED(d, e)), f))
IntervalsSource actual = phrase(term("a"), maxwidth(4, or(ordered(term("b"), term("c")), ordered(term("d"), term("e")))), term("f"));
IntervalsSource expected = or(
phrase(term("a"), maxwidth(4, ordered(term("b"), term("c"))), term("f")),
phrase(term("a"), maxwidth(4, ordered(term("d"), term("e"))), term("f"))
);
assertEquals(expected, actual);
}
public void testNestedFixField() {
// PHRASE(a, FIXFIELD(field, or(PHRASE(a, b), b)), c)
// => or(PHRASE(a, FIXFIELD(PHRASE(a, b)), c), PHRASE(a, FIXFIELD(b), c))
IntervalsSource actual = phrase(term("a"), fixField("field", or(phrase("a", "b"), term("b"))), term("c"));
IntervalsSource expected = or(
phrase(term("a"), fixField("field", phrase("a", "b")), term("c")),
phrase(term("a"), fixField("field", term("b")), term("c"))
);
assertEquals(expected, actual);
}
public void testContainedBy() {
// the 'big' interval should not be minimized, the 'small' one should be
// CONTAINED_BY(or("s", BLOCK("s", "t")), MAXGAPS/4(or(ORDERED("a", "b"), ORDERED("c", "d"))))
// => or(CONTAINED_BY(or("s", BLOCK("s", "t")), MAXGAPS/4(ORDERED("a", "b"))),
// CONTAINED_BY(or("s", BLOCK("s", "t")), MAXGAPS/4(ORDERED("c", "d"))))
IntervalsSource actual = containedBy(or(term("s"), phrase("s", "t")),
maxgaps(4, or(ordered(term("a"), term("b")), ordered(term("c"), term("d")))));
IntervalsSource expected = or(
containedBy(or(term("s"), phrase("s", "t")), maxgaps(4, ordered(term("a"), term("b")))),
containedBy(or(term("s"), phrase("s", "t")), maxgaps(4, ordered(term("c"), term("d"))))
);
assertEquals(expected, actual);
}
public void testContaining() {
// the 'big' interval should not be minimized, the 'small' one should be
// CONTAINING(MAXGAPS/4(or(ORDERED("a", "b"), ORDERED("c", "d"))), or("s", BLOCK("s", "t")))
// => or(CONTAINING(MAXGAPS/4(ORDERED("a", "b")), or("s", BLOCK("s", "t"))),
// CONTAINING(MAXGAPS/4(ORDERED("c", "d")), or("s", BLOCK("s", "t"))))
IntervalsSource actual = containing(maxgaps(4, or(ordered(term("a"), term("b")), ordered(term("c"), term("d")))),
or(term("s"), phrase("s", "t")));
IntervalsSource expected = or(
containing(maxgaps(4, ordered(term("a"), term("b"))), or(term("s"), phrase("s", "t"))),
containing(maxgaps(4, ordered(term("c"), term("d"))), or(term("s"), phrase("s", "t")))
);
assertEquals(expected, actual);
}
public void testNotContainedBy() {
// the 'big' interval should not be minimized, the 'small' one should be
// NOT_CONTAINED_BY(or(BLOCK("a", "b"), "a"), or(BLOCK("c", "d"), "d"))
// => or(NOT_CONTAINED_BY(or(BLOCK("a", "b"), "a"), BLOCK("c", "d"))), NOT_CONTAINED_BY(or(BLOCK("a", "b"), "a"), "d"))
IntervalsSource actual = notContainedBy(or(phrase("a", "b"), term("a")), or(phrase("c", "d"), term("d")));
IntervalsSource expected = or(
notContainedBy(or(phrase("a", "b"), term("a")), phrase("c", "d")),
notContainedBy(or(phrase("a", "b"), term("a")), term("d"))
);
assertEquals(expected, actual);
}
public void testNotContaining() {
// the 'big' interval should not be minimized, the 'small' one should be
// NOT_CONTAINING(or(BLOCK("a", "b"), "a"), or(BLOCK("c", "d"), "d"))
// => or(NOT_CONTAINING(BLOCK("a", "b"), or(BLOCK("c", "d"), "d")), NOT_CONTAINING("a", or(BLOCK("c", "d"), "d")))
IntervalsSource actual = notContaining(or(phrase("a", "b"), term("a")), or(phrase("c", "d"), term("d")));
IntervalsSource expected = or(
notContaining(phrase("a", "b"), or(phrase("c", "d"), term("d"))),
notContaining(term("a"), or(phrase("c", "d"), term("d")))
);
assertEquals(expected, actual);
}
// A disjunction built with or(false, ...) must not be pulled up: the phrase wrapping it
// is expected to differ from the expanded or(BLOCK(a, b, c), BLOCK(a, c)) form.
public void testBlockedRewrites() {
  IntervalsSource expandedForm = or(phrase("a", "b", "c"), phrase("a", "c"));
  IntervalsSource blocked = phrase(term("a"), or(false, phrase("b", "c"), term("c")));
  assertNotEquals(expandedForm, blocked);
}
}

View File

@ -31,7 +31,6 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Ignore;
public class TestIntervalQuery extends LuceneTestCase {
@ -71,7 +70,8 @@ public class TestIntervalQuery extends LuceneTestCase {
"w2 w1",
"w2 w1 w3 w2 w4",
"coordinate genome mapping research",
"coordinate genome research"
"coordinate genome research",
"greater new york"
};
private void checkHits(Query query, int[] results) throws IOException {
@ -166,10 +166,6 @@ public class TestIntervalQuery extends LuceneTestCase {
checkHits(q, new int[]{});
}
// The Vigna paper doesn't deal with prefix disjunctions. For now, we keep the same
// logic as detailed in the paper, but we may want to address it in future so that tests
// like the one below will pass
@Ignore
public void testNestedOr() throws IOException {
Query q = new IntervalQuery(field, Intervals.phrase(
Intervals.term("coordinate"),
@ -178,6 +174,39 @@ public class TestIntervalQuery extends LuceneTestCase {
checkHits(q, new int[]{ 6, 7 });
}
// Phrase query whose middle clause is a disjunction of "genome" and an extended "mapping"
// source (Intervals.extend(..., 1, 0)); hits are expected in docs 6 and 7.
public void testNestedOrWithGaps() throws IOException {
  IntervalsSource middle = Intervals.or(
      Intervals.term("genome"),
      Intervals.extend(Intervals.term("mapping"), 1, 0));
  IntervalsSource phrase = Intervals.phrase(
      Intervals.term("coordinate"),
      middle,
      Intervals.term("research"));
  checkHits(new IntervalQuery(field, phrase), new int[]{ 6, 7 });
}
// Phrase query whose middle clause is a NOT_CONTAINING filter wrapped around a disjunction
// ("genome mapping" | "genome", not containing "wibble"); hits are expected in docs 6 and 7.
public void testNestedOrWithinDifference() throws IOException {
  IntervalsSource genomeVariants = Intervals.or(
      Intervals.phrase("genome", "mapping"),
      Intervals.term("genome"));
  IntervalsSource withoutWibble = Intervals.notContaining(genomeVariants, Intervals.term("wibble"));
  IntervalsSource phrase = Intervals.phrase(
      Intervals.term("coordinate"),
      withoutWibble,
      Intervals.term("research"));
  checkHits(new IntervalQuery(field, phrase), new int[]{ 6, 7 });
}
// Two phrase queries with nested disjunctions:
//  1. the disjunction sits inside a CONTAINING filter in the middle of the phrase
//     (expected hits: docs 6 and 7);
//  2. the disjunction is the final clause of the phrase (expected hit: doc 8).
public void testNestedOrWithinConjunctionFilter() throws IOException {
  IntervalsSource genomeVariants = Intervals.or(
      Intervals.phrase("genome", "mapping"),
      Intervals.term("genome"));
  IntervalsSource filtered = Intervals.containing(genomeVariants, Intervals.term("genome"));
  Query filteredPhrase = new IntervalQuery(field, Intervals.phrase(
      Intervals.term("coordinate"),
      filtered,
      Intervals.term("research")));
  checkHits(filteredPhrase, new int[]{ 6, 7 });

  IntervalsSource yorkVariants = Intervals.or(
      Intervals.phrase("new", "york"),
      Intervals.term("york"));
  Query trailingOr = new IntervalQuery(field, Intervals.phrase(
      Intervals.term("greater"),
      yorkVariants));
  checkHits(trailingOr, new int[]{ 8 });
}
public void testUnordered() throws IOException {
Query q = new IntervalQuery(field,
Intervals.unordered(
@ -191,6 +220,21 @@ public class TestIntervalQuery extends LuceneTestCase {
checkHits(q, new int[]{3});
}
// MAXGAPS/1 over an unordered pair whose first clause is a disjunction of "coordinate"
// and the phrase "coordinate genome"; hits are expected in docs 6 and 7.
public void testNestedOrInUnorderedMaxGaps() throws IOException {
  IntervalsSource coordinateVariants = Intervals.or(
      Intervals.term("coordinate"),
      Intervals.phrase("coordinate", "genome"));
  IntervalsSource pair = Intervals.unordered(coordinateVariants, Intervals.term("research"));
  Query q = new IntervalQuery(field, Intervals.maxgaps(1, pair));
  checkHits(q, new int[]{ 6, 7 });
}
// "genome" contained by a disjunction of "coordinate" and ORDERED("coordinate", "research");
// hits are expected in docs 6 and 7.
public void testNestedOrInContainedBy() throws IOException {
  IntervalsSource coordinateToResearch = Intervals.ordered(
      Intervals.term("coordinate"),
      Intervals.term("research"));
  IntervalsSource container = Intervals.or(Intervals.term("coordinate"), coordinateToResearch);
  Query q = new IntervalQuery(field, Intervals.containedBy(Intervals.term("genome"), container));
  checkHits(q, new int[]{ 6, 7 });
}
public void testDefinedGaps() throws IOException {
Query q = new IntervalQuery(field,
Intervals.phrase(Intervals.term("w1"), Intervals.extend(Intervals.term("w2"), 1, 0)));

View File

@ -118,12 +118,12 @@ public class TestIntervals extends LuceneTestCase {
if (i >= expected[id].length) {
fail("Unexpected match in doc " + id + ": " + intervals);
}
assertEquals("Wrong start value in doc " + id, expected[id][i], pos);
assertEquals(source + ": wrong start value in doc " + id, expected[id][i], pos);
assertEquals("start() != pos returned from nextInterval()", expected[id][i], intervals.start());
assertEquals("Wrong end value in doc " + id, expected[id][i + 1], intervals.end());
i += 2;
}
assertEquals("Wrong number of endpoints in doc " + id, expected[id].length, i);
assertEquals(source + ": wrong number of endpoints in doc " + id, expected[id].length, i);
assertEquals(IntervalIterator.NO_MORE_INTERVALS, intervals.start());
assertEquals(IntervalIterator.NO_MORE_INTERVALS, intervals.end());
if (i > 0)
@ -762,4 +762,22 @@ public class TestIntervals extends LuceneTestCase {
assertMatch(mi, 17, 17, 97, 100);
}
// Checks the intervals produced on "field1" by a top-level disjunction of the term "nine"
// and a MAXGAPS/1 filter wrapped around a disjunction of two ordered sources
// ("pease" .. "hot" and "pease" .. "cold").
public void testWrappedFilters() throws IOException {
  IntervalsSource peaseHot = Intervals.ordered(Intervals.term("pease"), Intervals.term("hot"));
  IntervalsSource peaseCold = Intervals.ordered(Intervals.term("pease"), Intervals.term("cold"));
  IntervalsSource source = Intervals.or(
      Intervals.term("nine"),
      Intervals.maxgaps(1, Intervals.or(peaseHot, peaseCold)));

  // One row per document; each row lists start/end pairs of the expected intervals.
  int[][] expected = new int[][]{
      {},
      { 0, 2, 3, 5, 11, 11, 28, 28 },
      { 0, 2, 3, 5 },
      {},
      { 0, 2, 3, 5, 11, 11 },
      {}
  };
  checkIntervals(source, "field1", 3, expected);
}
}

View File

@ -22,6 +22,7 @@ import org.apache.lucene.util.LuceneTestCase;
public class TestSimplifications extends LuceneTestCase {
public void testStringPhrases() {
  // A phrase over a single string collapses to the bare term: BLOCK(term) => term
  IntervalsSource expected = Intervals.term("term");
  assertEquals(expected, Intervals.phrase("term"));
}
@ -32,18 +33,45 @@ public class TestSimplifications extends LuceneTestCase {
}
public void testOrdered() {
  // An ordered source over a single sub-source collapses to it: ORDERED(term) => term
  IntervalsSource expected = Intervals.term("term");
  assertEquals(expected, Intervals.ordered(Intervals.term("term")));
}
public void testUnordered() {
  // An unordered source over a single sub-source collapses to it: UNORDERED(term) => term
  IntervalsSource expected = Intervals.term("term");
  assertEquals(expected, Intervals.unordered(Intervals.term("term")));
}
// Verifies that the non-overlapping unordered factory, given a single sub-source,
// simplifies to that sub-source.
public void testUnorderedOverlaps() {
  // UNORDERED_NO_OVERLAPS(term) => term
  // The boolean passed to unordered() is false here to select the no-overlaps variant
  // named in the rule above; the stale duplicate declaration using 'true' is dropped,
  // since declaring 'actual' twice in the same scope does not compile.
  IntervalsSource actual = Intervals.unordered(false, Intervals.term("term"));
  assertEquals(Intervals.term("term"), actual);
}
public void testDisjunctionRemovesDuplicates() {
  // Repeated clauses in a disjunction are dropped: or(a, b, a) => or(a, b)
  IntervalsSource withDuplicate = Intervals.or(
      Intervals.term("a"),
      Intervals.term("b"),
      Intervals.term("a"));
  IntervalsSource deduplicated = Intervals.or(Intervals.term("a"), Intervals.term("b"));
  assertEquals(deduplicated, withDuplicate);
}
// Nested phrases are flattened into a single phrase, whether the nesting is on the left
// or (repeatedly) on the right.
public void testPhraseSimplification() {
  // BLOCK(BLOCK(a, b), c) => BLOCK(a, b, c)
  IntervalsSource leftNested = Intervals.phrase(
      Intervals.phrase(Intervals.term("a"), Intervals.term("b")),
      Intervals.term("c"));
  IntervalsSource flatThree = Intervals.phrase(
      Intervals.term("a"), Intervals.term("b"), Intervals.term("c"));
  assertEquals(flatThree, leftNested);

  // BLOCK(a, BLOCK(b, BLOCK(c, d))) => BLOCK(a, b, c, d)
  IntervalsSource rightNested = Intervals.phrase(
      Intervals.term("a"),
      Intervals.phrase(
          Intervals.term("b"),
          Intervals.phrase(Intervals.term("c"), Intervals.term("d"))));
  IntervalsSource flatFour = Intervals.phrase(
      Intervals.term("a"), Intervals.term("b"), Intervals.term("c"), Intervals.term("d"));
  assertEquals(flatFour, rightNested);
}
public void testDisjunctionSimplification() {
  // Nested disjunctions are flattened: or(a, or(b, or(c, d))) => or(a, b, c, d)
  IntervalsSource nested = Intervals.or(
      Intervals.term("a"),
      Intervals.or(
          Intervals.term("b"),
          Intervals.or(Intervals.term("c"), Intervals.term("d"))));
  IntervalsSource flat = Intervals.or(
      Intervals.term("a"), Intervals.term("b"), Intervals.term("c"), Intervals.term("d"));
  assertEquals(flat, nested);
}
}