Leverage doc value skip lists in DocValuesRewriteMethod if indexed (#13672)

This commit is contained in:
Greg Miller 2024-08-26 10:04:12 -07:00 committed by GitHub
parent 4e3945ed54
commit 68882c8b89
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 479 additions and 302 deletions

View File

@ -376,6 +376,8 @@ Optimizations
* GITHUB#13587: Use Max WAND optimizations with ToParentBlockJoinQuery when using ScoreMode.Max (Mike Pellegrini)
* GITHUB#13672: Leverage doc value skip lists in DocValuesRewriteMethod if indexed. (Greg Miller)
Changes in runtime behavior
---------------------

View File

@ -28,6 +28,7 @@ import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DocValuesRangeIterator;
import org.apache.lucene.search.FieldExistsQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchNoDocsQuery;
@ -179,7 +180,7 @@ final class SortedNumericDocValuesRangeQuery extends Query {
};
}
if (skipper != null) {
iterator = new DocValuesRangeIterator(iterator, skipper, lowerValue, upperValue);
iterator = new DocValuesRangeIterator(iterator, skipper, lowerValue, upperValue, false);
}
final var scorer = new ConstantScoreScorer(score(), scoreMode, iterator);
return new DefaultScorerSupplier(scorer);

View File

@ -28,6 +28,7 @@ import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DocValuesRangeIterator;
import org.apache.lucene.search.FieldExistsQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@ -217,7 +218,7 @@ final class SortedSetDocValuesRangeQuery extends Query {
};
}
if (skipper != null) {
iterator = new DocValuesRangeIterator(iterator, skipper, minOrd, maxOrd);
iterator = new DocValuesRangeIterator(iterator, skipper, minOrd, maxOrd, false);
}
return new ConstantScoreScorer(score(), scoreMode, iterator);
}

View File

@ -14,18 +14,18 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document;
package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TwoPhaseIterator;
/**
* Wrapper around a {@link TwoPhaseIterator} for a doc-values range query that speeds things up by
* taking advantage of a {@link DocValuesSkipper}.
*
* @lucene.experimental
*/
final class DocValuesRangeIterator extends TwoPhaseIterator {
public final class DocValuesRangeIterator extends TwoPhaseIterator {
enum Match {
/** None of the documents in the range match */
@ -41,19 +41,29 @@ final class DocValuesRangeIterator extends TwoPhaseIterator {
private final Approximation approximation;
private final TwoPhaseIterator innerTwoPhase;
DocValuesRangeIterator(
TwoPhaseIterator twoPhase, DocValuesSkipper skipper, long lowerValue, long upperValue) {
super(new Approximation(twoPhase.approximation(), skipper, lowerValue, upperValue));
public DocValuesRangeIterator(
TwoPhaseIterator twoPhase,
DocValuesSkipper skipper,
long lowerValue,
long upperValue,
boolean queryRangeHasGaps) {
super(
queryRangeHasGaps
? new RangeWithGapsApproximation(
twoPhase.approximation(), skipper, lowerValue, upperValue)
: new RangeNoGapsApproximation(
twoPhase.approximation(), skipper, lowerValue, upperValue));
this.approximation = (Approximation) approximation();
this.innerTwoPhase = twoPhase;
}
static class Approximation extends DocIdSetIterator {
abstract static class Approximation extends DocIdSetIterator {
private final DocIdSetIterator innerApproximation;
private final DocValuesSkipper skipper;
private final long lowerValue;
private final long upperValue;
protected final DocValuesSkipper skipper;
protected final long lowerValue;
protected final long upperValue;
private int doc = -1;
@ -137,7 +147,21 @@ final class DocValuesRangeIterator extends TwoPhaseIterator {
return innerApproximation.cost();
}
private Match match(int level) {
protected abstract Match match(int level);
}
private static final class RangeNoGapsApproximation extends Approximation {
RangeNoGapsApproximation(
DocIdSetIterator innerApproximation,
DocValuesSkipper skipper,
long lowerValue,
long upperValue) {
super(innerApproximation, skipper, lowerValue, upperValue);
}
@Override
protected Match match(int level) {
long minValue = skipper.minValue(level);
long maxValue = skipper.maxValue(level);
if (minValue > upperValue || maxValue < lowerValue) {
@ -154,6 +178,28 @@ final class DocValuesRangeIterator extends TwoPhaseIterator {
}
}
/**
 * Approximation selected when the iterator is built with {@code queryRangeHasGaps == true}, i.e.
 * when values between {@code lowerValue} and {@code upperValue} are not all guaranteed to match.
 * A block can only be ruled out when its values lie entirely outside those bounds; every other
 * block is reported as {@link Match#MAYBE} and never as a definite match.
 */
private static final class RangeWithGapsApproximation extends Approximation {
  RangeWithGapsApproximation(
      DocIdSetIterator innerApproximation,
      DocValuesSkipper skipper,
      long lowerValue,
      long upperValue) {
    super(innerApproximation, skipper, lowerValue, upperValue);
  }

  @Override
  protected Match match(int level) {
    final long blockMin = skipper.minValue(level);
    final long blockMax = skipper.maxValue(level);
    // Disjoint from [lowerValue, upperValue]: no doc in this block can match.
    final boolean disjoint = blockMin > upperValue || blockMax < lowerValue;
    return disjoint ? Match.NO : Match.MAYBE;
  }
}
@Override
public final boolean matches() throws IOException {
return switch (approximation.match) {

View File

@ -18,6 +18,7 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
@ -166,27 +167,29 @@ public final class DocValuesRewriteMethod extends MultiTermQuery.RewriteMethod {
return new ConstantScoreScorer(score(), scoreMode, DocIdSetIterator.empty());
}
// Leverage a DV skipper if one was indexed for the field:
DocValuesSkipper skipper = context.reader().getDocValuesSkipper(query.field);
// Create a bit set for the "term set" ordinals (these are the terms provided by the
// query that are actually present in the doc values field). Cannot use FixedBitSet
// because we require long index (ord):
final LongBitSet termSet = new LongBitSet(values.getValueCount());
long minOrd = termsEnum.ord();
assert minOrd >= 0;
long maxOrd = -1;
do {
long ord = termsEnum.ord();
if (ord >= 0) {
assert ord > maxOrd;
maxOrd = ord;
termSet.set(ord);
}
assert ord >= 0 && ord > maxOrd;
maxOrd = ord;
termSet.set(ord);
} while (termsEnum.next() != null);
// no terms matched in this segment
if (maxOrd < 0) {
if (skipper != null && (minOrd > skipper.maxValue() || maxOrd < skipper.minValue())) {
return new ConstantScoreScorer(score(), scoreMode, DocIdSetIterator.empty());
}
final SortedDocValues singleton = DocValues.unwrapSingleton(values);
final TwoPhaseIterator iterator;
TwoPhaseIterator iterator;
final long max = maxOrd;
if (singleton != null) {
iterator =
@ -224,6 +227,9 @@ public final class DocValuesRewriteMethod extends MultiTermQuery.RewriteMethod {
};
}
if (skipper != null) {
iterator = new DocValuesRangeIterator(iterator, skipper, minOrd, maxOrd, true);
}
return new ConstantScoreScorer(score(), scoreMode, iterator);
}

View File

@ -1,273 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.tests.util.LuceneTestCase;
public class TestDocValuesRangeIterator extends LuceneTestCase {
public void testSingleLevel() throws IOException {
doTestBasics(false);
}
public void testMultipleLevels() throws IOException {
doTestBasics(true);
}
private void doTestBasics(boolean doLevels) throws IOException {
long queryMin = 10;
long queryMax = 20;
// Fake numeric doc values so that:
// docs 0-256 all match
// docs in 256-512 are all greater than queryMax
// docs in 512-768 are all less than queryMin
// docs in 768-1024 have some docs that match the range, others not
// docs in 1024-2048 follow a similar pattern as docs in 0-1024 except that not all docs have a
// value
NumericDocValues values =
new NumericDocValues() {
int doc = -1;
@Override
public boolean advanceExact(int target) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int advance(int target) throws IOException {
if (target < 1024) {
// dense up to 1024
return doc = target;
} else if (doc < 2047) {
// 50% docs have a value up to 2048
return doc = target + (target & 1);
} else {
return doc = DocIdSetIterator.NO_MORE_DOCS;
}
}
@Override
public long longValue() throws IOException {
int d = doc % 1024;
if (d < 128) {
return (queryMin + queryMax) >> 1;
} else if (d < 256) {
return queryMax + 1;
} else if (d < 512) {
return queryMin - 1;
} else {
return switch ((d / 2) % 3) {
case 0 -> queryMin - 1;
case 1 -> queryMax + 1;
case 2 -> (queryMin + queryMax) >> 1;
default -> throw new AssertionError();
};
}
}
@Override
public long cost() {
return 42;
}
};
AtomicBoolean twoPhaseCalled = new AtomicBoolean();
TwoPhaseIterator twoPhase =
new TwoPhaseIterator(values) {
@Override
public boolean matches() throws IOException {
twoPhaseCalled.set(true);
long v = values.longValue();
return v >= queryMin && v <= queryMax;
}
@Override
public float matchCost() {
return 2f; // 2 comparisons
}
};
DocValuesSkipper skipper =
new DocValuesSkipper() {
int doc = -1;
@Override
public void advance(int target) throws IOException {
doc = target;
}
@Override
public int numLevels() {
return doLevels ? 3 : 1;
}
@Override
public int minDocID(int level) {
int rangeLog = 9 - numLevels() + level;
// the level is the log2 of the interval
if (doc < 0) {
return -1;
} else if (doc >= 2048) {
return DocIdSetIterator.NO_MORE_DOCS;
} else {
int mask = (1 << rangeLog) - 1;
// prior multiple of 2^level
return doc & ~mask;
}
}
@Override
public int maxDocID(int level) {
int rangeLog = 9 - numLevels() + level;
int minDocID = minDocID(level);
return switch (minDocID) {
case -1 -> -1;
case DocIdSetIterator.NO_MORE_DOCS -> DocIdSetIterator.NO_MORE_DOCS;
default -> minDocID + (1 << rangeLog) - 1;
};
}
@Override
public long minValue(int level) {
int d = doc % 1024;
if (d < 128) {
return queryMin;
} else if (d < 256) {
return queryMax + 1;
} else if (d < 768) {
return queryMin - 1;
} else {
return queryMin - 1;
}
}
@Override
public long maxValue(int level) {
int d = doc % 1024;
if (d < 128) {
return queryMax;
} else if (d < 256) {
return queryMax + 1;
} else if (d < 768) {
return queryMin - 1;
} else {
return queryMax + 1;
}
}
@Override
public int docCount(int level) {
int rangeLog = 9 - numLevels() + level;
if (doc < 1024) {
return 1 << rangeLog;
} else {
// half docs have a value
return 1 << rangeLog >> 1;
}
}
@Override
public long minValue() {
return Long.MIN_VALUE;
}
@Override
public long maxValue() {
return Long.MAX_VALUE;
}
@Override
public int docCount() {
return 1024 + 1024 / 2;
}
};
DocValuesRangeIterator rangeIterator =
new DocValuesRangeIterator(twoPhase, skipper, queryMin, queryMax);
DocValuesRangeIterator.Approximation rangeApproximation =
(DocValuesRangeIterator.Approximation) rangeIterator.approximation();
assertEquals(100, rangeApproximation.advance(100));
assertEquals(DocValuesRangeIterator.Match.YES, rangeApproximation.match);
assertEquals(255, rangeApproximation.upTo);
assertTrue(rangeIterator.matches());
assertTrue(values.docID() < rangeApproximation.docID()); // we did not advance doc values
assertFalse(twoPhaseCalled.get());
assertEquals(768, rangeApproximation.advance(300));
assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximation.match);
if (doLevels) {
assertEquals(831, rangeApproximation.upTo);
} else {
assertEquals(1023, rangeApproximation.upTo);
}
for (int i = 0; i < 10; ++i) {
assertEquals(values.docID(), rangeApproximation.docID());
assertEquals(twoPhase.matches(), rangeIterator.matches());
assertTrue(twoPhaseCalled.get());
twoPhaseCalled.set(false);
rangeApproximation.nextDoc();
}
assertEquals(1100, rangeApproximation.advance(1099));
assertEquals(DocValuesRangeIterator.Match.IF_DOC_HAS_VALUE, rangeApproximation.match);
assertEquals(1024 + 256 - 1, rangeApproximation.upTo);
assertEquals(values.docID(), rangeApproximation.docID());
assertTrue(rangeIterator.matches());
assertFalse(twoPhaseCalled.get());
assertEquals(1024 + 768, rangeApproximation.advance(1024 + 300));
assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximation.match);
if (doLevels) {
assertEquals(1024 + 831, rangeApproximation.upTo);
} else {
assertEquals(2047, rangeApproximation.upTo);
}
for (int i = 0; i < 10; ++i) {
assertEquals(values.docID(), rangeApproximation.docID());
assertEquals(twoPhase.matches(), rangeIterator.matches());
assertTrue(twoPhaseCalled.get());
twoPhaseCalled.set(false);
rangeApproximation.nextDoc();
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, rangeApproximation.advance(2048));
}
}

View File

@ -0,0 +1,332 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.tests.util.LuceneTestCase;
public class TestDocValuesRangeIterator extends LuceneTestCase {
/** Runs the shared scenario with a fake skipper that exposes a single level. */
public void testSingleLevel() throws IOException {
  final boolean doLevels = false;
  doTestBasics(doLevels);
}
/** Runs the shared scenario with a fake skipper that exposes three levels. */
public void testMultipleLevels() throws IOException {
  final boolean doLevels = true;
  doTestBasics(doLevels);
}
/**
 * Drives two {@link DocValuesRangeIterator}s over identical fake doc values and skippers: one
 * built with {@code queryRangeHasGaps == false} and one with {@code true}. At each step it checks
 * the block classification ({@code match}), the end of the current block ({@code upTo}) and
 * whether the wrapped two-phase iterator had to be consulted.
 *
 * @param doLevels whether the fake skipper exposes three levels instead of one
 */
private void doTestBasics(boolean doLevels) throws IOException {
long queryMin = 10;
long queryMax = 20;
// Test with both gaps and no-gaps in the ranges:
// Each variant gets its own doc values, two-phase iterator, call flag and skipper because all
// of them are stateful.
NumericDocValues values = docValues(queryMin, queryMax);
NumericDocValues values2 = docValues(queryMin, queryMax);
AtomicBoolean twoPhaseCalled = new AtomicBoolean();
TwoPhaseIterator twoPhase = twoPhaseIterator(values, queryMin, queryMax, twoPhaseCalled);
AtomicBoolean twoPhaseCalled2 = new AtomicBoolean();
TwoPhaseIterator twoPhase2 = twoPhaseIterator(values2, queryMin, queryMax, twoPhaseCalled2);
DocValuesSkipper skipper = docValuesSkipper(queryMin, queryMax, doLevels);
DocValuesSkipper skipper2 = docValuesSkipper(queryMin, queryMax, doLevels);
DocValuesRangeIterator rangeIterator =
new DocValuesRangeIterator(twoPhase, skipper, queryMin, queryMax, false);
DocValuesRangeIterator rangeIteratorWithGaps =
new DocValuesRangeIterator(twoPhase2, skipper2, queryMin, queryMax, true);
DocValuesRangeIterator.Approximation rangeApproximation =
(DocValuesRangeIterator.Approximation) rangeIterator.approximation();
DocValuesRangeIterator.Approximation rangeApproximationWithGaps =
(DocValuesRangeIterator.Approximation) rangeIteratorWithGaps.approximation();
// Doc 100: the fake skipper reports [queryMin, queryMax] at this position, so the no-gaps
// variant classifies the block as YES while the with-gaps variant can only say MAYBE.
assertEquals(100, rangeApproximation.advance(100));
assertEquals(100, rangeApproximationWithGaps.advance(100));
assertEquals(DocValuesRangeIterator.Match.YES, rangeApproximation.match);
assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximationWithGaps.match);
assertEquals(255, rangeApproximation.upTo);
// With three levels the fake skipper's level-0 block holding doc 100 is docs 64-127; with a
// single level it is docs 0-255.
if (doLevels) {
assertEquals(127, rangeApproximationWithGaps.upTo);
} else {
assertEquals(255, rangeApproximationWithGaps.upTo);
}
assertTrue(rangeIterator.matches());
assertTrue(rangeIteratorWithGaps.matches());
assertTrue(values.docID() < rangeApproximation.docID()); // we did not advance doc values
assertEquals(
values2.docID(), rangeApproximationWithGaps.docID()); // we _did_ advance doc values
// YES was answered without the wrapped two-phase iterator; MAYBE had to call it.
assertFalse(twoPhaseCalled.get());
assertTrue(twoPhaseCalled2.get());
twoPhaseCalled2.set(false);
// Docs 300-767 are reported as entirely outside the query bounds by the fake skipper, so both
// variants skip ahead to doc 768, where the reported bounds straddle the query range.
assertEquals(768, rangeApproximation.advance(300));
assertEquals(768, rangeApproximationWithGaps.advance(300));
assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximation.match);
assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximationWithGaps.match);
if (doLevels) {
assertEquals(831, rangeApproximation.upTo);
assertEquals(831, rangeApproximationWithGaps.upTo);
} else {
assertEquals(1023, rangeApproximation.upTo);
assertEquals(1023, rangeApproximationWithGaps.upTo);
}
// Inside a MAYBE block both variants must agree, doc by doc, with the wrapped two-phase
// iterator.
// NOTE(review): twoPhase.matches() / twoPhase2.matches() are invoked directly as the expected
// value below, which sets the flags by itself; the assertTrue checks on the flags in this loop
// therefore pass regardless of what the range iterators did.
for (int i = 0; i < 10; ++i) {
assertEquals(values.docID(), rangeApproximation.docID());
assertEquals(values2.docID(), rangeApproximationWithGaps.docID());
assertEquals(twoPhase.matches(), rangeIterator.matches());
assertEquals(twoPhase2.matches(), rangeIteratorWithGaps.matches());
assertTrue(twoPhaseCalled.get());
assertTrue(twoPhaseCalled2.get());
twoPhaseCalled.set(false);
twoPhaseCalled2.set(false);
rangeApproximation.nextDoc();
rangeApproximationWithGaps.nextDoc();
}
// From doc 1024 on only even docs have a value in the fake doc values, so advancing to 1099
// lands on 1100. The no-gaps variant reports IF_DOC_HAS_VALUE for this block, the with-gaps
// variant again MAYBE.
assertEquals(1100, rangeApproximation.advance(1099));
assertEquals(1100, rangeApproximationWithGaps.advance(1099));
assertEquals(DocValuesRangeIterator.Match.IF_DOC_HAS_VALUE, rangeApproximation.match);
assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximationWithGaps.match);
assertEquals(1024 + 256 - 1, rangeApproximation.upTo);
if (doLevels) {
assertEquals(1024 + 128 - 1, rangeApproximationWithGaps.upTo);
} else {
assertEquals(1024 + 256 - 1, rangeApproximationWithGaps.upTo);
}
assertEquals(values.docID(), rangeApproximation.docID());
assertEquals(values2.docID(), rangeApproximationWithGaps.docID());
assertTrue(rangeIterator.matches());
assertTrue(rangeIteratorWithGaps.matches());
// IF_DOC_HAS_VALUE was answered without the wrapped two-phase iterator; MAYBE had to call it.
assertFalse(twoPhaseCalled.get());
assertTrue(twoPhaseCalled2.get());
twoPhaseCalled2.set(false);
// Same skip as the 300 -> 768 step above, shifted into the sparse half of the doc ID space.
assertEquals(1024 + 768, rangeApproximation.advance(1024 + 300));
assertEquals(1024 + 768, rangeApproximationWithGaps.advance(1024 + 300));
assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximation.match);
assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximationWithGaps.match);
if (doLevels) {
assertEquals(1024 + 831, rangeApproximation.upTo);
assertEquals(1024 + 831, rangeApproximationWithGaps.upTo);
} else {
assertEquals(2047, rangeApproximation.upTo);
assertEquals(2047, rangeApproximationWithGaps.upTo);
}
// NOTE(review): as in the first loop, the flag assertions here are satisfied by the direct
// twoPhase.matches() / twoPhase2.matches() calls.
for (int i = 0; i < 10; ++i) {
assertEquals(values.docID(), rangeApproximation.docID());
assertEquals(values2.docID(), rangeApproximationWithGaps.docID());
assertEquals(twoPhase.matches(), rangeIterator.matches());
assertEquals(twoPhase2.matches(), rangeIteratorWithGaps.matches());
assertTrue(twoPhaseCalled.get());
assertTrue(twoPhaseCalled2.get());
twoPhaseCalled.set(false);
twoPhaseCalled2.set(false);
rangeApproximation.nextDoc();
rangeApproximationWithGaps.nextDoc();
}
// The fake skipper reports NO_MORE_DOCS from doc 2048 on, which exhausts both variants.
assertEquals(DocIdSetIterator.NO_MORE_DOCS, rangeApproximation.advance(2048));
assertEquals(DocIdSetIterator.NO_MORE_DOCS, rangeApproximationWithGaps.advance(2048));
}
// Fake numeric doc values. With d = doc % 1024 the value of a doc is:
//   d in [0, 128)    -> the midpoint of [queryMin, queryMax] (matches the range)
//   d in [128, 256)  -> queryMax + 1 (above the range)
//   d in [256, 512)  -> queryMin - 1 (below the range)
//   d in [512, 1024) -> cycles below / above / inside the range, switching every two docs
// Docs 0-1023 all have a value; in docs 1024-2047 only the even docs have a value; there are no
// docs from 2048 on.
private static NumericDocValues docValues(long queryMin, long queryMax) {
  return new NumericDocValues() {
    int doc = -1;

    @Override
    public boolean advanceExact(int target) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public int nextDoc() throws IOException {
      return advance(doc + 1);
    }

    @Override
    public int advance(int target) throws IOException {
      if (target < 1024) {
        // dense up to 1024
        return doc = target;
      }
      if (target < 2048) {
        // 50% docs have a value up to 2048: odd targets round up to the next even doc
        int next = target + (target & 1);
        if (next < 2048) {
          return doc = next;
        }
      }
      // Exhaustion is decided from the target, not from the previous position, so that jumping
      // straight past the last doc reports NO_MORE_DOCS as the iterator contract requires.
      return doc = DocIdSetIterator.NO_MORE_DOCS;
    }

    @Override
    public long longValue() throws IOException {
      int d = doc % 1024;
      if (d < 128) {
        return (queryMin + queryMax) >> 1;
      } else if (d < 256) {
        return queryMax + 1;
      } else if (d < 512) {
        return queryMin - 1;
      } else {
        return switch ((d / 2) % 3) {
          case 0 -> queryMin - 1;
          case 1 -> queryMax + 1;
          case 2 -> (queryMin + queryMax) >> 1;
          default -> throw new AssertionError();
        };
      }
    }

    @Override
    public long cost() {
      return 42;
    }
  };
}
/**
 * Wraps {@code values} in a two-phase iterator that accepts a doc when its current value lies in
 * {@code [queryMin, queryMax]}. Every {@code matches()} call sets {@code twoPhaseCalled} to true
 * so that callers can observe whether the check was executed.
 */
private static TwoPhaseIterator twoPhaseIterator(
    NumericDocValues values, long queryMin, long queryMax, AtomicBoolean twoPhaseCalled) {
  return new TwoPhaseIterator(values) {
    @Override
    public boolean matches() throws IOException {
      // Record the call first, then evaluate the value of the current doc.
      twoPhaseCalled.set(true);
      final long current = values.longValue();
      if (current < queryMin) {
        return false;
      }
      return current <= queryMax;
    }

    @Override
    public float matchCost() {
      return 2f; // 2 comparisons
    }
  };
}
/**
 * Builds a fake {@link DocValuesSkipper} over docs 0-2047. With {@code doLevels} it exposes three
 * levels whose blocks span 64, 128 and 256 docs; otherwise a single level of 256-doc blocks. The
 * per-level min/max values ignore the level and depend only on the doc last passed to {@code
 * advance}.
 */
private static DocValuesSkipper docValuesSkipper(long queryMin, long queryMax, boolean doLevels) {
  return new DocValuesSkipper() {
    int doc = -1;

    @Override
    public void advance(int target) throws IOException {
      doc = target;
    }

    @Override
    public int numLevels() {
      return doLevels ? 3 : 1;
    }

    @Override
    public int minDocID(int level) {
      if (doc < 0) {
        return -1;
      }
      if (doc >= 2048) {
        return DocIdSetIterator.NO_MORE_DOCS;
      }
      // The block size is a power of two that doubles with each level; round the current doc
      // down to the start of its block.
      final int blockSize = 1 << (9 - numLevels() + level);
      return doc & ~(blockSize - 1);
    }

    @Override
    public int maxDocID(int level) {
      final int firstDoc = minDocID(level);
      // Not positioned yet (-1) or exhausted (NO_MORE_DOCS): the block end is the same sentinel.
      if (firstDoc == -1 || firstDoc == DocIdSetIterator.NO_MORE_DOCS) {
        return firstDoc;
      }
      return firstDoc + (1 << (9 - numLevels() + level)) - 1;
    }

    @Override
    public long minValue(int level) {
      final int d = doc % 1024;
      if (d < 128) {
        return queryMin;
      }
      if (d < 256) {
        return queryMax + 1;
      }
      // Both remaining regions, [256, 768) and [768, 1024), report the same lower bound.
      return queryMin - 1;
    }

    @Override
    public long maxValue(int level) {
      final int d = doc % 1024;
      if (d < 128) {
        return queryMax;
      }
      if (d < 256) {
        return queryMax + 1;
      }
      return d < 768 ? queryMin - 1 : queryMax + 1;
    }

    @Override
    public int docCount(int level) {
      final int blockSize = 1 << (9 - numLevels() + level);
      // From doc 1024 on, half docs have a value.
      return doc < 1024 ? blockSize : blockSize >> 1;
    }

    @Override
    public long minValue() {
      return Long.MIN_VALUE;
    }

    @Override
    public long maxValue() {
      return Long.MAX_VALUE;
    }

    @Override
    public int docCount() {
      return 1024 + 1024 / 2;
    }
  };
}
}

View File

@ -41,8 +41,7 @@ import org.apache.lucene.util.automaton.RegExp;
/** Tests the DocValuesRewriteMethod */
public class TestDocValuesRewriteMethod extends LuceneTestCase {
protected IndexSearcher searcher1;
protected IndexSearcher searcher2;
protected IndexSearcher searcher;
private IndexReader reader;
private Directory dir;
protected String fieldName;
@ -69,6 +68,7 @@ public class TestDocValuesRewriteMethod extends LuceneTestCase {
String s = TestUtil.randomUnicodeString(random());
doc.add(newStringField(fieldName, s, Field.Store.NO));
doc.add(new SortedSetDocValuesField(fieldName, new BytesRef(s)));
doc.add(SortedSetDocValuesField.indexedField(fieldName + "_with-skip", new BytesRef(s)));
terms.add(s);
}
writer.addDocument(doc);
@ -89,8 +89,7 @@ public class TestDocValuesRewriteMethod extends LuceneTestCase {
}
reader = writer.getReader();
searcher1 = newSearcher(reader);
searcher2 = newSearcher(reader);
searcher = newSearcher(reader);
writer.close();
}
@ -123,12 +122,22 @@ public class TestDocValuesRewriteMethod extends LuceneTestCase {
name -> null,
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT,
new DocValuesRewriteMethod());
RegexpQuery docValuesWithSkip =
new RegexpQuery(
new Term(fieldName + "_with-skip", regexp),
RegExp.NONE,
0,
name -> null,
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT,
new DocValuesRewriteMethod());
RegexpQuery inverted = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
TopDocs invertedDocs = searcher1.search(inverted, 25);
TopDocs docValuesDocs = searcher2.search(docValues, 25);
TopDocs invertedDocs = searcher.search(inverted, 25);
TopDocs docValuesDocs = searcher.search(docValues, 25);
TopDocs docValuesWithSkipDocs = searcher.search(docValuesWithSkip, 25);
CheckHits.checkEqual(inverted, invertedDocs.scoreDocs, docValuesDocs.scoreDocs);
CheckHits.checkEqual(inverted, invertedDocs.scoreDocs, docValuesWithSkipDocs.scoreDocs);
}
public void testEquals() throws Exception {

View File

@ -31,6 +31,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.KeywordField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FilterDirectoryReader;
@ -119,11 +120,14 @@ public class TestTermInSetQuery extends LuceneTestCase {
}
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
final int numDocs = atLeast(100);
final int numDocs = atLeast(10_000);
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
final BytesRef term = allTerms.get(random().nextInt(allTerms.size()));
doc.add(new StringField(field, term, Store.NO));
// Also include a doc values field with a skip-list so we can test doc-value rewrite as
// well:
doc.add(SortedSetDocValuesField.indexedField(field, term));
iw.addDocument(doc);
}
if (numTerms > 1 && random().nextBoolean()) {
@ -154,7 +158,9 @@ public class TestTermInSetQuery extends LuceneTestCase {
}
final Query q1 = new ConstantScoreQuery(bq.build());
final Query q2 = new TermInSetQuery(field, queryTerms);
final Query q3 = new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, queryTerms);
assertSameMatches(searcher, new BoostQuery(q1, boost), new BoostQuery(q2, boost), true);
assertSameMatches(searcher, new BoostQuery(q1, boost), new BoostQuery(q3, boost), false);
}
reader.close();
@ -225,6 +231,53 @@ public class TestTermInSetQuery extends LuceneTestCase {
}
}
/**
 * Make sure the doc values skipper optimization does not wrongly assume that the terms of a
 * TermInSetQuery form a continuous range between the smallest and the largest query term: a
 * block whose values all fall inside that span may still contain only non-matching terms.
 */
public void testSkipperOptimizationGapAssumption() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir);

  // Index the first 10,000 docs all with the term "b" to get some skip list blocks with the range
  // [b, b]:
  for (int i = 0; i < 10_000; i++) {
    BytesRef middleTerm = new BytesRef("b");
    Document middleDoc = new Document();
    middleDoc.add(new SortedSetDocValuesField("field", middleTerm));
    middleDoc.add(SortedSetDocValuesField.indexedField("idx_field", middleTerm));
    iw.addDocument(middleDoc);
  }

  // Index a couple more docs with terms "a" and "c", in that order:
  for (String edge : new String[] {"a", "c"}) {
    BytesRef edgeTerm = new BytesRef(edge);
    Document edgeDoc = new Document();
    edgeDoc.add(new SortedSetDocValuesField("field", edgeTerm));
    edgeDoc.add(SortedSetDocValuesField.indexedField("idx_field", edgeTerm));
    iw.addDocument(edgeDoc);
  }

  iw.commit();
  IndexReader reader = iw.getReader();
  IndexSearcher searcher = newSearcher(reader);
  iw.close();

  // Our query is for (or "a" "c") which should use a skip-list optimization to exclude blocks of
  // documents that fall outside the range [a, c]. We want to test that they don't incorrectly do
  // the inverse and include all docs in a block that fall within [a, c] (which is why we have
  // blocks of only "b" docs up-front). The field without a skip list serves as the reference:
  List<BytesRef> queryTerms = List.of(new BytesRef("a"), new BytesRef("c"));
  Query withoutSkipper = new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, "field", queryTerms);
  Query withSkipper =
      new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, "idx_field", queryTerms);
  assertSameMatches(searcher, withoutSkipper, withSkipper, false);

  reader.close();
  dir.close();
}
private void assertSameMatches(IndexSearcher searcher, Query q1, Query q2, boolean scores)
throws IOException {
final int maxDoc = searcher.getIndexReader().maxDoc();