mirror of https://github.com/apache/lucene.git
Speed up docvalues set query by making use of sortedness (#12128)
LongHashSet is used for the set of numbers, but it has some issues: * tries to hard to extend AbstractSet, mostly for testing * causes traps with boxing if you aren't careful * complex hashcode/equals Practically we should take advantage of the fact numbers come in sorted order for multivalued fields: just like range queries do. So we use min/max to our advantage, including termination of docvalues iteration Actually it is generally a win to just check min/max even in the single-valued case: these constant time comparisons are cheap and can avoid hashing, etc. In the worst-case, if all of your query Sets contain both the minimum and maximum possible values, then it won't help, but it doesn't hurt either.
This commit is contained in:
parent
a6bceb7cf0
commit
10d9c7440b
|
@ -16,15 +16,16 @@
|
|||
*/
|
||||
package org.apache.lucene.document;
|
||||
|
||||
import java.util.AbstractSet;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
final class LongHashSet extends AbstractSet<Long> implements Accountable {
|
||||
/** Set of longs, optimized for docvalues usage */
|
||||
final class LongHashSet implements Accountable {
|
||||
private static final long BASE_RAM_BYTES =
|
||||
RamUsageEstimator.shallowSizeOfInstance(LongHashSet.class);
|
||||
|
||||
|
@ -34,8 +35,12 @@ final class LongHashSet extends AbstractSet<Long> implements Accountable {
|
|||
final int mask;
|
||||
final boolean hasMissingValue;
|
||||
final int size;
|
||||
final int hashCode;
|
||||
/** minimum value in the set, or Long.MAX_VALUE for an empty set */
|
||||
final long minValue;
|
||||
/** maximum value in the set, or Long.MIN_VALUE for an empty set */
|
||||
final long maxValue;
|
||||
|
||||
/** Construct a set. Values must be in sorted order. */
|
||||
LongHashSet(long[] values) {
|
||||
int tableSize = Math.toIntExact(values.length * 3L / 2);
|
||||
tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2
|
||||
|
@ -45,19 +50,21 @@ final class LongHashSet extends AbstractSet<Long> implements Accountable {
|
|||
mask = tableSize - 1;
|
||||
boolean hasMissingValue = false;
|
||||
int size = 0;
|
||||
int hashCode = 0;
|
||||
long previousValue = Long.MIN_VALUE; // for assert
|
||||
for (long value : values) {
|
||||
if (value == MISSING || add(value)) {
|
||||
if (value == MISSING) {
|
||||
hasMissingValue = true;
|
||||
}
|
||||
++size;
|
||||
hashCode += Long.hashCode(value);
|
||||
}
|
||||
assert value >= previousValue : "values must be provided in sorted order";
|
||||
previousValue = value;
|
||||
}
|
||||
this.hasMissingValue = hasMissingValue;
|
||||
this.size = size;
|
||||
this.hashCode = hashCode;
|
||||
this.minValue = values.length == 0 ? Long.MAX_VALUE : values[0];
|
||||
this.maxValue = values.length == 0 ? Long.MIN_VALUE : values[values.length - 1];
|
||||
}
|
||||
|
||||
private boolean add(long l) {
|
||||
|
@ -74,6 +81,12 @@ final class LongHashSet extends AbstractSet<Long> implements Accountable {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* check for membership in the set.
|
||||
*
|
||||
* <p>You should use {@link #minValue} and {@link #maxValue} to guide/terminate iteration before
|
||||
* calling this.
|
||||
*/
|
||||
boolean contains(long l) {
|
||||
if (l == MISSING) {
|
||||
return hasMissingValue;
|
||||
|
@ -88,33 +101,49 @@ final class LongHashSet extends AbstractSet<Long> implements Accountable {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return hashCode;
|
||||
return Objects.hash(size, minValue, maxValue, mask, hasMissingValue, Arrays.hashCode(table));
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (obj != null && obj.getClass() == LongHashSet.class) {
|
||||
if (obj != null && obj instanceof LongHashSet) {
|
||||
LongHashSet that = (LongHashSet) obj;
|
||||
if (hashCode != that.hashCode
|
||||
|| size != that.size
|
||||
|| hasMissingValue != that.hasMissingValue) {
|
||||
return false;
|
||||
}
|
||||
for (long v : table) {
|
||||
if (v != MISSING && that.contains(v) == false) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
return size == that.size
|
||||
&& minValue == that.minValue
|
||||
&& maxValue == that.maxValue
|
||||
&& mask == that.mask
|
||||
&& hasMissingValue == that.hasMissingValue
|
||||
&& Arrays.equals(table, that.table);
|
||||
}
|
||||
return super.equals(obj);
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder("[");
|
||||
boolean seenValue = false;
|
||||
if (hasMissingValue) {
|
||||
sb.append(MISSING);
|
||||
seenValue = true;
|
||||
}
|
||||
for (long v : table) {
|
||||
if (v != MISSING) {
|
||||
if (seenValue) {
|
||||
sb.append(", ");
|
||||
}
|
||||
sb.append(v);
|
||||
seenValue = true;
|
||||
}
|
||||
}
|
||||
sb.append("]");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/** number of elements in the set */
|
||||
int size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -122,41 +151,17 @@ final class LongHashSet extends AbstractSet<Long> implements Accountable {
|
|||
return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean contains(Object o) {
|
||||
return o instanceof Long && contains(((Long) o).longValue());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<Long> iterator() {
|
||||
return new Iterator<Long>() {
|
||||
|
||||
private boolean hasNext = hasMissingValue;
|
||||
private int i = -1;
|
||||
private long value = MISSING;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (hasNext) {
|
||||
return true;
|
||||
}
|
||||
while (++i < table.length) {
|
||||
value = table[i];
|
||||
if (value != MISSING) {
|
||||
return hasNext = true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
// for testing only
|
||||
Set<Long> toSet() {
|
||||
Set<Long> set = new HashSet<>();
|
||||
if (hasMissingValue) {
|
||||
set.add(MISSING);
|
||||
}
|
||||
for (long v : table) {
|
||||
if (v != MISSING) {
|
||||
set.add(v);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long next() {
|
||||
if (hasNext() == false) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
hasNext = false;
|
||||
return value;
|
||||
}
|
||||
};
|
||||
}
|
||||
return set;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.index.DocValues;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
|
@ -45,6 +46,7 @@ final class SortedNumericDocValuesSetQuery extends Query implements Accountable
|
|||
|
||||
SortedNumericDocValuesSetQuery(String field, long[] numbers) {
|
||||
this.field = Objects.requireNonNull(field);
|
||||
Arrays.sort(numbers);
|
||||
this.numbers = new LongHashSet(numbers);
|
||||
}
|
||||
|
||||
|
@ -112,12 +114,15 @@ final class SortedNumericDocValuesSetQuery extends Query implements Accountable
|
|||
new TwoPhaseIterator(singleton) {
|
||||
@Override
|
||||
public boolean matches() throws IOException {
|
||||
return numbers.contains(singleton.longValue());
|
||||
long value = singleton.longValue();
|
||||
return value >= numbers.minValue
|
||||
&& value <= numbers.maxValue
|
||||
&& numbers.contains(value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float matchCost() {
|
||||
return 5; // lookup in the set
|
||||
return 5; // 2 comparisions, possible lookup in the set
|
||||
}
|
||||
};
|
||||
} else {
|
||||
|
@ -127,7 +132,12 @@ final class SortedNumericDocValuesSetQuery extends Query implements Accountable
|
|||
public boolean matches() throws IOException {
|
||||
int count = values.docValueCount();
|
||||
for (int i = 0; i < count; i++) {
|
||||
if (numbers.contains(values.nextValue())) {
|
||||
final long value = values.nextValue();
|
||||
if (value < numbers.minValue) {
|
||||
continue;
|
||||
} else if (value > numbers.maxValue) {
|
||||
return false; // values are sorted, terminate
|
||||
} else if (numbers.contains(value)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -136,7 +146,7 @@ final class SortedNumericDocValuesSetQuery extends Query implements Accountable
|
|||
|
||||
@Override
|
||||
public float matchCost() {
|
||||
return 5; // lookup in the set
|
||||
return 5; // 2 comparisons, possible lookup in the set
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
@ -25,11 +25,10 @@ import org.apache.lucene.tests.util.LuceneTestCase;
|
|||
|
||||
public class TestLongHashSet extends LuceneTestCase {
|
||||
|
||||
private void assertEquals(Set<Long> set1, LongHashSet set2) {
|
||||
private void assertEquals(Set<Long> set1, LongHashSet longHashSet) {
|
||||
Set<Long> set2 = longHashSet.toSet();
|
||||
|
||||
LuceneTestCase.assertEquals(set1, set2);
|
||||
LuceneTestCase.assertEquals(set2, set1);
|
||||
LuceneTestCase.assertEquals(set2, set2);
|
||||
assertEquals(set1.hashCode(), set2.hashCode());
|
||||
|
||||
if (set1.isEmpty() == false) {
|
||||
Set<Long> set3 = new HashSet<>(set1);
|
||||
|
@ -40,40 +39,53 @@ public class TestLongHashSet extends LuceneTestCase {
|
|||
break;
|
||||
}
|
||||
}
|
||||
assertNotEquals(set3, set2);
|
||||
assertNotEquals(set3, longHashSet);
|
||||
}
|
||||
}
|
||||
|
||||
private void assertNotEquals(Set<Long> set1, LongHashSet set2) {
|
||||
assertFalse(set1.equals(set2));
|
||||
assertFalse(set2.equals(set1));
|
||||
LongHashSet set3 = new LongHashSet(set1.stream().mapToLong(Long::longValue).toArray());
|
||||
assertFalse(set2.equals(set3));
|
||||
private void assertNotEquals(Set<Long> set1, LongHashSet longHashSet) {
|
||||
Set<Long> set2 = longHashSet.toSet();
|
||||
|
||||
LuceneTestCase.assertNotEquals(set1, set2);
|
||||
|
||||
LongHashSet set3 = new LongHashSet(set1.stream().mapToLong(Long::longValue).sorted().toArray());
|
||||
|
||||
LuceneTestCase.assertNotEquals(set2, set3.toSet());
|
||||
}
|
||||
|
||||
public void testEmpty() {
|
||||
Set<Long> set1 = new HashSet<>();
|
||||
LongHashSet set2 = new LongHashSet(new long[] {});
|
||||
assertEquals(Long.MAX_VALUE, set2.minValue);
|
||||
assertEquals(Long.MIN_VALUE, set2.maxValue);
|
||||
assertEquals(set1, set2);
|
||||
}
|
||||
|
||||
public void testOneValue() {
|
||||
Set<Long> set1 = new HashSet<>(Arrays.asList(42L));
|
||||
LongHashSet set2 = new LongHashSet(new long[] {42L});
|
||||
assertEquals(42L, set2.minValue);
|
||||
assertEquals(42L, set2.maxValue);
|
||||
assertEquals(set1, set2);
|
||||
|
||||
set1 = new HashSet<>(Arrays.asList(Long.MIN_VALUE));
|
||||
set2 = new LongHashSet(new long[] {Long.MIN_VALUE});
|
||||
assertEquals(Long.MIN_VALUE, set2.minValue);
|
||||
assertEquals(Long.MIN_VALUE, set2.maxValue);
|
||||
assertEquals(set1, set2);
|
||||
}
|
||||
|
||||
public void testTwoValues() {
|
||||
Set<Long> set1 = new HashSet<>(Arrays.asList(42L, Long.MAX_VALUE));
|
||||
LongHashSet set2 = new LongHashSet(new long[] {42L, Long.MAX_VALUE});
|
||||
assertEquals(42, set2.minValue);
|
||||
assertEquals(Long.MAX_VALUE, set2.maxValue);
|
||||
assertEquals(set1, set2);
|
||||
|
||||
set1 = new HashSet<>(Arrays.asList(Long.MIN_VALUE, 42L));
|
||||
set2 = new LongHashSet(new long[] {Long.MIN_VALUE, 42L});
|
||||
assertEquals(Long.MIN_VALUE, set2.minValue);
|
||||
assertEquals(42, set2.maxValue);
|
||||
assertEquals(set1, set2);
|
||||
}
|
||||
|
||||
|
@ -95,6 +107,7 @@ public class TestLongHashSet extends LuceneTestCase {
|
|||
LongStream.of(values)
|
||||
.mapToObj(Long::valueOf)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
Arrays.sort(values);
|
||||
LongHashSet set2 = new LongHashSet(values);
|
||||
assertEquals(set1, set2);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue