LUCENE-9613: Encode ordinals like numerics. (#186)

This helps simplify the code, and also adds some optimizations for ordinals,
such as better compression for long runs of equal values or for fields that are
used in index sorts.
This commit is contained in:
Adrien Grand 2021-06-23 15:37:50 +02:00 committed by GitHub
parent 495bf6730f
commit 1d5d458960
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 197 additions and 348 deletions

View File

@ -34,6 +34,7 @@ import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.EmptyDocValuesProducer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
@ -49,6 +50,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.compress.LZ4;
@ -466,54 +468,47 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
private void doAddSortedField(FieldInfo field, DocValuesProducer valuesProducer)
throws IOException {
SortedDocValues values = valuesProducer.getSorted(field);
int numDocsWithField = 0;
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
numDocsWithField++;
}
writeValues(
field,
new EmptyDocValuesProducer() {
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
SortedDocValues sorted = valuesProducer.getSorted(field);
NumericDocValues sortedOrds =
new NumericDocValues() {
@Override
public long longValue() throws IOException {
return sorted.ordValue();
}
if (numDocsWithField == 0) {
meta.writeLong(-2); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else if (numDocsWithField == maxDoc) {
meta.writeLong(-1); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else {
long offset = data.getFilePointer();
meta.writeLong(offset); // docsWithFieldOffset
values = valuesProducer.getSorted(field);
final short jumpTableentryCount =
IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
meta.writeShort(jumpTableentryCount);
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
}
@Override
public boolean advanceExact(int target) throws IOException {
return sorted.advanceExact(target);
}
meta.writeInt(numDocsWithField);
if (values.getValueCount() <= 1) {
meta.writeByte((byte) 0); // bitsPerValue
meta.writeLong(0L); // ordsOffset
meta.writeLong(0L); // ordsLength
} else {
int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1);
meta.writeByte((byte) numberOfBitsPerOrd); // bitsPerValue
long start = data.getFilePointer();
meta.writeLong(start); // ordsOffset
DirectWriter writer = DirectWriter.getInstance(data, numDocsWithField, numberOfBitsPerOrd);
values = valuesProducer.getSorted(field);
for (int doc = values.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = values.nextDoc()) {
writer.add(values.ordValue());
}
writer.finish();
meta.writeLong(data.getFilePointer() - start); // ordsLength
}
@Override
public int docID() {
return sorted.docID();
}
@Override
public int nextDoc() throws IOException {
return sorted.nextDoc();
}
@Override
public int advance(int target) throws IOException {
return sorted.advance(target);
}
@Override
public long cost() {
return sorted.cost();
}
};
return DocValues.singleton(sortedOrds);
}
});
addTermsDict(DocValues.singleton(valuesProducer.getSorted(field)));
}
@ -669,7 +664,11 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
throws IOException {
meta.writeInt(field.number);
meta.writeByte(Lucene90DocValuesFormat.SORTED_NUMERIC);
doAddSortedNumericField(field, valuesProducer);
}
private void doAddSortedNumericField(FieldInfo field, DocValuesProducer valuesProducer)
throws IOException {
long[] stats = writeValues(field, valuesProducer);
int numDocsWithField = Math.toIntExact(stats[0]);
long numValues = stats[1];
@ -731,60 +730,65 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
}
meta.writeByte((byte) 1); // multiValued (1 = multiValued)
assert numDocsWithField != 0;
if (numDocsWithField == maxDoc) {
meta.writeLong(-1); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else {
long offset = data.getFilePointer();
meta.writeLong(offset); // docsWithFieldOffset
values = valuesProducer.getSortedSet(field);
final short jumpTableEntryCount =
IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
meta.writeShort(jumpTableEntryCount);
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
}
doAddSortedNumericField(
field,
new EmptyDocValuesProducer() {
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
SortedSetDocValues values = valuesProducer.getSortedSet(field);
return new SortedNumericDocValues() {
int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1);
meta.writeByte((byte) numberOfBitsPerOrd); // bitsPerValue
long start = data.getFilePointer();
meta.writeLong(start); // ordsOffset
DirectWriter writer = DirectWriter.getInstance(data, numOrds, numberOfBitsPerOrd);
values = valuesProducer.getSortedSet(field);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
for (long ord = values.nextOrd();
ord != SortedSetDocValues.NO_MORE_ORDS;
ord = values.nextOrd()) {
writer.add(ord);
}
}
writer.finish();
meta.writeLong(data.getFilePointer() - start); // ordsLength
long[] ords = LongsRef.EMPTY_LONGS;
int i, docValueCount;
meta.writeInt(numDocsWithField);
start = data.getFilePointer();
meta.writeLong(start); // addressesOffset
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
@Override
public long nextValue() throws IOException {
return ords[i++];
}
final DirectMonotonicWriter addressesWriter =
DirectMonotonicWriter.getInstance(
meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
long addr = 0;
addressesWriter.add(addr);
values = valuesProducer.getSortedSet(field);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
values.nextOrd();
addr++;
while (values.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) {
addr++;
}
addressesWriter.add(addr);
}
addressesWriter.finish();
meta.writeLong(data.getFilePointer() - start); // addressesLength
@Override
public int docValueCount() {
return docValueCount;
}
@Override
public boolean advanceExact(int target) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int docID() {
return values.docID();
}
@Override
public int nextDoc() throws IOException {
int doc = values.nextDoc();
if (doc != NO_MORE_DOCS) {
docValueCount = 0;
for (long ord = values.nextOrd();
ord != SortedSetDocValues.NO_MORE_ORDS;
ord = values.nextOrd()) {
ords = ArrayUtil.grow(ords, docValueCount + 1);
ords[docValueCount++] = ord;
}
i = 0;
}
return doc;
}
@Override
public int advance(int target) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public long cost() {
return values.cost();
}
};
}
});
addTermsDict(values);
}

View File

@ -52,7 +52,7 @@ import org.apache.lucene.util.packed.DirectWriter;
* by accumulating the {@link Long#bitCount(long) bit counts} of the visited longs. Advancing
* &gt;= 512 documents is performed by skipping to the start of the needed 512 document
* sub-block and iterating to the specific document within that block. The index for the
* sub-block that is skipped to is retrieved from a rank-table positioned beforethe bit set.
* sub-block that is skipped to is retrieved from a rank-table positioned before the bit set.
* The rank-table holds the origo index numbers for all 512 documents sub-blocks, represented
* as an unsigned short for each 128 blocks.
* <li>ALL: This strategy is used when a block contains exactly 65536 documents, meaning that the

View File

@ -213,15 +213,10 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
private SortedEntry readSorted(IndexInput meta) throws IOException {
SortedEntry entry = new SortedEntry();
entry.docsWithFieldOffset = meta.readLong();
entry.docsWithFieldLength = meta.readLong();
entry.jumpTableEntryCount = meta.readShort();
entry.denseRankPower = meta.readByte();
entry.numDocsWithField = meta.readInt();
entry.bitsPerValue = meta.readByte();
entry.ordsOffset = meta.readLong();
entry.ordsLength = meta.readLong();
readTermDict(meta, entry);
entry.ordsEntry = new NumericEntry();
readNumeric(meta, entry.ordsEntry);
entry.termsDictEntry = new TermsDictEntry();
readTermDict(meta, entry.termsDictEntry);
return entry;
}
@ -237,20 +232,10 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
default:
throw new CorruptIndexException("Invalid multiValued flag: " + multiValued, meta);
}
entry.docsWithFieldOffset = meta.readLong();
entry.docsWithFieldLength = meta.readLong();
entry.jumpTableEntryCount = meta.readShort();
entry.denseRankPower = meta.readByte();
entry.bitsPerValue = meta.readByte();
entry.ordsOffset = meta.readLong();
entry.ordsLength = meta.readLong();
entry.numDocsWithField = meta.readInt();
entry.addressesOffset = meta.readLong();
final int blockShift = meta.readVInt();
entry.addressesMeta =
DirectMonotonicReader.loadMeta(meta, entry.numDocsWithField + 1, blockShift);
entry.addressesLength = meta.readLong();
readTermDict(meta, entry);
entry.ordsEntry = new SortedNumericEntry();
readSortedNumeric(meta, entry.ordsEntry);
entry.termsDictEntry = new TermsDictEntry();
readTermDict(meta, entry.termsDictEntry);
return entry;
}
@ -279,6 +264,12 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
/**
 * Allocates a fresh {@link SortedNumericEntry}, fills it in from {@code meta} via the two-argument
 * overload, and hands it back to the caller.
 */
private SortedNumericEntry readSortedNumeric(IndexInput meta) throws IOException {
  final SortedNumericEntry result = new SortedNumericEntry();
  // The overload populates the entry in place; its own return value is not needed here.
  readSortedNumeric(meta, result);
  return result;
}
private SortedNumericEntry readSortedNumeric(IndexInput meta, SortedNumericEntry entry)
throws IOException {
readNumeric(meta, entry);
entry.numDocsWithField = meta.readInt();
if (entry.numDocsWithField != entry.numValues) {
@ -345,30 +336,15 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
int maxBlockLength;
}
private static class SortedEntry extends TermsDictEntry {
long docsWithFieldOffset;
long docsWithFieldLength;
short jumpTableEntryCount;
byte denseRankPower;
int numDocsWithField;
byte bitsPerValue;
long ordsOffset;
long ordsLength;
private static class SortedEntry {
NumericEntry ordsEntry;
TermsDictEntry termsDictEntry;
}
private static class SortedSetEntry extends TermsDictEntry {
private static class SortedSetEntry {
SortedEntry singleValueEntry;
long docsWithFieldOffset;
long docsWithFieldLength;
short jumpTableEntryCount;
byte denseRankPower;
int numDocsWithField;
byte bitsPerValue;
long ordsOffset;
long ordsLength;
DirectMonotonicReader.Meta addressesMeta;
long addressesOffset;
long addressesLength;
SortedNumericEntry ordsEntry;
TermsDictEntry termsDictEntry;
}
private static class SortedNumericEntry extends NumericEntry {
@ -789,107 +765,39 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
}
private SortedDocValues getSorted(SortedEntry entry) throws IOException {
if (entry.docsWithFieldOffset == -2) {
return DocValues.emptySorted();
}
final NumericDocValues ords = getNumeric(entry.ordsEntry);
return new BaseSortedDocValues(entry, data) {
final LongValues ords;
if (entry.bitsPerValue == 0) {
ords =
new LongValues() {
@Override
public long get(long index) {
return 0L;
}
};
} else {
final RandomAccessInput slice = data.randomAccessSlice(entry.ordsOffset, entry.ordsLength);
ords = DirectReader.getInstance(slice, entry.bitsPerValue);
}
@Override
public int ordValue() throws IOException {
return (int) ords.longValue();
}
if (entry.docsWithFieldOffset == -1) {
// dense
return new BaseSortedDocValues(entry, data) {
@Override
public boolean advanceExact(int target) throws IOException {
return ords.advanceExact(target);
}
int doc = -1;
@Override
public int docID() {
return ords.docID();
}
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int nextDoc() throws IOException {
return ords.nextDoc();
}
@Override
public int docID() {
return doc;
}
@Override
public int advance(int target) throws IOException {
return ords.advance(target);
}
@Override
public long cost() {
return maxDoc;
}
@Override
public int advance(int target) throws IOException {
if (target >= maxDoc) {
return doc = NO_MORE_DOCS;
}
return doc = target;
}
@Override
public boolean advanceExact(int target) {
doc = target;
return true;
}
@Override
public int ordValue() {
return (int) ords.get(doc);
}
};
} else {
// sparse
final IndexedDISI disi =
new IndexedDISI(
data,
entry.docsWithFieldOffset,
entry.docsWithFieldLength,
entry.jumpTableEntryCount,
entry.denseRankPower,
entry.numDocsWithField);
return new BaseSortedDocValues(entry, data) {
@Override
public int nextDoc() throws IOException {
return disi.nextDoc();
}
@Override
public int docID() {
return disi.docID();
}
@Override
public long cost() {
return disi.cost();
}
@Override
public int advance(int target) throws IOException {
return disi.advance(target);
}
@Override
public boolean advanceExact(int target) throws IOException {
return disi.advanceExact(target);
}
@Override
public int ordValue() {
return (int) ords.get(disi.index());
}
};
}
@Override
public long cost() {
return ords.cost();
}
};
}
private abstract static class BaseSortedDocValues extends SortedDocValues {
@ -906,7 +814,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override
public int getValueCount() {
return Math.toIntExact(entry.termsDictSize);
return Math.toIntExact(entry.termsDictEntry.termsDictSize);
}
@Override
@ -930,7 +838,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override
public TermsEnum termsEnum() throws IOException {
return new TermsDict(entry, data);
return new TermsDict(entry.termsDictEntry, data);
}
}
@ -948,7 +856,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override
public long getValueCount() {
return entry.termsDictSize;
return entry.termsDictEntry.termsDictSize;
}
@Override
@ -972,7 +880,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override
public TermsEnum termsEnum() throws IOException {
return new TermsDict(entry, data);
return new TermsDict(entry.termsDictEntry, data);
}
}
@ -1204,6 +1112,10 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
/**
 * Looks up the entry registered under the field's name in {@code sortedNumerics} and delegates to
 * the entry-based overload to build the doc values.
 */
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
  return getSortedNumeric(sortedNumerics.get(field.name));
}
private SortedNumericDocValues getSortedNumeric(SortedNumericEntry entry) throws IOException {
if (entry.numValues == entry.numDocsWithField) {
return DocValues.singleton(getNumeric(entry));
}
@ -1344,124 +1256,57 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
return DocValues.singleton(getSorted(entry.singleValueEntry));
}
final RandomAccessInput slice = data.randomAccessSlice(entry.ordsOffset, entry.ordsLength);
final LongValues ords = DirectReader.getInstance(slice, entry.bitsPerValue);
final SortedNumericDocValues ords = getSortedNumeric(entry.ordsEntry);
return new BaseSortedSetDocValues(entry, data) {
final RandomAccessInput addressesInput =
data.randomAccessSlice(entry.addressesOffset, entry.addressesLength);
final LongValues addresses =
DirectMonotonicReader.getInstance(entry.addressesMeta, addressesInput);
int i = 0;
int count = 0;
if (entry.docsWithFieldOffset == -1) {
// dense
return new BaseSortedSetDocValues(entry, data) {
int doc = -1;
long start;
long end;
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
@Override
public long nextOrd() throws IOException {
if (i++ == count) {
return NO_MORE_ORDS;
}
return ords.nextValue();
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
return maxDoc;
}
@Override
public int advance(int target) throws IOException {
if (target >= maxDoc) {
return doc = NO_MORE_DOCS;
}
start = addresses.get(target);
end = addresses.get(target + 1L);
return doc = target;
}
@Override
public boolean advanceExact(int target) throws IOException {
start = addresses.get(target);
end = addresses.get(target + 1L);
doc = target;
@Override
public boolean advanceExact(int target) throws IOException {
if (ords.advanceExact(target)) {
count = ords.docValueCount();
i = 0;
return true;
} else {
return false;
}
}
@Override
public long nextOrd() throws IOException {
if (start == end) {
return NO_MORE_ORDS;
}
return ords.get(start++);
}
};
} else {
// sparse
final IndexedDISI disi =
new IndexedDISI(
data,
entry.docsWithFieldOffset,
entry.docsWithFieldLength,
entry.jumpTableEntryCount,
entry.denseRankPower,
entry.numDocsWithField);
return new BaseSortedSetDocValues(entry, data) {
@Override
public int docID() {
return ords.docID();
}
boolean set;
long start;
long end = 0;
@Override
public int nextDoc() throws IOException {
int doc = ords.nextDoc();
count = ords.docValueCount();
i = 0;
return doc;
}
@Override
public int nextDoc() throws IOException {
set = false;
return disi.nextDoc();
}
@Override
public int advance(int target) throws IOException {
int doc = ords.advance(target);
count = ords.docValueCount();
i = 0;
return doc;
}
@Override
public int docID() {
return disi.docID();
}
@Override
public long cost() {
return disi.cost();
}
@Override
public int advance(int target) throws IOException {
set = false;
return disi.advance(target);
}
@Override
public boolean advanceExact(int target) throws IOException {
set = false;
return disi.advanceExact(target);
}
@Override
public long nextOrd() throws IOException {
if (set == false) {
final int index = disi.index();
final long start = addresses.get(index);
this.start = start + 1;
end = addresses.get(index + 1L);
set = true;
return ords.get(start);
} else if (start == end) {
return NO_MORE_ORDS;
} else {
return ords.get(start++);
}
}
};
}
@Override
public long cost() {
return ords.cost();
}
};
}
@Override