LUCENE-9613: Encode ordinals like numerics. (#186)

This helps simplify the code, and also brings some optimizations to ordinals, such as
better compression for long runs of equal values or for fields that are used in
index sorts.
This commit is contained in:
Adrien Grand 2021-06-23 15:37:50 +02:00 committed by GitHub
parent 495bf6730f
commit 1d5d458960
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 197 additions and 348 deletions

View File

@ -34,6 +34,7 @@ import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.EmptyDocValuesProducer; import org.apache.lucene.index.EmptyDocValuesProducer;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedNumericDocValues;
@ -49,6 +50,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.MathUtil; import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.compress.LZ4; import org.apache.lucene.util.compress.LZ4;
@ -466,54 +468,47 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
private void doAddSortedField(FieldInfo field, DocValuesProducer valuesProducer) private void doAddSortedField(FieldInfo field, DocValuesProducer valuesProducer)
throws IOException { throws IOException {
SortedDocValues values = valuesProducer.getSorted(field); writeValues(
int numDocsWithField = 0; field,
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { new EmptyDocValuesProducer() {
numDocsWithField++; @Override
} public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
SortedDocValues sorted = valuesProducer.getSorted(field);
NumericDocValues sortedOrds =
new NumericDocValues() {
@Override
public long longValue() throws IOException {
return sorted.ordValue();
}
if (numDocsWithField == 0) { @Override
meta.writeLong(-2); // docsWithFieldOffset public boolean advanceExact(int target) throws IOException {
meta.writeLong(0L); // docsWithFieldLength return sorted.advanceExact(target);
meta.writeShort((short) -1); // jumpTableEntryCount }
meta.writeByte((byte) -1); // denseRankPower
} else if (numDocsWithField == maxDoc) {
meta.writeLong(-1); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else {
long offset = data.getFilePointer();
meta.writeLong(offset); // docsWithFieldOffset
values = valuesProducer.getSorted(field);
final short jumpTableentryCount =
IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
meta.writeShort(jumpTableentryCount);
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
}
meta.writeInt(numDocsWithField); @Override
if (values.getValueCount() <= 1) { public int docID() {
meta.writeByte((byte) 0); // bitsPerValue return sorted.docID();
meta.writeLong(0L); // ordsOffset }
meta.writeLong(0L); // ordsLength
} else {
int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1);
meta.writeByte((byte) numberOfBitsPerOrd); // bitsPerValue
long start = data.getFilePointer();
meta.writeLong(start); // ordsOffset
DirectWriter writer = DirectWriter.getInstance(data, numDocsWithField, numberOfBitsPerOrd);
values = valuesProducer.getSorted(field);
for (int doc = values.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = values.nextDoc()) {
writer.add(values.ordValue());
}
writer.finish();
meta.writeLong(data.getFilePointer() - start); // ordsLength
}
@Override
public int nextDoc() throws IOException {
return sorted.nextDoc();
}
@Override
public int advance(int target) throws IOException {
return sorted.advance(target);
}
@Override
public long cost() {
return sorted.cost();
}
};
return DocValues.singleton(sortedOrds);
}
});
addTermsDict(DocValues.singleton(valuesProducer.getSorted(field))); addTermsDict(DocValues.singleton(valuesProducer.getSorted(field)));
} }
@ -669,7 +664,11 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
throws IOException { throws IOException {
meta.writeInt(field.number); meta.writeInt(field.number);
meta.writeByte(Lucene90DocValuesFormat.SORTED_NUMERIC); meta.writeByte(Lucene90DocValuesFormat.SORTED_NUMERIC);
doAddSortedNumericField(field, valuesProducer);
}
private void doAddSortedNumericField(FieldInfo field, DocValuesProducer valuesProducer)
throws IOException {
long[] stats = writeValues(field, valuesProducer); long[] stats = writeValues(field, valuesProducer);
int numDocsWithField = Math.toIntExact(stats[0]); int numDocsWithField = Math.toIntExact(stats[0]);
long numValues = stats[1]; long numValues = stats[1];
@ -731,60 +730,65 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
} }
meta.writeByte((byte) 1); // multiValued (1 = multiValued) meta.writeByte((byte) 1); // multiValued (1 = multiValued)
assert numDocsWithField != 0; doAddSortedNumericField(
if (numDocsWithField == maxDoc) { field,
meta.writeLong(-1); // docsWithFieldOffset new EmptyDocValuesProducer() {
meta.writeLong(0L); // docsWithFieldLength @Override
meta.writeShort((short) -1); // jumpTableEntryCount public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
meta.writeByte((byte) -1); // denseRankPower SortedSetDocValues values = valuesProducer.getSortedSet(field);
} else { return new SortedNumericDocValues() {
long offset = data.getFilePointer();
meta.writeLong(offset); // docsWithFieldOffset
values = valuesProducer.getSortedSet(field);
final short jumpTableEntryCount =
IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
meta.writeShort(jumpTableEntryCount);
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
}
int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1); long[] ords = LongsRef.EMPTY_LONGS;
meta.writeByte((byte) numberOfBitsPerOrd); // bitsPerValue int i, docValueCount;
long start = data.getFilePointer();
meta.writeLong(start); // ordsOffset
DirectWriter writer = DirectWriter.getInstance(data, numOrds, numberOfBitsPerOrd);
values = valuesProducer.getSortedSet(field);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
for (long ord = values.nextOrd();
ord != SortedSetDocValues.NO_MORE_ORDS;
ord = values.nextOrd()) {
writer.add(ord);
}
}
writer.finish();
meta.writeLong(data.getFilePointer() - start); // ordsLength
meta.writeInt(numDocsWithField); @Override
start = data.getFilePointer(); public long nextValue() throws IOException {
meta.writeLong(start); // addressesOffset return ords[i++];
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT); }
final DirectMonotonicWriter addressesWriter = @Override
DirectMonotonicWriter.getInstance( public int docValueCount() {
meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT); return docValueCount;
long addr = 0; }
addressesWriter.add(addr);
values = valuesProducer.getSortedSet(field); @Override
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { public boolean advanceExact(int target) throws IOException {
values.nextOrd(); throw new UnsupportedOperationException();
addr++; }
while (values.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) {
addr++; @Override
} public int docID() {
addressesWriter.add(addr); return values.docID();
} }
addressesWriter.finish();
meta.writeLong(data.getFilePointer() - start); // addressesLength @Override
public int nextDoc() throws IOException {
int doc = values.nextDoc();
if (doc != NO_MORE_DOCS) {
docValueCount = 0;
for (long ord = values.nextOrd();
ord != SortedSetDocValues.NO_MORE_ORDS;
ord = values.nextOrd()) {
ords = ArrayUtil.grow(ords, docValueCount + 1);
ords[docValueCount++] = ord;
}
i = 0;
}
return doc;
}
@Override
public int advance(int target) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public long cost() {
return values.cost();
}
};
}
});
addTermsDict(values); addTermsDict(values);
} }

View File

@ -52,7 +52,7 @@ import org.apache.lucene.util.packed.DirectWriter;
* by accumulating the {@link Long#bitCount(long) bit counts} of the visited longs. Advancing * by accumulating the {@link Long#bitCount(long) bit counts} of the visited longs. Advancing
* &gt;= 512 documents is performed by skipping to the start of the needed 512 document * &gt;= 512 documents is performed by skipping to the start of the needed 512 document
* sub-block and iterating to the specific document within that block. The index for the * sub-block and iterating to the specific document within that block. The index for the
* sub-block that is skipped to is retrieved from a rank-table positioned beforethe bit set. * sub-block that is skipped to is retrieved from a rank-table positioned before the bit set.
* The rank-table holds the origo index numbers for all 512 documents sub-blocks, represented * The rank-table holds the origo index numbers for all 512 documents sub-blocks, represented
* as an unsigned short for each 128 blocks. * as an unsigned short for each 128 blocks.
* <li>ALL: This strategy is used when a block contains exactly 65536 documents, meaning that the * <li>ALL: This strategy is used when a block contains exactly 65536 documents, meaning that the

View File

@ -213,15 +213,10 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
private SortedEntry readSorted(IndexInput meta) throws IOException { private SortedEntry readSorted(IndexInput meta) throws IOException {
SortedEntry entry = new SortedEntry(); SortedEntry entry = new SortedEntry();
entry.docsWithFieldOffset = meta.readLong(); entry.ordsEntry = new NumericEntry();
entry.docsWithFieldLength = meta.readLong(); readNumeric(meta, entry.ordsEntry);
entry.jumpTableEntryCount = meta.readShort(); entry.termsDictEntry = new TermsDictEntry();
entry.denseRankPower = meta.readByte(); readTermDict(meta, entry.termsDictEntry);
entry.numDocsWithField = meta.readInt();
entry.bitsPerValue = meta.readByte();
entry.ordsOffset = meta.readLong();
entry.ordsLength = meta.readLong();
readTermDict(meta, entry);
return entry; return entry;
} }
@ -237,20 +232,10 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
default: default:
throw new CorruptIndexException("Invalid multiValued flag: " + multiValued, meta); throw new CorruptIndexException("Invalid multiValued flag: " + multiValued, meta);
} }
entry.docsWithFieldOffset = meta.readLong(); entry.ordsEntry = new SortedNumericEntry();
entry.docsWithFieldLength = meta.readLong(); readSortedNumeric(meta, entry.ordsEntry);
entry.jumpTableEntryCount = meta.readShort(); entry.termsDictEntry = new TermsDictEntry();
entry.denseRankPower = meta.readByte(); readTermDict(meta, entry.termsDictEntry);
entry.bitsPerValue = meta.readByte();
entry.ordsOffset = meta.readLong();
entry.ordsLength = meta.readLong();
entry.numDocsWithField = meta.readInt();
entry.addressesOffset = meta.readLong();
final int blockShift = meta.readVInt();
entry.addressesMeta =
DirectMonotonicReader.loadMeta(meta, entry.numDocsWithField + 1, blockShift);
entry.addressesLength = meta.readLong();
readTermDict(meta, entry);
return entry; return entry;
} }
@ -279,6 +264,12 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
private SortedNumericEntry readSortedNumeric(IndexInput meta) throws IOException { private SortedNumericEntry readSortedNumeric(IndexInput meta) throws IOException {
SortedNumericEntry entry = new SortedNumericEntry(); SortedNumericEntry entry = new SortedNumericEntry();
readSortedNumeric(meta, entry);
return entry;
}
private SortedNumericEntry readSortedNumeric(IndexInput meta, SortedNumericEntry entry)
throws IOException {
readNumeric(meta, entry); readNumeric(meta, entry);
entry.numDocsWithField = meta.readInt(); entry.numDocsWithField = meta.readInt();
if (entry.numDocsWithField != entry.numValues) { if (entry.numDocsWithField != entry.numValues) {
@ -345,30 +336,15 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
int maxBlockLength; int maxBlockLength;
} }
private static class SortedEntry extends TermsDictEntry { private static class SortedEntry {
long docsWithFieldOffset; NumericEntry ordsEntry;
long docsWithFieldLength; TermsDictEntry termsDictEntry;
short jumpTableEntryCount;
byte denseRankPower;
int numDocsWithField;
byte bitsPerValue;
long ordsOffset;
long ordsLength;
} }
private static class SortedSetEntry extends TermsDictEntry { private static class SortedSetEntry {
SortedEntry singleValueEntry; SortedEntry singleValueEntry;
long docsWithFieldOffset; SortedNumericEntry ordsEntry;
long docsWithFieldLength; TermsDictEntry termsDictEntry;
short jumpTableEntryCount;
byte denseRankPower;
int numDocsWithField;
byte bitsPerValue;
long ordsOffset;
long ordsLength;
DirectMonotonicReader.Meta addressesMeta;
long addressesOffset;
long addressesLength;
} }
private static class SortedNumericEntry extends NumericEntry { private static class SortedNumericEntry extends NumericEntry {
@ -789,107 +765,39 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
} }
private SortedDocValues getSorted(SortedEntry entry) throws IOException { private SortedDocValues getSorted(SortedEntry entry) throws IOException {
if (entry.docsWithFieldOffset == -2) { final NumericDocValues ords = getNumeric(entry.ordsEntry);
return DocValues.emptySorted(); return new BaseSortedDocValues(entry, data) {
}
final LongValues ords; @Override
if (entry.bitsPerValue == 0) { public int ordValue() throws IOException {
ords = return (int) ords.longValue();
new LongValues() { }
@Override
public long get(long index) {
return 0L;
}
};
} else {
final RandomAccessInput slice = data.randomAccessSlice(entry.ordsOffset, entry.ordsLength);
ords = DirectReader.getInstance(slice, entry.bitsPerValue);
}
if (entry.docsWithFieldOffset == -1) { @Override
// dense public boolean advanceExact(int target) throws IOException {
return new BaseSortedDocValues(entry, data) { return ords.advanceExact(target);
}
int doc = -1; @Override
public int docID() {
return ords.docID();
}
@Override @Override
public int nextDoc() throws IOException { public int nextDoc() throws IOException {
return advance(doc + 1); return ords.nextDoc();
} }
@Override @Override
public int docID() { public int advance(int target) throws IOException {
return doc; return ords.advance(target);
} }
@Override @Override
public long cost() { public long cost() {
return maxDoc; return ords.cost();
} }
};
@Override
public int advance(int target) throws IOException {
if (target >= maxDoc) {
return doc = NO_MORE_DOCS;
}
return doc = target;
}
@Override
public boolean advanceExact(int target) {
doc = target;
return true;
}
@Override
public int ordValue() {
return (int) ords.get(doc);
}
};
} else {
// sparse
final IndexedDISI disi =
new IndexedDISI(
data,
entry.docsWithFieldOffset,
entry.docsWithFieldLength,
entry.jumpTableEntryCount,
entry.denseRankPower,
entry.numDocsWithField);
return new BaseSortedDocValues(entry, data) {
@Override
public int nextDoc() throws IOException {
return disi.nextDoc();
}
@Override
public int docID() {
return disi.docID();
}
@Override
public long cost() {
return disi.cost();
}
@Override
public int advance(int target) throws IOException {
return disi.advance(target);
}
@Override
public boolean advanceExact(int target) throws IOException {
return disi.advanceExact(target);
}
@Override
public int ordValue() {
return (int) ords.get(disi.index());
}
};
}
} }
private abstract static class BaseSortedDocValues extends SortedDocValues { private abstract static class BaseSortedDocValues extends SortedDocValues {
@ -906,7 +814,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override @Override
public int getValueCount() { public int getValueCount() {
return Math.toIntExact(entry.termsDictSize); return Math.toIntExact(entry.termsDictEntry.termsDictSize);
} }
@Override @Override
@ -930,7 +838,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override @Override
public TermsEnum termsEnum() throws IOException { public TermsEnum termsEnum() throws IOException {
return new TermsDict(entry, data); return new TermsDict(entry.termsDictEntry, data);
} }
} }
@ -948,7 +856,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override @Override
public long getValueCount() { public long getValueCount() {
return entry.termsDictSize; return entry.termsDictEntry.termsDictSize;
} }
@Override @Override
@ -972,7 +880,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override @Override
public TermsEnum termsEnum() throws IOException { public TermsEnum termsEnum() throws IOException {
return new TermsDict(entry, data); return new TermsDict(entry.termsDictEntry, data);
} }
} }
@ -1204,6 +1112,10 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override @Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
SortedNumericEntry entry = sortedNumerics.get(field.name); SortedNumericEntry entry = sortedNumerics.get(field.name);
return getSortedNumeric(entry);
}
private SortedNumericDocValues getSortedNumeric(SortedNumericEntry entry) throws IOException {
if (entry.numValues == entry.numDocsWithField) { if (entry.numValues == entry.numDocsWithField) {
return DocValues.singleton(getNumeric(entry)); return DocValues.singleton(getNumeric(entry));
} }
@ -1344,124 +1256,57 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
return DocValues.singleton(getSorted(entry.singleValueEntry)); return DocValues.singleton(getSorted(entry.singleValueEntry));
} }
final RandomAccessInput slice = data.randomAccessSlice(entry.ordsOffset, entry.ordsLength); final SortedNumericDocValues ords = getSortedNumeric(entry.ordsEntry);
final LongValues ords = DirectReader.getInstance(slice, entry.bitsPerValue); return new BaseSortedSetDocValues(entry, data) {
final RandomAccessInput addressesInput = int i = 0;
data.randomAccessSlice(entry.addressesOffset, entry.addressesLength); int count = 0;
final LongValues addresses =
DirectMonotonicReader.getInstance(entry.addressesMeta, addressesInput);
if (entry.docsWithFieldOffset == -1) { @Override
// dense public long nextOrd() throws IOException {
return new BaseSortedSetDocValues(entry, data) { if (i++ == count) {
return NO_MORE_ORDS;
int doc = -1;
long start;
long end;
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
} }
return ords.nextValue();
}
@Override @Override
public int docID() { public boolean advanceExact(int target) throws IOException {
return doc; if (ords.advanceExact(target)) {
} count = ords.docValueCount();
i = 0;
@Override
public long cost() {
return maxDoc;
}
@Override
public int advance(int target) throws IOException {
if (target >= maxDoc) {
return doc = NO_MORE_DOCS;
}
start = addresses.get(target);
end = addresses.get(target + 1L);
return doc = target;
}
@Override
public boolean advanceExact(int target) throws IOException {
start = addresses.get(target);
end = addresses.get(target + 1L);
doc = target;
return true; return true;
} else {
return false;
} }
}
@Override @Override
public long nextOrd() throws IOException { public int docID() {
if (start == end) { return ords.docID();
return NO_MORE_ORDS; }
}
return ords.get(start++);
}
};
} else {
// sparse
final IndexedDISI disi =
new IndexedDISI(
data,
entry.docsWithFieldOffset,
entry.docsWithFieldLength,
entry.jumpTableEntryCount,
entry.denseRankPower,
entry.numDocsWithField);
return new BaseSortedSetDocValues(entry, data) {
boolean set; @Override
long start; public int nextDoc() throws IOException {
long end = 0; int doc = ords.nextDoc();
count = ords.docValueCount();
i = 0;
return doc;
}
@Override @Override
public int nextDoc() throws IOException { public int advance(int target) throws IOException {
set = false; int doc = ords.advance(target);
return disi.nextDoc(); count = ords.docValueCount();
} i = 0;
return doc;
}
@Override @Override
public int docID() { public long cost() {
return disi.docID(); return ords.cost();
} }
};
@Override
public long cost() {
return disi.cost();
}
@Override
public int advance(int target) throws IOException {
set = false;
return disi.advance(target);
}
@Override
public boolean advanceExact(int target) throws IOException {
set = false;
return disi.advanceExact(target);
}
@Override
public long nextOrd() throws IOException {
if (set == false) {
final int index = disi.index();
final long start = addresses.get(index);
this.start = start + 1;
end = addresses.get(index + 1L);
set = true;
return ords.get(start);
} else if (start == end) {
return NO_MORE_ORDS;
} else {
return ords.get(start++);
}
}
};
}
} }
@Override @Override