diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1981c523426..54329abe7cd 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -29,6 +29,9 @@ Bug Fixes Improvements +* LUCENE-7489: Better storage of sparse doc-values fields with the default + codec. (Adrien Grand) + Optimizations * LUCENE-7416: BooleanQuery optimizes queries that have queries that occur both diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java index 25f4b5e52ae..8a44c312662 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java @@ -928,7 +928,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close } @Override - public TermsEnum termsEnum() { + public TermsEnum termsEnum() throws IOException { if (binary instanceof CompressedBinaryDocValues) { return ((CompressedBinaryDocValues)binary).getTermsEnum(); } else { @@ -1233,7 +1233,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close } @Override - public TermsEnum termsEnum() { + public TermsEnum termsEnum() throws IOException { if (binary instanceof CompressedBinaryDocValues) { return ((CompressedBinaryDocValues)binary).getTermsEnum(); } else { @@ -1292,7 +1292,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close } @Override - public TermsEnum termsEnum() { + public TermsEnum termsEnum() throws IOException { if (binary instanceof CompressedBinaryDocValues) { return ((CompressedBinaryDocValues) binary).getTermsEnum(); } else { @@ -1490,12 +1490,8 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close } } - TermsEnum getTermsEnum() { - try { - return getTermsEnum(data.clone()); - } catch (IOException e) { - throw new RuntimeException(e); - } + TermsEnum getTermsEnum() throws IOException { + return getTermsEnum(data.clone()); } private CompressedBinaryTermsEnum getTermsEnum(IndexInput input) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java index a07cc3d668f..e04d5b9962f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java @@ -361,7 +361,7 @@ public abstract class DocValuesConsumer implements Closeable { addSortedNumericField(mergeFieldInfo, new EmptyDocValuesProducer() { @Override - public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) { + public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) throws IOException { if (fieldInfo != mergeFieldInfo) { throw new IllegalArgumentException("wrong FieldInfo"); } @@ -375,11 +375,7 @@ public abstract class DocValuesConsumer implements Closeable { if (docValuesProducer != null) { FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) { - try { - values = docValuesProducer.getSortedNumeric(readerFieldInfo); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } + values = docValuesProducer.getSortedNumeric(readerFieldInfo); } } if (values == null) { @@ -391,12 +387,7 @@ public abstract class 
DocValuesConsumer implements Closeable { final long finalCost = cost; - final DocIDMerger docIDMerger; - try { - docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } + final DocIDMerger docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); return new SortedNumericDocValues() { @@ -521,7 +512,7 @@ public abstract class DocValuesConsumer implements Closeable { addSortedField(fieldInfo, new EmptyDocValuesProducer() { @Override - public SortedDocValues getSorted(FieldInfo fieldInfoIn) { + public SortedDocValues getSorted(FieldInfo fieldInfoIn) throws IOException { if (fieldInfoIn != fieldInfo) { throw new IllegalArgumentException("wrong FieldInfo"); } @@ -536,11 +527,7 @@ public abstract class DocValuesConsumer implements Closeable { if (docValuesProducer != null) { FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name); if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { - try { - values = docValuesProducer.getSorted(readerFieldInfo); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } + values = docValuesProducer.getSorted(readerFieldInfo); } } if (values == null) { @@ -553,12 +540,7 @@ public abstract class DocValuesConsumer implements Closeable { final long finalCost = cost; - final DocIDMerger docIDMerger; - try { - docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } + final DocIDMerger docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); return new SortedDocValues() { private int docID = -1; @@ -693,7 +675,7 @@ public abstract class DocValuesConsumer implements Closeable { addSortedSetField(mergeFieldInfo, new EmptyDocValuesProducer() { @Override - public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) { + public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException { if (fieldInfo != mergeFieldInfo) { throw new IllegalArgumentException("wrong FieldInfo"); } @@ -709,11 +691,7 @@ public abstract class DocValuesConsumer implements Closeable { if (docValuesProducer != null) { FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { - try { - values = docValuesProducer.getSortedSet(readerFieldInfo); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } + values = docValuesProducer.getSortedSet(readerFieldInfo); } } if (values == null) { @@ -723,12 +701,7 @@ public abstract class DocValuesConsumer implements Closeable { subs.add(new SortedSetDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i))); } - final DocIDMerger docIDMerger; - try { - docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } + final DocIDMerger docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); final long finalCost = cost; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java index 564db7cbdd1..e1b66e13eb0 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java +++ 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java @@ -17,56 +17,47 @@ package org.apache.lucene.codecs.lucene70; +import static org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; + import java.io.Closeable; // javadocs import java.io.IOException; import java.util.Arrays; -import java.util.Collections; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.Map; import java.util.Set; -import java.util.SortedSet; -import java.util.TreeSet; -import java.util.stream.StreamSupport; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.codecs.LegacyDocValuesIterables; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.EmptyDocValuesProducer; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.SortedSetSelector; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.LongsRef; import org.apache.lucene.util.MathUtil; -import org.apache.lucene.util.PagedBytes.PagedBytesDataInput; -import org.apache.lucene.util.PagedBytes; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.packed.DirectMonotonicWriter; import org.apache.lucene.util.packed.DirectWriter; -import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; -import org.apache.lucene.util.packed.PackedInts; - -import static org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat.*; /** writer for {@link Lucene70DocValuesFormat} */ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Closeable { - enum NumberType { - /** Dense ordinals */ - ORDINAL, - /** Random long values */ - VALUE; - } - IndexOutput data, meta; final int maxDoc; - + /** expert: Creates a new writer */ public Lucene70DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { boolean success = false; @@ -85,700 +76,13 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close } } } - - @Override - public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { - addNumericField(field, LegacyDocValuesIterables.numericIterable(field, valuesProducer, maxDoc), NumberType.VALUE); - } - - void addNumericField(FieldInfo field, Iterable values, NumberType numberType) throws IOException { - long count = 0; - long minValue = Long.MAX_VALUE; - long maxValue = Long.MIN_VALUE; - long gcd = 0; - long missingCount = 0; - long zeroCount = 0; - // TODO: more efficient? 
- HashSet uniqueValues = null; - long missingOrdCount = 0; - if (numberType == NumberType.VALUE) { - uniqueValues = new HashSet<>(); - - for (Number nv : values) { - final long v; - if (nv == null) { - v = 0; - missingCount++; - zeroCount++; - } else { - v = nv.longValue(); - if (v == 0) { - zeroCount++; - } - } - - if (gcd != 1) { - if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { - // in that case v - minValue might overflow and make the GCD computation return - // wrong results. Since these extreme values are unlikely, we just discard - // GCD computation for them - gcd = 1; - } else if (count != 0) { // minValue needs to be set first - gcd = MathUtil.gcd(gcd, v - minValue); - } - } - - minValue = Math.min(minValue, v); - maxValue = Math.max(maxValue, v); - - if (uniqueValues != null) { - if (uniqueValues.add(v)) { - if (uniqueValues.size() > 256) { - uniqueValues = null; - } - } - } - - ++count; - } - } else { - for (Number nv : values) { - long v = nv.longValue(); - if (v == -1L) { - missingOrdCount++; - } - minValue = Math.min(minValue, v); - maxValue = Math.max(maxValue, v); - ++count; - } - } - - final long delta = maxValue - minValue; - final int deltaBitsRequired = DirectWriter.unsignedBitsRequired(delta); - final int tableBitsRequired = uniqueValues == null - ? Integer.MAX_VALUE - : DirectWriter.bitsRequired(uniqueValues.size() - 1); - - final boolean sparse; // 10% of docs or less have a value - switch (numberType) { - case VALUE: - sparse = (double) missingCount / count >= 0.90; - break; - case ORDINAL: - sparse = (double) missingOrdCount / count >= 0.90; - break; - default: - throw new AssertionError(); - } - - final int format; - if (uniqueValues != null - && count <= Integer.MAX_VALUE - && (uniqueValues.size() == 1 - || (uniqueValues.size() == 2 && missingCount > 0 && zeroCount == missingCount))) { - // either one unique value C or two unique values: "missing" and C - format = CONST_COMPRESSED; - } else if (sparse && count >= 1024) { - // require at least 1024 docs to avoid flipping back and forth when doing NRT search - format = SPARSE_COMPRESSED; - } else if (uniqueValues != null && tableBitsRequired < deltaBitsRequired) { - format = TABLE_COMPRESSED; - } else if (gcd != 0 && gcd != 1) { - final long gcdDelta = (maxValue - minValue) / gcd; - final long gcdBitsRequired = DirectWriter.unsignedBitsRequired(gcdDelta); - format = gcdBitsRequired < deltaBitsRequired ? GCD_COMPRESSED : DELTA_COMPRESSED; - } else { - format = DELTA_COMPRESSED; - } - meta.writeVInt(field.number); - meta.writeByte(Lucene70DocValuesFormat.NUMERIC); - meta.writeVInt(format); - if (format == SPARSE_COMPRESSED) { - meta.writeLong(data.getFilePointer()); - final long numDocsWithValue; - switch (numberType) { - case VALUE: - numDocsWithValue = count - missingCount; - break; - case ORDINAL: - numDocsWithValue = count - missingOrdCount; - break; - default: - throw new AssertionError(); - } - final long maxDoc = writeSparseMissingBitset(values, numberType, numDocsWithValue); - assert maxDoc == count; - } else if (missingCount == 0) { - meta.writeLong(ALL_LIVE); - } else if (missingCount == count) { - meta.writeLong(ALL_MISSING); - } else { - meta.writeLong(data.getFilePointer()); - writeMissingBitset(values); - } - meta.writeLong(data.getFilePointer()); - meta.writeVLong(count); - - switch (format) { - case CONST_COMPRESSED: - // write the constant (nonzero value in the n=2 case, singleton value otherwise) - meta.writeLong(minValue < 0 ? 
Collections.min(uniqueValues) : Collections.max(uniqueValues)); - break; - case GCD_COMPRESSED: - meta.writeLong(minValue); - meta.writeLong(gcd); - final long maxDelta = (maxValue - minValue) / gcd; - final int bits = DirectWriter.unsignedBitsRequired(maxDelta); - meta.writeVInt(bits); - final DirectWriter quotientWriter = DirectWriter.getInstance(data, count, bits); - for (Number nv : values) { - long value = nv == null ? 0 : nv.longValue(); - quotientWriter.add((value - minValue) / gcd); - } - quotientWriter.finish(); - break; - case DELTA_COMPRESSED: - final long minDelta = delta < 0 ? 0 : minValue; - meta.writeLong(minDelta); - meta.writeVInt(deltaBitsRequired); - final DirectWriter writer = DirectWriter.getInstance(data, count, deltaBitsRequired); - for (Number nv : values) { - long v = nv == null ? 0 : nv.longValue(); - writer.add(v - minDelta); - } - writer.finish(); - break; - case TABLE_COMPRESSED: - final Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]); - Arrays.sort(decode); - final HashMap encode = new HashMap<>(); - meta.writeVInt(decode.length); - for (int i = 0; i < decode.length; i++) { - meta.writeLong(decode[i]); - encode.put(decode[i], i); - } - meta.writeVInt(tableBitsRequired); - final DirectWriter ordsWriter = DirectWriter.getInstance(data, count, tableBitsRequired); - for (Number nv : values) { - ordsWriter.add(encode.get(nv == null ? 0 : nv.longValue())); - } - ordsWriter.finish(); - break; - case SPARSE_COMPRESSED: - final Iterable filteredMissingValues; - switch (numberType) { - case VALUE: - meta.writeByte((byte) 0); - filteredMissingValues = new Iterable() { - @Override - public Iterator iterator() { - return StreamSupport - .stream(values.spliterator(), false) - .filter(value -> value != null) - .iterator(); - } - }; - break; - case ORDINAL: - meta.writeByte((byte) 1); - filteredMissingValues = new Iterable() { - @Override - public Iterator iterator() { - return StreamSupport - .stream(values.spliterator(), false) - .filter(value -> value.longValue() != -1L) - .iterator(); - } - }; - break; - default: - throw new AssertionError(); - } - // Write non-missing values as a numeric field - addNumericField(field, filteredMissingValues, numberType); - break; - default: - throw new AssertionError(); - } - meta.writeLong(data.getFilePointer()); - } - - // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on, - // but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode) - void writeMissingBitset(Iterable values) throws IOException { - long bits = 0; - int count = 0; - for (Object v : values) { - if (count == 64) { - data.writeLong(bits); - count = 0; - bits = 0; - } - if (v != null) { - bits |= 1L << count; - } - count++; - } - if (count > 0) { - data.writeLong(bits); - } - } - - long writeSparseMissingBitset(Iterable values, NumberType numberType, long numDocsWithValue) throws IOException { - meta.writeVLong(numDocsWithValue); - - // Write doc IDs that have a value - meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT); - final DirectMonotonicWriter docIdsWriter = DirectMonotonicWriter.getInstance(meta, data, numDocsWithValue, DIRECT_MONOTONIC_BLOCK_SHIFT); - long docID = 0; - for (Number nv : values) { - switch (numberType) { - case VALUE: - if (nv != null) { - docIdsWriter.add(docID); - } - break; - case ORDINAL: - if (nv.longValue() != -1L) { - docIdsWriter.add(docID); - } - break; - default: - throw new AssertionError(); - } - docID++; - } - 
docIdsWriter.finish(); - return docID; - } - - @Override - public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { - addBinaryField(field, LegacyDocValuesIterables.binaryIterable(field, valuesProducer, maxDoc)); - } - - private void addBinaryField(FieldInfo field, Iterable values) throws IOException { - // write the byte[] data - meta.writeVInt(field.number); - meta.writeByte(Lucene70DocValuesFormat.BINARY); - int minLength = Integer.MAX_VALUE; - int maxLength = Integer.MIN_VALUE; - final long startFP = data.getFilePointer(); - long count = 0; - long missingCount = 0; - for(BytesRef v : values) { - final int length; - if (v == null) { - length = 0; - missingCount++; - } else { - length = v.length; - } - minLength = Math.min(minLength, length); - maxLength = Math.max(maxLength, length); - if (v != null) { - data.writeBytes(v.bytes, v.offset, v.length); - } - count++; - } - meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED); - if (missingCount == 0) { - meta.writeLong(ALL_LIVE); - } else if (missingCount == count) { - meta.writeLong(ALL_MISSING); - } else { - meta.writeLong(data.getFilePointer()); - writeMissingBitset(values); - } - meta.writeVInt(minLength); - meta.writeVInt(maxLength); - meta.writeVLong(count); - meta.writeLong(startFP); - - // if minLength == maxLength, it's a fixed-length byte[], we are done (the addresses are implicit) - // otherwise, we need to record the length fields... - if (minLength != maxLength) { - meta.writeLong(data.getFilePointer()); - meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT); - - final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, data, count + 1, DIRECT_MONOTONIC_BLOCK_SHIFT); - long addr = 0; - writer.add(addr); - for (BytesRef v : values) { - if (v != null) { - addr += v.length; - } - writer.add(addr); - } - writer.finish(); - meta.writeLong(data.getFilePointer()); - } - } - - /** expert: writes a value dictionary for a sorted/sortedset field */ - private void addTermsDict(FieldInfo field, final Iterable values) throws IOException { - // first check if it's a "fixed-length" terms dict, and compressibility if so - int minLength = Integer.MAX_VALUE; - int maxLength = Integer.MIN_VALUE; - long numValues = 0; - BytesRefBuilder previousValue = new BytesRefBuilder(); - long prefixSum = 0; // only valid for fixed-width data, as we have a choice there - for (BytesRef v : values) { - minLength = Math.min(minLength, v.length); - maxLength = Math.max(maxLength, v.length); - if (minLength == maxLength) { - int termPosition = (int) (numValues & INTERVAL_MASK); - if (termPosition == 0) { - // first term in block, save it away to compare against the last term later - previousValue.copyBytes(v); - } else if (termPosition == INTERVAL_COUNT - 1) { - // last term in block, accumulate shared prefix against first term - prefixSum += StringHelper.bytesDifference(previousValue.get(), v); - } - } - numValues++; - } - // for fixed width data, look at the avg(shared prefix) before deciding how to encode: - // prefix compression "costs" worst case 2 bytes per term because we must store suffix lengths. - // so if we share at least 3 bytes on average, always compress. 
- if (minLength == maxLength && prefixSum <= 3*(numValues >> INTERVAL_SHIFT)) { - // no index needed: not very compressible, direct addressing by mult - addBinaryField(field, values); - } else if (numValues < REVERSE_INTERVAL_COUNT) { - // low cardinality: waste a few KB of ram, but can't really use fancy index etc - addBinaryField(field, values); - } else { - assert numValues > 0; // we don't have to handle the empty case - // header - meta.writeVInt(field.number); - meta.writeByte(Lucene70DocValuesFormat.BINARY); - meta.writeVInt(BINARY_PREFIX_COMPRESSED); - meta.writeLong(-1L); - // now write the bytes: sharing prefixes within a block - final long startFP = data.getFilePointer(); - // currently, we have to store the delta from expected for every 1/nth term - // we could avoid this, but it's not much and less overall RAM than the previous approach! - RAMOutputStream addressBuffer = new RAMOutputStream(); - MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, MONOTONIC_BLOCK_SIZE); - // buffers up 16 terms - RAMOutputStream bytesBuffer = new RAMOutputStream(); - // buffers up block header - RAMOutputStream headerBuffer = new RAMOutputStream(); - BytesRefBuilder lastTerm = new BytesRefBuilder(); - lastTerm.grow(maxLength); - long count = 0; - int suffixDeltas[] = new int[INTERVAL_COUNT]; - for (BytesRef v : values) { - int termPosition = (int) (count & INTERVAL_MASK); - if (termPosition == 0) { - termAddresses.add(data.getFilePointer() - startFP); - // abs-encode first term - headerBuffer.writeVInt(v.length); - headerBuffer.writeBytes(v.bytes, v.offset, v.length); - lastTerm.copyBytes(v); - } else { - // prefix-code: we only share at most 255 characters, to encode the length as a single - // byte and have random access. Larger terms just get less compression. - int sharedPrefix = Math.min(255, StringHelper.bytesDifference(lastTerm.get(), v)); - bytesBuffer.writeByte((byte) sharedPrefix); - bytesBuffer.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix); - // we can encode one smaller, because terms are unique. - suffixDeltas[termPosition] = v.length - sharedPrefix - 1; - } - - count++; - // flush block - if ((count & INTERVAL_MASK) == 0) { - flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas); - } - } - // flush trailing crap - int leftover = (int) (count & INTERVAL_MASK); - if (leftover > 0) { - Arrays.fill(suffixDeltas, leftover, suffixDeltas.length, 0); - flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas); - } - final long indexStartFP = data.getFilePointer(); - // write addresses of indexed terms - termAddresses.finish(); - addressBuffer.writeTo(data); - addressBuffer = null; - termAddresses = null; - meta.writeVInt(minLength); - meta.writeVInt(maxLength); - meta.writeVLong(count); - meta.writeLong(startFP); - meta.writeLong(indexStartFP); - meta.writeVInt(PackedInts.VERSION_CURRENT); - meta.writeVInt(MONOTONIC_BLOCK_SIZE); - addReverseTermIndex(field, values, maxLength); - } - } - // writes term dictionary "block" - // first term is absolute encoded as vint length + bytes. - // lengths of subsequent N terms are encoded as either N bytes or N shorts. - // in the double-byte case, the first byte is indicated with -1. - // subsequent terms are encoded as byte suffixLength + bytes. 
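Reviewer note, as a hedged illustration of the legacy block header that the removed comment above specifies: the matching decode step might look like the sketch below. The helper name readSuffixDeltas and its DataInput parameter are invented here for illustration; they are not part of this patch.

// Hedged sketch (not in this patch): decoding the legacy suffix-length header
// written by flushTermsDictBlock below. A leading byte of 255 signals that the
// 15 deltas were written as shorts; otherwise each delta is a single byte.
static int[] readSuffixDeltas(org.apache.lucene.store.DataInput in) throws java.io.IOException {
  int[] suffixDeltas = new int[16];   // slot 0 belongs to the absolute-encoded first term
  int first = Byte.toUnsignedInt(in.readByte());
  if (first == 255) {                 // two-byte case: read 15 shorts
    for (int i = 1; i < 16; i++) {
      suffixDeltas[i] = Short.toUnsignedInt(in.readShort());
    }
  } else {                            // one-byte case: the first delta was already consumed
    suffixDeltas[1] = first;
    for (int i = 2; i < 16; i++) {
      suffixDeltas[i] = Byte.toUnsignedInt(in.readByte());
    }
  }
  return suffixDeltas;
}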
- private void flushTermsDictBlock(RAMOutputStream headerBuffer, RAMOutputStream bytesBuffer, int suffixDeltas[]) throws IOException { - boolean twoByte = false; - for (int i = 1; i < suffixDeltas.length; i++) { - if (suffixDeltas[i] > 254) { - twoByte = true; - } - } - if (twoByte) { - headerBuffer.writeByte((byte)255); - for (int i = 1; i < suffixDeltas.length; i++) { - headerBuffer.writeShort((short) suffixDeltas[i]); - } - } else { - for (int i = 1; i < suffixDeltas.length; i++) { - headerBuffer.writeByte((byte) suffixDeltas[i]); - } - } - headerBuffer.writeTo(data); - headerBuffer.reset(); - bytesBuffer.writeTo(data); - bytesBuffer.reset(); - } - - // writes reverse term index: used for binary searching a term into a range of 64 blocks - // for every 64 blocks (1024 terms) we store a term, trimming any suffix unnecessary for comparison - // terms are written as a contiguous byte[], but never spanning 2^15 byte boundaries. - private void addReverseTermIndex(FieldInfo field, final Iterable values, int maxLength) throws IOException { - long count = 0; - BytesRefBuilder priorTerm = new BytesRefBuilder(); - priorTerm.grow(maxLength); - BytesRef indexTerm = new BytesRef(); - long startFP = data.getFilePointer(); - PagedBytes pagedBytes = new PagedBytes(15); - MonotonicBlockPackedWriter addresses = new MonotonicBlockPackedWriter(data, MONOTONIC_BLOCK_SIZE); - - for (BytesRef b : values) { - int termPosition = (int) (count & REVERSE_INTERVAL_MASK); - if (termPosition == 0) { - int len = StringHelper.sortKeyLength(priorTerm.get(), b); - indexTerm.bytes = b.bytes; - indexTerm.offset = b.offset; - indexTerm.length = len; - addresses.add(pagedBytes.copyUsingLengthPrefix(indexTerm)); - } else if (termPosition == REVERSE_INTERVAL_MASK) { - priorTerm.copyBytes(b); - } - count++; - } - addresses.finish(); - long numBytes = pagedBytes.getPointer(); - pagedBytes.freeze(true); - PagedBytesDataInput in = pagedBytes.getDataInput(); - meta.writeLong(startFP); - data.writeVLong(numBytes); - data.copyBytes(in, numBytes); - } - - @Override - public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { - meta.writeVInt(field.number); - meta.writeByte(Lucene70DocValuesFormat.SORTED); - addTermsDict(field, LegacyDocValuesIterables.valuesIterable(valuesProducer.getSorted(field))); - addNumericField(field, LegacyDocValuesIterables.sortedOrdIterable(valuesProducer, field, maxDoc), NumberType.ORDINAL); - } - - private void addSortedField(FieldInfo field, Iterable values, Iterable ords) throws IOException { - meta.writeVInt(field.number); - meta.writeByte(Lucene70DocValuesFormat.SORTED); - addTermsDict(field, values); - addNumericField(field, ords, NumberType.ORDINAL); - } - - @Override - public void addSortedNumericField(FieldInfo field, final DocValuesProducer valuesProducer) throws IOException { - - final Iterable docToValueCount = LegacyDocValuesIterables.sortedNumericToDocCount(valuesProducer, field, maxDoc); - final Iterable values = LegacyDocValuesIterables.sortedNumericToValues(valuesProducer, field); - - meta.writeVInt(field.number); - meta.writeByte(Lucene70DocValuesFormat.SORTED_NUMERIC); - if (isSingleValued(docToValueCount)) { - meta.writeVInt(SORTED_SINGLE_VALUED); - // The field is single-valued, we can encode it as NUMERIC - addNumericField(field, singletonView(docToValueCount, values, null), NumberType.VALUE); - } else { - final SortedSet uniqueValueSets = uniqueValueSets(docToValueCount, values); - if (uniqueValueSets != null) { - 
meta.writeVInt(SORTED_SET_TABLE); - - // write the set_id -> values mapping - writeDictionary(uniqueValueSets); - - // write the doc -> set_id as a numeric field - addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), NumberType.ORDINAL); - } else { - meta.writeVInt(SORTED_WITH_ADDRESSES); - // write the stream of values as a numeric field - addNumericField(field, values, NumberType.VALUE); - // write the doc -> ord count as a absolute index to the stream - addOrdIndex(field, docToValueCount); - } - } - } - - @Override - public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { - - Iterable values = LegacyDocValuesIterables.valuesIterable(valuesProducer.getSortedSet(field)); - Iterable docToOrdCount = LegacyDocValuesIterables.sortedSetOrdCountIterable(valuesProducer, field, maxDoc); - Iterable ords = LegacyDocValuesIterables.sortedSetOrdsIterable(valuesProducer, field); - - meta.writeVInt(field.number); - meta.writeByte(Lucene70DocValuesFormat.SORTED_SET); - - if (isSingleValued(docToOrdCount)) { - meta.writeVInt(SORTED_SINGLE_VALUED); - // The field is single-valued, we can encode it as SORTED - addSortedField(field, values, singletonView(docToOrdCount, ords, -1L)); - } else { - final SortedSet uniqueValueSets = uniqueValueSets(docToOrdCount, ords); - if (uniqueValueSets != null) { - meta.writeVInt(SORTED_SET_TABLE); - - // write the set_id -> ords mapping - writeDictionary(uniqueValueSets); - - // write the ord -> byte[] as a binary field - addTermsDict(field, values); - - // write the doc -> set_id as a numeric field - addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), NumberType.ORDINAL); - } else { - meta.writeVInt(SORTED_WITH_ADDRESSES); - - // write the ord -> byte[] as a binary field - addTermsDict(field, values); - - // write the stream of ords as a numeric field - // NOTE: we could return an iterator that delta-encodes these within a doc - addNumericField(field, ords, NumberType.ORDINAL); - - // write the doc -> ord count as a absolute index to the stream - addOrdIndex(field, docToOrdCount); - } - } - } - - private SortedSet uniqueValueSets(Iterable docToValueCount, Iterable values) { - Set uniqueValueSet = new HashSet<>(); - LongsRef docValues = new LongsRef(256); - - Iterator valueCountIterator = docToValueCount.iterator(); - Iterator valueIterator = values.iterator(); - int totalDictSize = 0; - while (valueCountIterator.hasNext()) { - docValues.length = valueCountIterator.next().intValue(); - if (docValues.length > 256) { - return null; - } - for (int i = 0; i < docValues.length; ++i) { - docValues.longs[i] = valueIterator.next().longValue(); - } - if (uniqueValueSet.contains(docValues)) { - continue; - } - totalDictSize += docValues.length; - if (totalDictSize > 256) { - return null; - } - uniqueValueSet.add(new LongsRef(Arrays.copyOf(docValues.longs, docValues.length), 0, docValues.length)); - } - assert valueIterator.hasNext() == false; - return new TreeSet<>(uniqueValueSet); - } - - private void writeDictionary(SortedSet uniqueValueSets) throws IOException { - int lengthSum = 0; - for (LongsRef longs : uniqueValueSets) { - lengthSum += longs.length; - } - - meta.writeInt(lengthSum); - for (LongsRef valueSet : uniqueValueSets) { - for (int i = 0; i < valueSet.length; ++i) { - meta.writeLong(valueSet.longs[valueSet.offset + i]); - } - } - - meta.writeInt(uniqueValueSets.size()); - for (LongsRef valueSet : uniqueValueSets) { - meta.writeInt(valueSet.length); - } - } - - private Iterable 
docToSetId(SortedSet uniqueValueSets, Iterable docToValueCount, Iterable values) { - final Map setIds = new HashMap<>(); - int i = 0; - for (LongsRef set : uniqueValueSets) { - setIds.put(set, i++); - } - assert i == uniqueValueSets.size(); - - return new Iterable() { - - @Override - public Iterator iterator() { - final Iterator valueCountIterator = docToValueCount.iterator(); - final Iterator valueIterator = values.iterator(); - final LongsRef docValues = new LongsRef(256); - return new Iterator() { - - @Override - public boolean hasNext() { - return valueCountIterator.hasNext(); - } - - @Override - public Number next() { - docValues.length = valueCountIterator.next().intValue(); - for (int i = 0; i < docValues.length; ++i) { - docValues.longs[i] = valueIterator.next().longValue(); - } - final Integer id = setIds.get(docValues); - assert id != null; - return id; - } - - }; - - } - }; - } - - // writes addressing information as MONOTONIC_COMPRESSED integer - private void addOrdIndex(FieldInfo field, Iterable values) throws IOException { - meta.writeVInt(field.number); - meta.writeByte(Lucene70DocValuesFormat.NUMERIC); - meta.writeVInt(MONOTONIC_COMPRESSED); - meta.writeLong(-1L); - meta.writeLong(data.getFilePointer()); - meta.writeVLong(maxDoc); - meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT); - - final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, data, maxDoc + 1, DIRECT_MONOTONIC_BLOCK_SHIFT); - long addr = 0; - writer.add(addr); - for (Number v : values) { - addr += v.longValue(); - writer.add(addr); - } - writer.finish(); - meta.writeLong(data.getFilePointer()); - } @Override public void close() throws IOException { boolean success = false; try { if (meta != null) { - meta.writeVInt(-1); // write EOF marker + meta.writeInt(-1); // write EOF marker CodecUtil.writeFooter(meta); // write checksum } if (data != null) { @@ -794,4 +98,425 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close meta = data = null; } } + + @Override + public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + meta.writeInt(field.number); + meta.writeByte(Lucene70DocValuesFormat.NUMERIC); + + writeValues(field, new EmptyDocValuesProducer() { + @Override + public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { + return DocValues.singleton(valuesProducer.getNumeric(field)); + } + }); + } + + private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + SortedNumericDocValues values = valuesProducer.getSortedNumeric(field); + int numDocsWithValue = 0; + long numValues = 0; + long min = Long.MAX_VALUE; + long max = Long.MIN_VALUE; + long gcd = 0; + Set uniqueValues = new HashSet<>(); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + for (int i = 0, count = values.docValueCount(); i < count; ++i) { + long v = values.nextValue(); + + if (gcd != 1) { + if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { + // in that case v - minValue might overflow and make the GCD computation return + // wrong results. 
Since these extreme values are unlikely, we just discard + // GCD computation for them + gcd = 1; + } else if (numValues != 0) { // minValue needs to be set first + gcd = MathUtil.gcd(gcd, v - min); + } + } + + min = Math.min(min, v); + max = Math.max(max, v); + + if (uniqueValues != null + && uniqueValues.add(v) + && uniqueValues.size() > 256) { + uniqueValues = null; + } + + numValues++; + } + + numDocsWithValue++; + } + + if (numDocsWithValue == 0) { + meta.writeLong(-2); + meta.writeLong(0L); + } else if (numDocsWithValue == maxDoc) { + meta.writeLong(-1); + meta.writeLong(0L); + } else { + long offset = data.getFilePointer(); + meta.writeLong(offset); + values = valuesProducer.getSortedNumeric(field); + IndexedDISI.writeBitSet(values, data); + meta.writeLong(data.getFilePointer() - offset); + } + + meta.writeLong(numValues); + final int numBitsPerValue; + Map encode = null; + if (min >= max) { + numBitsPerValue = 0; + meta.writeInt(-1); + } else { + if (uniqueValues != null + && uniqueValues.size() > 1 + && DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1) < DirectWriter.unsignedBitsRequired((max - min) / gcd)) { + numBitsPerValue = DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1); + final Long[] sortedUniqueValues = uniqueValues.toArray(new Long[0]); + Arrays.sort(sortedUniqueValues); + meta.writeInt(sortedUniqueValues.length); + for (Long v : sortedUniqueValues) { + meta.writeLong(v); + } + encode = new HashMap<>(); + for (int i = 0; i < sortedUniqueValues.length; ++i) { + encode.put(sortedUniqueValues[i], i); + } + min = 0; + gcd = 1; + } else { + uniqueValues = null; + numBitsPerValue = DirectWriter.unsignedBitsRequired((max - min) / gcd); + if (gcd == 1 && min > 0 + && DirectWriter.unsignedBitsRequired(max) == DirectWriter.unsignedBitsRequired(max - min)) { + min = 0; + } + meta.writeInt(-1); + } + } + + meta.writeByte((byte) numBitsPerValue); + meta.writeLong(min); + meta.writeLong(gcd); + long startOffset = data.getFilePointer(); + meta.writeLong(startOffset); + if (numBitsPerValue != 0) { + values = valuesProducer.getSortedNumeric(field); + DirectWriter writer = DirectWriter.getInstance(data, numValues, numBitsPerValue); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + for (int i = 0, count = values.docValueCount(); i < count; ++i) { + long v = values.nextValue(); + if (encode == null) { + writer.add((v - min) / gcd); + } else { + writer.add(encode.get(v)); + } + } + } + writer.finish(); + } + meta.writeLong(data.getFilePointer() - startOffset); + + return new long[] {numDocsWithValue, numValues}; + } + + @Override + public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + meta.writeInt(field.number); + meta.writeByte(Lucene70DocValuesFormat.BINARY); + + BinaryDocValues values = valuesProducer.getBinary(field); + long start = data.getFilePointer(); + meta.writeLong(start); + int numDocsWithField = 0; + int minLength = Integer.MAX_VALUE; + int maxLength = 0; + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + numDocsWithField++; + BytesRef v = values.binaryValue(); + int length = v.length; + data.writeBytes(v.bytes, v.offset, v.length); + minLength = Math.min(length, minLength); + maxLength = Math.max(length, maxLength); + } + assert numDocsWithField <= maxDoc; + meta.writeLong(data.getFilePointer() - start); + + if (numDocsWithField == 0) { + meta.writeLong(-2); + meta.writeLong(0L); + } else if (numDocsWithField == 
maxDoc) { + meta.writeLong(-1); + meta.writeLong(0L); + } else { + long offset = data.getFilePointer(); + meta.writeLong(offset); + values = valuesProducer.getBinary(field); + IndexedDISI.writeBitSet(values, data); + meta.writeLong(data.getFilePointer() - offset); + } + + meta.writeInt(numDocsWithField); + meta.writeInt(minLength); + meta.writeInt(maxLength); + if (maxLength > minLength) { + start = data.getFilePointer(); + meta.writeLong(start); + meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT); + + final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT); + long addr = 0; + writer.add(addr); + values = valuesProducer.getBinary(field); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + addr += values.binaryValue().length; + writer.add(addr); + } + writer.finish(); + meta.writeLong(data.getFilePointer() - start); + } + } + + @Override + public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + meta.writeInt(field.number); + meta.writeByte(Lucene70DocValuesFormat.SORTED); + doAddSortedField(field, valuesProducer); + } + + private void doAddSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + SortedDocValues values = valuesProducer.getSorted(field); + int numDocsWithField = 0; + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + numDocsWithField++; + } + + if (numDocsWithField == 0) { + meta.writeLong(-2); + meta.writeLong(0L); + } else if (numDocsWithField == maxDoc) { + meta.writeLong(-1); + meta.writeLong(0L); + } else { + long offset = data.getFilePointer(); + meta.writeLong(offset); + values = valuesProducer.getSorted(field); + IndexedDISI.writeBitSet(values, data); + meta.writeLong(data.getFilePointer() - offset); + } + + meta.writeInt(numDocsWithField); + if (values.getValueCount() <= 1) { + meta.writeByte((byte) 0); + meta.writeLong(0L); + meta.writeLong(0L); + } else { + int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1); + meta.writeByte((byte) numberOfBitsPerOrd); + long start = data.getFilePointer(); + meta.writeLong(start); + DirectWriter writer = DirectWriter.getInstance(data, numDocsWithField, numberOfBitsPerOrd); + values = valuesProducer.getSorted(field); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + writer.add(values.ordValue()); + } + writer.finish(); + meta.writeLong(data.getFilePointer() - start); + } + + addTermsDict(DocValues.singleton(valuesProducer.getSorted(field))); + } + + private void addTermsDict(SortedSetDocValues values) throws IOException { + final long size = values.getValueCount(); + meta.writeVLong(size); + meta.writeInt(Lucene70DocValuesFormat.TERMS_DICT_BLOCK_SHIFT); + + RAMOutputStream addressBuffer = new RAMOutputStream(); + meta.writeInt(DIRECT_MONOTONIC_BLOCK_SHIFT); + long numBlocks = (size + Lucene70DocValuesFormat.TERMS_DICT_BLOCK_MASK) >>> Lucene70DocValuesFormat.TERMS_DICT_BLOCK_SHIFT; + DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, addressBuffer, numBlocks, DIRECT_MONOTONIC_BLOCK_SHIFT); + + BytesRefBuilder previous = new BytesRefBuilder(); + long ord = 0; + long start = data.getFilePointer(); + int maxLength = 0; + TermsEnum iterator = values.termsEnum(); + for (BytesRef term = iterator.next(); term != null; term = iterator.next()) { + if ((ord & 
Lucene70DocValuesFormat.TERMS_DICT_BLOCK_MASK) == 0) {
+        writer.add(data.getFilePointer() - start);
+        data.writeVInt(term.length);
+        data.writeBytes(term.bytes, term.offset, term.length);
+      } else {
+        final int prefixLength = StringHelper.bytesDifference(previous.get(), term);
+        final int suffixLength = term.length - prefixLength;
+        assert suffixLength > 0; // terms are unique
+
+        data.writeByte((byte) (Math.min(prefixLength, 15) | (Math.min(15, suffixLength - 1) << 4)));
+        if (prefixLength >= 15) {
+          data.writeVInt(prefixLength - 15);
+        }
+        if (suffixLength >= 16) {
+          data.writeVInt(suffixLength - 16);
+        }
+        data.writeBytes(term.bytes, term.offset + prefixLength, term.length - prefixLength);
+      }
+      maxLength = Math.max(maxLength, term.length);
+      previous.copyBytes(term);
+      ++ord;
+    }
+    writer.finish();
+    meta.writeInt(maxLength);
+    meta.writeLong(start);
+    meta.writeLong(data.getFilePointer() - start);
+    start = data.getFilePointer();
+    addressBuffer.writeTo(data);
+    meta.writeLong(start);
+    meta.writeLong(data.getFilePointer() - start);
+
+    // Now write the reverse terms index
+    writeTermsIndex(values);
+  }
+
+  private void writeTermsIndex(SortedSetDocValues values) throws IOException {
+    final long size = values.getValueCount();
+    meta.writeInt(Lucene70DocValuesFormat.TERMS_DICT_REVERSE_INDEX_SHIFT);
+    long start = data.getFilePointer();
+
+    long numBlocks = 1L + ((size + Lucene70DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) >>> Lucene70DocValuesFormat.TERMS_DICT_REVERSE_INDEX_SHIFT);
+    RAMOutputStream addressBuffer = new RAMOutputStream();
+    DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, addressBuffer, numBlocks, DIRECT_MONOTONIC_BLOCK_SHIFT);
+
+    TermsEnum iterator = values.termsEnum();
+    BytesRefBuilder previous = new BytesRefBuilder();
+    long offset = 0;
+    long ord = 0;
+    for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
+      if ((ord & Lucene70DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) == 0) {
+        writer.add(offset);
+        int sortKeyLength = StringHelper.sortKeyLength(previous.get(), term);
+        offset += sortKeyLength;
+        data.writeBytes(term.bytes, term.offset, sortKeyLength);
+      } else if ((ord & Lucene70DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) == Lucene70DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) {
+        previous.copyBytes(term);
+      }
+      ++ord;
+    }
+    writer.add(offset);
+    writer.finish();
+    meta.writeLong(start);
+    meta.writeLong(data.getFilePointer() - start);
+    start = data.getFilePointer();
+    addressBuffer.writeTo(data);
+    meta.writeLong(start);
+    meta.writeLong(data.getFilePointer() - start);
+  }
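Reviewer note on the packed length byte written in addTermsDict above: the common case costs a single byte, with the low nibble carrying min(prefixLength, 15) and the high nibble carrying min(suffixLength - 1, 15); a VInt continuation follows whenever a nibble saturates. A hedged sketch of the corresponding decode step, with a helper name and arguments invented for illustration (the real logic lives on the producer side):

// Reads one prefix-compressed term into 'term', which still holds the previous
// term's bytes so the shared prefix can be reused in place.
static int readCompressedTerm(org.apache.lucene.store.DataInput in, byte[] term) throws java.io.IOException {
  int token = Byte.toUnsignedInt(in.readByte());
  int prefixLength = token & 0x0F;            // low nibble
  int suffixLength = 1 + (token >>> 4);       // high nibble, offset by one since terms are unique
  if (prefixLength == 15) {                   // saturated: remainder follows as a VInt
    prefixLength += in.readVInt();
  }
  if (suffixLength == 16) {                   // saturated: remainder follows as a VInt
    suffixLength += in.readVInt();
  }
  in.readBytes(term, prefixLength, suffixLength); // append the new suffix after the shared prefix
  return prefixLength + suffixLength;             // length of the decoded term
}

Worst case this spends two extra VInts on long terms, trading a little size on outliers for single-byte headers on typical dictionary terms.

+
+  @Override
+  public void addSortedNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
+    meta.writeInt(field.number);
+    meta.writeByte(Lucene70DocValuesFormat.SORTED_NUMERIC);
+
+    long[] stats = writeValues(field, valuesProducer);
+    int numDocsWithField = Math.toIntExact(stats[0]);
+    long numValues = stats[1];
+    assert numValues >= numDocsWithField;
+
+    meta.writeInt(numDocsWithField);
+    if (numValues > numDocsWithField) {
+      long start = data.getFilePointer();
+      meta.writeLong(start);
+      meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
+
+      final DirectMonotonicWriter addressesWriter = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1L, DIRECT_MONOTONIC_BLOCK_SHIFT);
+      long addr = 0;
+      addressesWriter.add(addr);
+      SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
+      for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
+        addr 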
+= values.docValueCount(); + addressesWriter.add(addr); + } + addressesWriter.finish(); + meta.writeLong(data.getFilePointer() - start); + } + } + + @Override + public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + meta.writeInt(field.number); + meta.writeByte(Lucene70DocValuesFormat.SORTED_SET); + + SortedSetDocValues values = valuesProducer.getSortedSet(field); + int numDocsWithField = 0; + long numOrds = 0; + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + numDocsWithField++; + for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) { + numOrds++; + } + } + + if (numDocsWithField == numOrds) { + meta.writeByte((byte) 0); + doAddSortedField(field, new EmptyDocValuesProducer() { + @Override + public SortedDocValues getSorted(FieldInfo field) throws IOException { + return SortedSetSelector.wrap(valuesProducer.getSortedSet(field), SortedSetSelector.Type.MIN); + } + }); + return; + } + meta.writeByte((byte) 1); + + assert numDocsWithField != 0; + if (numDocsWithField == maxDoc) { + meta.writeLong(-1); + meta.writeLong(0L); + } else { + long offset = data.getFilePointer(); + meta.writeLong(offset); + values = valuesProducer.getSortedSet(field); + IndexedDISI.writeBitSet(values, data); + meta.writeLong(data.getFilePointer() - offset); + } + + int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1); + meta.writeByte((byte) numberOfBitsPerOrd); + long start = data.getFilePointer(); + meta.writeLong(start); + DirectWriter writer = DirectWriter.getInstance(data, numOrds, numberOfBitsPerOrd); + values = valuesProducer.getSortedSet(field); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) { + writer.add(ord); + } + } + writer.finish(); + meta.writeLong(data.getFilePointer() - start); + + meta.writeInt(numDocsWithField); + start = data.getFilePointer(); + meta.writeLong(start); + meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT); + + final DirectMonotonicWriter addressesWriter = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT); + long addr = 0; + addressesWriter.add(addr); + values = valuesProducer.getSortedSet(field); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + values.nextOrd(); + addr++; + while (values.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) { + addr++; + } + addressesWriter.add(addr); + } + addressesWriter.finish(); + meta.writeLong(data.getFilePointer() - start); + + addTermsDict(values); + } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java index 2112341415c..ee477d666ee 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java @@ -23,39 +23,64 @@ import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import 
org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.SmallFloat;
 import org.apache.lucene.util.packed.DirectWriter;
 
 /**
  * Lucene 7.0 DocValues format.
  * <p>
- * Encodes the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) with these strategies:
+ * Documents that have a value for the field are encoded in a way that it is always possible to
+ * know the ordinal of the current document in the set of documents that have a value. For instance,
+ * say the set of documents that have a value for the field is {1, 5, 6, 11}. When the
+ * iterator is on 6, it knows that this is the 3rd item of the set. This way, values can
+ * be stored densely and accessed based on their index at search time. If all documents in a segment
+ * have a value for the field, the index is the same as the doc ID, so this case is encoded implicitly
+ * and is very fast at query time. On the other hand if some documents are missing a value for the
+ * field then the set of documents that have a value is encoded into blocks. All doc IDs that share
+ * the same upper 16 bits are encoded into the same block with the following strategies:
+ * <ul>
+ *   <li>SPARSE: This strategy is used when a block contains at most 4095 documents. The lower 16
+ *       bits of doc IDs are stored as {@link DataOutput#writeShort(short) shorts} while the upper
+ *       16 bits are given by the block ID.
+ *   <li>DENSE: This strategy is used when a block contains between 4096 and 65535 documents. The
+ *       lower bits of doc IDs are stored in a bit set. Advancing is performed using
+ *       {@link Long#numberOfTrailingZeros(long) ntz} operations while the index is computed by
+ *       accumulating the {@link Long#bitCount(long) bit counts} of the visited longs.
+ *   <li>ALL: This strategy is used when a block contains exactly 65536 documents, meaning that
+ *       the block is full. In that case doc IDs do not need to be stored explicitly. This is
+ *       typically faster than both SPARSE and DENSE which is a reason why it is preferable to have
+ *       all documents that have a value for a field using contiguous doc IDs, for instance by
+ *       using {@link IndexWriterConfig#setIndexSort(org.apache.lucene.search.Sort) index sorting}.
+ * </ul>
+ * <p>
+ * Then the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) are
+ * encoded using the following strategies:
  * <p>
  * {@link DocValuesType#NUMERIC NUMERIC}:
  * <ul>
  *   <li>Delta-compressed: per-document integers written as deltas from the minimum value,
  *       compressed with bitpacking. For more information, see {@link DirectWriter}.
  *   <li>Table-compressed: when the number of unique values is very small (< 256), and
- *     when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
- *     a lookup table is written instead. Each per-document entry is instead the ordinal
- *     to this table, and those ordinals are compressed with bitpacking ({@link DirectWriter}).
+ *       when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
+ *       a lookup table is written instead. Each per-document entry is instead the ordinal
+ *       to this table, and those ordinals are compressed with bitpacking ({@link DirectWriter}).
  *   <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest
  *       common denominator (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
  *   <li>Monotonic-compressed: when all numbers are monotonically increasing offsets, they are written
  *       as blocks of bitpacked integers, encoding the deviation from the expected delta.
- *   <li>Const-compressed: when there is only one possible non-missing value, only the missing
- *       bitset is encoded.
- *   <li>Sparse-compressed: only documents with a value are stored, and lookups are performed
- *       using binary search.
+ *   <li>Const-compressed: when there is only one possible value, no per-document data is needed and
+ *       this value is encoded alone.
  * </ul>
  * <p>
  * {@link DocValuesType#BINARY BINARY}:
  * <ul>
  *   <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
- *     Each document's value can be addressed directly with multiplication ({@code docID * length}).
- *   <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
+ *       Each document's value can be addressed directly with multiplication ({@code docID * length}).
+ *   <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
  *       for each document. The addresses are written as Monotonic-compressed numerics.
  *   <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written
  *       completely and other values sharing prefixes. chunk addresses are written as Monotonic-compressed
@@ -64,27 +89,21 @@ import org.apache.lucene.util.packed.DirectWriter;
  * <p>
  * {@link DocValuesType#SORTED SORTED}:
  * <ul>
- *   <li>Sorted: a mapping of ordinals to deduplicated terms is written as Binary,
+ *   <li>Sorted: a mapping of ordinals to deduplicated terms is written as Prefix-compressed Binary,
  *       along with the per-document ordinals written using one of the numeric strategies above.
  * </ul>
  * <p>
  * {@link DocValuesType#SORTED_SET SORTED_SET}:
  * <ul>
  *   <li>Single: if all documents have 0 or 1 value, then data are written like SORTED.
- *   <li>SortedSet table: when there are few unique sets of values (< 256) then each set is assigned
- *       an id, a lookup table is written and the mapping from document to set id is written using the
- *       numeric strategies above.
- *   <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary,
- *     an ordinal list and per-document index into this list are written using the numeric strategies
+ *   <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary,
+ *       an ordinal list and per-document index into this list are written using the numeric strategies
  *       above.
  * </ul>
  * <p>
  * {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}:
  * <ul>
  *   <li>Single: if all documents have 0 or 1 value, then data are written like NUMERIC.
- *   <li>SortedSet table: when there are few unique sets of values (< 256) then each set is assigned
- *       an id, a lookup table is written and the mapping from document to set id is written using the
- *       numeric strategies above.
  *   <li>SortedNumeric: a value list and per-document index into this list are written using the numeric
  *       strategies above.
  * </ul>
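Reviewer note, as a concrete reading of the SPARSE/DENSE/ALL list in the javadoc above: a minimal sketch of the per-block decision follows. This is simplified for illustration (the class name is invented; the actual encoding is performed by the new IndexedDISI writer used throughout this patch), with the thresholds taken directly from the javadoc.

final class DisiBlockSketch {
  // cardinality = number of docs with a value among the 65536 doc IDs of one block,
  // i.e. among all doc IDs that share the same upper 16 bits.
  static String pickBlockStrategy(int cardinality) {
    if (cardinality == 65536) {
      return "ALL";     // full block: doc IDs are implicit, nothing stored
    } else if (cardinality >= 4096) {
      return "DENSE";   // bit set; index computed by accumulating Long.bitCount over the words
    } else {
      return "SPARSE";  // at most 4095 docs: lower 16 bits written as shorts
    }
  }
}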
@@ -112,72 +131,30 @@ public final class Lucene70DocValuesFormat extends DocValuesFormat {
   public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
     return new Lucene70DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
   }
-
-  static final String DATA_CODEC = "Lucene54DocValuesData";
+
+  static final String DATA_CODEC = "Lucene70DocValuesData";
   static final String DATA_EXTENSION = "dvd";
-  static final String META_CODEC = "Lucene54DocValuesMetadata";
+  static final String META_CODEC = "Lucene70DocValuesMetadata";
   static final String META_EXTENSION = "dvm";
   static final int VERSION_START = 0;
   static final int VERSION_CURRENT = VERSION_START;
-
+
   // indicates docvalues type
   static final byte NUMERIC = 0;
   static final byte BINARY = 1;
   static final byte SORTED = 2;
   static final byte SORTED_SET = 3;
   static final byte SORTED_NUMERIC = 4;
-
-  // address terms in blocks of 16 terms
-  static final int INTERVAL_SHIFT = 4;
-  static final int INTERVAL_COUNT = 1 << INTERVAL_SHIFT;
-  static final int INTERVAL_MASK = INTERVAL_COUNT - 1;
-
-  // build reverse index from every 1024th term
-  static final int REVERSE_INTERVAL_SHIFT = 10;
-  static final int REVERSE_INTERVAL_COUNT = 1 << REVERSE_INTERVAL_SHIFT;
-  static final int REVERSE_INTERVAL_MASK = REVERSE_INTERVAL_COUNT - 1;
-
-  // for conversion from reverse index to block
-  static final int BLOCK_INTERVAL_SHIFT = REVERSE_INTERVAL_SHIFT - INTERVAL_SHIFT;
-  static final int BLOCK_INTERVAL_COUNT = 1 << BLOCK_INTERVAL_SHIFT;
-  static final int BLOCK_INTERVAL_MASK = BLOCK_INTERVAL_COUNT - 1;
-
-  /** Compressed using packed blocks of ints. */
-  static final int DELTA_COMPRESSED = 0;
-  /** Compressed by computing the GCD. */
-  static final int GCD_COMPRESSED = 1;
-  /** Compressed by giving IDs to unique values. */
-  static final int TABLE_COMPRESSED = 2;
-  /** Compressed with monotonically increasing values */
-  static final int MONOTONIC_COMPRESSED = 3;
-  /** Compressed with constant value (uses only missing bitset) */
-  static final int CONST_COMPRESSED = 4;
-  /** Compressed with sparse arrays. */
-  static final int SPARSE_COMPRESSED = 5;
-
-  /** Uncompressed binary, written directly (fixed length). */
-  static final int BINARY_FIXED_UNCOMPRESSED = 0;
-  /** Uncompressed binary, written directly (variable length). */
-  static final int BINARY_VARIABLE_UNCOMPRESSED = 1;
-  /** Compressed binary with shared prefixes */
-  static final int BINARY_PREFIX_COMPRESSED = 2;
-
-  /** Standard storage for sorted set values with 1 level of indirection:
-   *  {@code docId -> address -> ord}. */
-  static final int SORTED_WITH_ADDRESSES = 0;
-  /** Single-valued sorted set values, encoded as sorted values, so no level
-   *  of indirection: {@code docId -> ord}. */
-  static final int SORTED_SINGLE_VALUED = 1;
-  /** Compressed giving IDs to unique sets of values:
-   *  {@code docId -> setId -> ords} */
-  static final int SORTED_SET_TABLE = 2;
-
-  /** placeholder for missing offset that means there are no missing values */
-  static final int ALL_LIVE = -1;
-  /** placeholder for missing offset that means all values are missing */
-  static final int ALL_MISSING = -2;
-
   // addressing uses 16k blocks
   static final int MONOTONIC_BLOCK_SIZE = 16384;
   static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16;
+
+  static final int TERMS_DICT_BLOCK_SHIFT = 4;
+  static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT;
+  static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1;
+
+  static final int TERMS_DICT_REVERSE_INDEX_SHIFT = 10;
+  static final int TERMS_DICT_REVERSE_INDEX_SIZE = 1 << TERMS_DICT_REVERSE_INDEX_SHIFT;
+  static final int TERMS_DICT_REVERSE_INDEX_MASK = TERMS_DICT_REVERSE_INDEX_SIZE - 1;
 }
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java
index e806ea5260d..755da79ac86 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java
@@ -18,84 +18,53 @@ package org.apache.lucene.codecs.lucene70;
 import java.io.Closeable;
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;
-import java.util.concurrent.atomic.AtomicLong;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.DocValuesProducer;
-import org.apache.lucene.codecs.lucene70.Lucene70DocValuesConsumer.NumberType;
-import org.apache.lucene.index.*;
-import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.TermsEnum.SeekStatus;
 import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.RandomAccessInput;
-import org.apache.lucene.util.Accountable;
-import org.apache.lucene.util.Accountables;
-import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LongValues;
-import org.apache.lucene.util.PagedBytes;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.packed.DirectMonotonicReader;
 import org.apache.lucene.util.packed.DirectReader;
-import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
-
-import static org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat.*;
 
 /** reader for {@link Lucene70DocValuesFormat} */
 final class Lucene70DocValuesProducer extends DocValuesProducer implements Closeable {
   private final Map<String,NumericEntry> numerics = new HashMap<>();
   private final Map<String,BinaryEntry> binaries = new HashMap<>();
+  private final Map<String,SortedEntry> sorted = new HashMap<>();
   private final Map<String,SortedSetEntry> sortedSets = new HashMap<>();
-  private final Map<String,SortedSetEntry> sortedNumerics = new HashMap<>();
-  private final Map<String,NumericEntry> ords = new HashMap<>();
-  private final Map<String,NumericEntry> ordIndexes = new HashMap<>();
-  private final int numFields;
-  private final AtomicLong ramBytesUsed;
+  private final Map<String,SortedNumericEntry> sortedNumerics = new HashMap<>();
+  private long ramBytesUsed;
   private final IndexInput data;
   private final int maxDoc;
 
-  // memory-resident structures
-  private final Map<String,MonotonicBlockPackedReader> addressInstances = new HashMap<>();
-  private final Map<String,ReverseTermsIndex> reverseIndexInstances = new HashMap<>();
-  private final Map<String,DirectMonotonicReader.Meta> directAddressesMeta = new HashMap<>();
-
-  private final boolean merging;
-
-  // clone for merge: when merging we don't do any instances.put()s
-  Lucene70DocValuesProducer(Lucene70DocValuesProducer original) throws IOException {
-    assert Thread.holdsLock(original);
-    numerics.putAll(original.numerics);
-    binaries.putAll(original.binaries);
-    sortedSets.putAll(original.sortedSets);
-    sortedNumerics.putAll(original.sortedNumerics);
-    ords.putAll(original.ords);
-    ordIndexes.putAll(original.ordIndexes);
-    numFields = original.numFields;
-    ramBytesUsed = new AtomicLong(original.ramBytesUsed.get());
-    data = original.data.clone();
-    maxDoc = original.maxDoc;
-
-    addressInstances.putAll(original.addressInstances);
-    reverseIndexInstances.putAll(original.reverseIndexInstances);
-    merging = true;
-  }
-
   /** expert: instantiates a new reader */
   Lucene70DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
     String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
     this.maxDoc = state.segmentInfo.maxDoc();
-    merging = false;
-    ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass()));
+    ramBytesUsed = RamUsageEstimator.shallowSizeOfInstance(getClass());
     int version = -1;
-    int numFields = -1;
 
     // read in the entries from the metadata file.
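    // Aside (sketch, not part of the patch): the .dvm stream consumed here is
    // framed the standard codec way -- an index header, one entry per field
    // terminated by a -1 field number (see readFields below), then a checksum
    // footer validated via CodecUtil in the finally block.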
try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) { @@ -106,7 +75,7 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close Lucene70DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); - numFields = readFields(in, state.fieldInfos); + readFields(in, state.fieldInfos); } catch (Throwable exception) { priorE = exception; } finally { @@ -114,7 +83,6 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close } } - this.numFields = numFields; String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); this.data = state.directory.openInput(dataName, state.context); boolean success = false; @@ -142,1277 +110,405 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close } } - private void readSortedField(FieldInfo info, IndexInput meta) throws IOException { - // sorted = binary + numeric - if (meta.readVInt() != info.number) { - throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta); - } - if (meta.readByte() != Lucene70DocValuesFormat.BINARY) { - throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta); - } - BinaryEntry b = readBinaryEntry(info, meta); - binaries.put(info.name, b); - - if (meta.readVInt() != info.number) { - throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta); - } - if (meta.readByte() != Lucene70DocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta); - } - NumericEntry n = readNumericEntry(info, meta); - ords.put(info.name, n); - } - - private void readSortedSetFieldWithAddresses(FieldInfo info, IndexInput meta) throws IOException { - // sortedset = binary + numeric (addresses) + ordIndex - if (meta.readVInt() != info.number) { - throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); - } - if (meta.readByte() != Lucene70DocValuesFormat.BINARY) { - throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); - } - BinaryEntry b = readBinaryEntry(info, meta); - binaries.put(info.name, b); - - if (meta.readVInt() != info.number) { - throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); - } - if (meta.readByte() != Lucene70DocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); - } - NumericEntry n1 = readNumericEntry(info, meta); - ords.put(info.name, n1); - - if (meta.readVInt() != info.number) { - throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); - } - if (meta.readByte() != Lucene70DocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); - } - NumericEntry n2 = readNumericEntry(info, meta); - ordIndexes.put(info.name, n2); - } - - private void readSortedSetFieldWithTable(FieldInfo info, IndexInput meta) throws IOException { - // sortedset table = binary + ordset table + ordset index - if (meta.readVInt() != info.number) { - throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); - } - if (meta.readByte() != Lucene70DocValuesFormat.BINARY) { - throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); - } - - 
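    // Aside (not in the patch): the removed readSortedField/readSortedSetFieldWith*
    // helpers around this point reflect the legacy layout, where one logical field
    // was persisted as several chained entries ("sorted = binary + numeric", and so
    // on); the new readFields below keeps a single self-describing entry per field.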
BinaryEntry b = readBinaryEntry(info, meta); - binaries.put(info.name, b); - - if (meta.readVInt() != info.number) { - throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); - } - if (meta.readByte() != Lucene70DocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); - } - NumericEntry n = readNumericEntry(info, meta); - ords.put(info.name, n); - } - - private int readFields(IndexInput meta, FieldInfos infos) throws IOException { - int numFields = 0; - int fieldNumber = meta.readVInt(); - while (fieldNumber != -1) { - numFields++; + private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOException { + for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { FieldInfo info = infos.fieldInfo(fieldNumber); if (info == null) { - // trickier to validate more: because we use multiple entries for "composite" types like sortedset, etc. throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); } byte type = meta.readByte(); if (type == Lucene70DocValuesFormat.NUMERIC) { - numerics.put(info.name, readNumericEntry(info, meta)); + numerics.put(info.name, readNumeric(meta)); } else if (type == Lucene70DocValuesFormat.BINARY) { - BinaryEntry b = readBinaryEntry(info, meta); - binaries.put(info.name, b); + binaries.put(info.name, readBinary(meta)); } else if (type == Lucene70DocValuesFormat.SORTED) { - readSortedField(info, meta); + sorted.put(info.name, readSorted(meta)); } else if (type == Lucene70DocValuesFormat.SORTED_SET) { - SortedSetEntry ss = readSortedSetEntry(meta); - sortedSets.put(info.name, ss); - if (ss.format == SORTED_WITH_ADDRESSES) { - readSortedSetFieldWithAddresses(info, meta); - } else if (ss.format == SORTED_SET_TABLE) { - readSortedSetFieldWithTable(info, meta); - } else if (ss.format == SORTED_SINGLE_VALUED) { - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); - } - if (meta.readByte() != Lucene70DocValuesFormat.SORTED) { - throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); - } - readSortedField(info, meta); - } else { - throw new AssertionError(); - } + sortedSets.put(info.name, readSortedSet(meta)); } else if (type == Lucene70DocValuesFormat.SORTED_NUMERIC) { - SortedSetEntry ss = readSortedSetEntry(meta); - sortedNumerics.put(info.name, ss); - if (ss.format == SORTED_WITH_ADDRESSES) { - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); - } - if (meta.readByte() != Lucene70DocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); - } - numerics.put(info.name, readNumericEntry(info, meta)); - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); - } - if (meta.readByte() != Lucene70DocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); - } - NumericEntry ordIndex = readNumericEntry(info, meta); - ordIndexes.put(info.name, ordIndex); - } else if (ss.format == SORTED_SET_TABLE) { - if (meta.readVInt() != info.number) { - throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); - } - if 
(meta.readByte() != Lucene70DocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); - } - NumericEntry n = readNumericEntry(info, meta); - ords.put(info.name, n); - } else if (ss.format == SORTED_SINGLE_VALUED) { - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); - } - if (meta.readByte() != Lucene70DocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); - } - numerics.put(info.name, readNumericEntry(info, meta)); - } else { - throw new AssertionError(); - } + sortedNumerics.put(info.name, readSortedNumeric(meta)); } else { throw new CorruptIndexException("invalid type: " + type, meta); } - fieldNumber = meta.readVInt(); } - return numFields; } - private NumericEntry readNumericEntry(FieldInfo info, IndexInput meta) throws IOException { + private NumericEntry readNumeric(ChecksumIndexInput meta) throws IOException { NumericEntry entry = new NumericEntry(); - entry.format = meta.readVInt(); - entry.missingOffset = meta.readLong(); - if (entry.format == SPARSE_COMPRESSED) { - // sparse bits need a bit more metadata - entry.numDocsWithValue = meta.readVLong(); - final int blockShift = meta.readVInt(); - entry.monotonicMeta = DirectMonotonicReader.loadMeta(meta, entry.numDocsWithValue, blockShift); - ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed()); - directAddressesMeta.put(info.name, entry.monotonicMeta); - } - entry.offset = meta.readLong(); - entry.count = meta.readVLong(); - switch(entry.format) { - case CONST_COMPRESSED: - entry.minValue = meta.readLong(); - if (entry.count > Integer.MAX_VALUE) { - // currently just a limitation e.g. of bits interface and so on. 
- throw new CorruptIndexException("illegal CONST_COMPRESSED count: " + entry.count, meta); - } - break; - case GCD_COMPRESSED: - entry.minValue = meta.readLong(); - entry.gcd = meta.readLong(); - entry.bitsPerValue = meta.readVInt(); - break; - case TABLE_COMPRESSED: - final int uniqueValues = meta.readVInt(); - if (uniqueValues > 256) { - throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, got=" + uniqueValues, meta); - } - entry.table = new long[uniqueValues]; - for (int i = 0; i < uniqueValues; ++i) { - entry.table[i] = meta.readLong(); - } - ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(entry.table)); - entry.bitsPerValue = meta.readVInt(); - break; - case DELTA_COMPRESSED: - entry.minValue = meta.readLong(); - entry.bitsPerValue = meta.readVInt(); - break; - case MONOTONIC_COMPRESSED: - final int blockShift = meta.readVInt(); - entry.monotonicMeta = DirectMonotonicReader.loadMeta(meta, maxDoc + 1, blockShift); - ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed()); - directAddressesMeta.put(info.name, entry.monotonicMeta); - break; - case SPARSE_COMPRESSED: - final byte numberType = meta.readByte(); - switch (numberType) { - case 0: - entry.numberType = NumberType.VALUE; - break; - case 1: - entry.numberType = NumberType.ORDINAL; - break; - default: - throw new CorruptIndexException("Number type can only be 0 or 1, got=" + numberType, meta); - } - - // now read the numeric entry for non-missing values - final int fieldNumber = meta.readVInt(); - if (fieldNumber != info.number) { - throw new CorruptIndexException("Field numbers mistmatch: " + fieldNumber + " != " + info.number, meta); - } - final int dvFormat = meta.readByte(); - if (dvFormat != NUMERIC) { - throw new CorruptIndexException("Formats mistmatch: " + dvFormat + " != " + NUMERIC, meta); - } - entry.nonMissingValues = readNumericEntry(info, meta); - break; - default: - throw new CorruptIndexException("Unknown format: " + entry.format + ", input=", meta); - } - entry.endOffset = meta.readLong(); + readNumeric(meta, entry); return entry; } - private BinaryEntry readBinaryEntry(FieldInfo info, IndexInput meta) throws IOException { - BinaryEntry entry = new BinaryEntry(); - entry.format = meta.readVInt(); - entry.missingOffset = meta.readLong(); - entry.minLength = meta.readVInt(); - entry.maxLength = meta.readVInt(); - entry.count = meta.readVLong(); - entry.offset = meta.readLong(); - switch(entry.format) { - case BINARY_FIXED_UNCOMPRESSED: - break; - case BINARY_PREFIX_COMPRESSED: - entry.addressesOffset = meta.readLong(); - entry.packedIntsVersion = meta.readVInt(); - entry.blockSize = meta.readVInt(); - entry.reverseIndexOffset = meta.readLong(); - break; - case BINARY_VARIABLE_UNCOMPRESSED: - entry.addressesOffset = meta.readLong(); - final int blockShift = meta.readVInt(); - entry.addressesMeta = DirectMonotonicReader.loadMeta(meta, entry.count + 1, blockShift); - ramBytesUsed.addAndGet(entry.addressesMeta.ramBytesUsed()); - directAddressesMeta.put(info.name, entry.addressesMeta); - entry.addressesEndOffset = meta.readLong(); - break; - default: - throw new CorruptIndexException("Unknown format: " + entry.format, meta); + private void readNumeric(ChecksumIndexInput meta, NumericEntry entry) throws IOException { + entry.docsWithFieldOffset = meta.readLong(); + entry.docsWithFieldLength = meta.readLong(); + entry.numValues = meta.readLong(); + int tableSize = meta.readInt(); + if (tableSize < -1 || tableSize > 256) { + throw new CorruptIndexException("invalid table size: " + 
tableSize, meta); } - return entry; - } - - SortedSetEntry readSortedSetEntry(IndexInput meta) throws IOException { - SortedSetEntry entry = new SortedSetEntry(); - entry.format = meta.readVInt(); - if (entry.format == SORTED_SET_TABLE) { - final int totalTableLength = meta.readInt(); - if (totalTableLength > 256) { - throw new CorruptIndexException("SORTED_SET_TABLE cannot have more than 256 values in its dictionary, got=" + totalTableLength, meta); - } - entry.table = new long[totalTableLength]; - for (int i = 0; i < totalTableLength; ++i) { + if (tableSize >= 0) { + entry.table = new long[tableSize]; + ramBytesUsed += RamUsageEstimator.sizeOf(entry.table); + for (int i = 0; i < tableSize; ++i) { entry.table[i] = meta.readLong(); } - ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(entry.table)); - final int tableSize = meta.readInt(); - if (tableSize > totalTableLength + 1) { // +1 because of the empty set - throw new CorruptIndexException("SORTED_SET_TABLE cannot have more set ids than ords in its dictionary, got " + totalTableLength + " ords and " + tableSize + " sets", meta); - } - entry.tableOffsets = new int[tableSize + 1]; - for (int i = 1; i < entry.tableOffsets.length; ++i) { - entry.tableOffsets[i] = entry.tableOffsets[i - 1] + meta.readInt(); - } - ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(entry.tableOffsets)); - } else if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) { - throw new CorruptIndexException("Unknown format: " + entry.format, meta); + } + entry.bitsPerValue = meta.readByte(); + entry.minValue = meta.readLong(); + entry.gcd = meta.readLong(); + entry.valuesOffset = meta.readLong(); + entry.valuesLength = meta.readLong(); + } + + private BinaryEntry readBinary(ChecksumIndexInput meta) throws IOException { + BinaryEntry entry = new BinaryEntry(); + entry.dataOffset = meta.readLong(); + entry.dataLength = meta.readLong(); + entry.docsWithFieldOffset = meta.readLong(); + entry.docsWithFieldLength = meta.readLong(); + entry.numDocsWithField = meta.readInt(); + entry.minLength = meta.readInt(); + entry.maxLength = meta.readInt(); + if (entry.minLength < entry.maxLength) { + entry.addressesOffset = meta.readLong(); + final int blockShift = meta.readVInt(); + entry.addressesMeta = DirectMonotonicReader.loadMeta(meta, entry.numDocsWithField + 1L, blockShift); + ramBytesUsed += entry.addressesMeta.ramBytesUsed(); + entry.addressesLength = meta.readLong(); } return entry; } + private SortedEntry readSorted(ChecksumIndexInput meta) throws IOException { + SortedEntry entry = new SortedEntry(); + entry.docsWithFieldOffset = meta.readLong(); + entry.docsWithFieldLength = meta.readLong(); + entry.numDocsWithField = meta.readInt(); + entry.bitsPerValue = meta.readByte(); + entry.ordsOffset = meta.readLong(); + entry.ordsLength = meta.readLong(); + readTermDict(meta, entry); + return entry; + } + + private SortedSetEntry readSortedSet(ChecksumIndexInput meta) throws IOException { + SortedSetEntry entry = new SortedSetEntry(); + byte multiValued = meta.readByte(); + switch (multiValued) { + case 0: // singlevalued + entry.singleValueEntry = readSorted(meta); + return entry; + case 1: // multivalued + break; + default: + throw new CorruptIndexException("Invalid multiValued flag: " + multiValued, meta); + } + entry.docsWithFieldOffset = meta.readLong(); + entry.docsWithFieldLength = meta.readLong(); + entry.bitsPerValue = meta.readByte(); + entry.ordsOffset = meta.readLong(); + entry.ordsLength = meta.readLong(); + entry.numDocsWithField = 
meta.readInt(); + entry.addressesOffset = meta.readLong(); + final int blockShift = meta.readVInt(); + entry.addressesMeta = DirectMonotonicReader.loadMeta(meta, entry.numDocsWithField + 1, blockShift); + ramBytesUsed += entry.addressesMeta.ramBytesUsed(); + entry.addressesLength = meta.readLong(); + readTermDict(meta, entry); + return entry; + } + + private static void readTermDict(ChecksumIndexInput meta, TermsDictEntry entry) throws IOException { + entry.termsDictSize = meta.readVLong(); + entry.termsDictBlockShift = meta.readInt(); + final int blockShift = meta.readInt(); + final long addressesSize = (entry.termsDictSize + (1L << entry.termsDictBlockShift) - 1) >>> entry.termsDictBlockShift; + entry.termsAddressesMeta = DirectMonotonicReader.loadMeta(meta, addressesSize, blockShift); + entry.maxTermLength = meta.readInt(); + entry.termsDataOffset = meta.readLong(); + entry.termsDataLength = meta.readLong(); + entry.termsAddressesOffset = meta.readLong(); + entry.termsAddressesLength = meta.readLong(); + entry.termsDictIndexShift = meta.readInt(); + final long indexSize = (entry.termsDictSize + (1L << entry.termsDictIndexShift) - 1) >>> entry.termsDictIndexShift; + entry.termsIndexAddressesMeta = DirectMonotonicReader.loadMeta(meta, 1 + indexSize, blockShift); + entry.termsIndexOffset = meta.readLong(); + entry.termsIndexLength = meta.readLong(); + entry.termsIndexAddressesOffset = meta.readLong(); + entry.termsIndexAddressesLength = meta.readLong(); + } + + private SortedNumericEntry readSortedNumeric(ChecksumIndexInput meta) throws IOException { + SortedNumericEntry entry = new SortedNumericEntry(); + readNumeric(meta, entry); + entry.numDocsWithField = meta.readInt(); + if (entry.numDocsWithField != entry.numValues) { + entry.addressesOffset = meta.readLong(); + final int blockShift = meta.readVInt(); + entry.addressesMeta = DirectMonotonicReader.loadMeta(meta, entry.numDocsWithField + 1, blockShift); + ramBytesUsed += entry.addressesMeta.ramBytesUsed(); + entry.addressesLength = meta.readLong(); + } + return entry; + } + + @Override + public void close() throws IOException { + data.close(); + } + + private static class NumericEntry { + long[] table; + byte bitsPerValue; + long docsWithFieldOffset; + long docsWithFieldLength; + long numValues; + long minValue; + long gcd; + long valuesOffset; + long valuesLength; + } + + private static class BinaryEntry { + long dataOffset; + long dataLength; + long docsWithFieldOffset; + long docsWithFieldLength; + int numDocsWithField; + int minLength; + int maxLength; + long addressesOffset; + long addressesLength; + DirectMonotonicReader.Meta addressesMeta; + } + + private static class TermsDictEntry { + long termsDictSize; + int termsDictBlockShift; + DirectMonotonicReader.Meta termsAddressesMeta; + int maxTermLength; + long termsDataOffset; + long termsDataLength; + long termsAddressesOffset; + long termsAddressesLength; + int termsDictIndexShift; + DirectMonotonicReader.Meta termsIndexAddressesMeta; + long termsIndexOffset; + long termsIndexLength; + long termsIndexAddressesOffset; + long termsIndexAddressesLength; + } + + private static class SortedEntry extends TermsDictEntry { + long docsWithFieldOffset; + long docsWithFieldLength; + int numDocsWithField; + byte bitsPerValue; + long ordsOffset; + long ordsLength; + } + + private static class SortedSetEntry extends TermsDictEntry { + SortedEntry singleValueEntry; + long docsWithFieldOffset; + long docsWithFieldLength; + int numDocsWithField; + byte bitsPerValue; + long ordsOffset; + long 
ordsLength; + DirectMonotonicReader.Meta addressesMeta; + long addressesOffset; + long addressesLength; + } + + private static class SortedNumericEntry extends NumericEntry { + int numDocsWithField; + DirectMonotonicReader.Meta addressesMeta; + long addressesOffset; + long addressesLength; + } + + @Override + public long ramBytesUsed() { + return ramBytesUsed; + } + @Override public NumericDocValues getNumeric(FieldInfo field) throws IOException { NumericEntry entry = numerics.get(field.name); - DocIdSetIterator docsWithField; + return getNumeric(entry); + } - if (entry.format == SPARSE_COMPRESSED) { - return getSparseNumericDocValues(entry); + private NumericDocValues getNumeric(NumericEntry entry) throws IOException { + if (entry.docsWithFieldOffset == -2) { + // empty + return DocValues.emptyNumeric(); + } else if (entry.docsWithFieldOffset == -1) { + // dense + final LongValues normValues = getNumericValues(entry); + return new NumericDocValues() { + + int doc = -1; + + @Override + public long longValue() throws IOException { + return normValues.get(doc); + } + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) { + return doc = NO_MORE_DOCS; + } + return doc = target; + } + + @Override + public long cost() { + return maxDoc; + } + + }; } else { - if (entry.missingOffset == ALL_MISSING) { - return DocValues.emptyNumeric(); - } else if (entry.missingOffset == ALL_LIVE) { - LongValues values = getNumeric(entry); - return new NumericDocValues() { - private int docID = -1; + // sparse + final LongValues values = getNumericValues(entry); + final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numValues); + return new NumericDocValues() { - @Override - public int docID() { - return docID; - } - - @Override - public int nextDoc() { - docID++; - if (docID == maxDoc) { - docID = NO_MORE_DOCS; - } - return docID; - } - - @Override - public int advance(int target) { - if (target >= maxDoc) { - docID = NO_MORE_DOCS; - } else { - docID = target; - } - return docID; - } - - @Override - public long cost() { - return entry.count; - } - - @Override - public long longValue() { - return values.get(docID); - } - }; - } else { - docsWithField = getDocsWithField(entry.missingOffset, maxDoc); - LongValues values = getNumeric(entry); - return new NumericDocValues() { - - @Override - public long longValue() { - return values.get(docsWithField.docID()); - } - - @Override - public int docID() { - return docsWithField.docID(); - } - - @Override - public int nextDoc() throws IOException { - return docsWithField.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - return docsWithField.advance(target); - } - - @Override - public long cost() { - return docsWithField.cost(); - } - - }; - } - } - } - - @Override - public long ramBytesUsed() { - return ramBytesUsed.get(); - } - - @Override - public synchronized Collection getChildResources() { - List resources = new ArrayList<>(); - resources.addAll(Accountables.namedAccountables("addresses field", addressInstances)); - resources.addAll(Accountables.namedAccountables("reverse index field", reverseIndexInstances)); - resources.addAll(Accountables.namedAccountables("direct addresses meta field", directAddressesMeta)); - return Collections.unmodifiableList(resources); - } - - @Override - public void 
checkIntegrity() throws IOException { - CodecUtil.checksumEntireFile(data); - } - - @Override - public String toString() { - return getClass().getSimpleName() + "(fields=" + numFields + ")"; - } - - LongValues getNumeric(NumericEntry entry) throws IOException { - switch (entry.format) { - case CONST_COMPRESSED: { - final long constant = entry.minValue; - final Bits live = getLiveBits(entry.missingOffset, (int)entry.count); - return new LongValues() { - @Override - public long get(long index) { - return live.get((int)index) ? constant : 0; - } - }; - } - case DELTA_COMPRESSED: { - RandomAccessInput slice = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset); - final long delta = entry.minValue; - final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue, 0); - return new LongValues() { - @Override - public long get(long id) { - return delta + values.get(id); - } - }; - } - case GCD_COMPRESSED: { - RandomAccessInput slice = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset); - final long min = entry.minValue; - final long mult = entry.gcd; - final LongValues quotientReader = DirectReader.getInstance(slice, entry.bitsPerValue, 0); - return new LongValues() { - @Override - public long get(long id) { - return min + mult * quotientReader.get(id); - } - }; - } - case TABLE_COMPRESSED: { - RandomAccessInput slice = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset); - final long table[] = entry.table; - final LongValues ords = DirectReader.getInstance(slice, entry.bitsPerValue, 0); - return new LongValues() { - @Override - public long get(long id) { - return table[(int) ords.get(id)]; - } - }; - } - case SPARSE_COMPRESSED: - final SparseNumericDocValues values = getSparseNumericDocValues(entry); - final long missingValue; - switch (entry.numberType) { - case ORDINAL: - missingValue = -1L; - break; - case VALUE: - missingValue = 0L; - break; - default: - throw new AssertionError(); + @Override + public int advance(int target) throws IOException { + return disi.advance(target); } - return new SparseNumericDocValuesRandomAccessWrapper(values, missingValue); - default: - throw new AssertionError(); + + @Override + public int nextDoc() throws IOException { + return disi.nextDoc(); + } + + @Override + public int docID() { + return disi.docID(); + } + + @Override + public long cost() { + return disi.cost(); + } + + @Override + public long longValue() throws IOException { + return values.get(disi.index()); + } + }; } } - static final class SparseNumericDocValues extends NumericDocValues { - - final int docIDsLength; - final LongValues docIds, values; - - int index, doc; - - SparseNumericDocValues(int docIDsLength, LongValues docIDs, LongValues values) { - this.docIDsLength = docIDsLength; - this.docIds = docIDs; - this.values = values; - reset(); - } - - void reset() { - index = -1; - doc = -1; - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - if (index >= docIDsLength - 1) { - index = docIDsLength; - return doc = NO_MORE_DOCS; + private LongValues getNumericValues(NumericEntry entry) throws IOException { + if (entry.bitsPerValue == 0) { + return new LongValues() { + @Override + public long get(long index) { + return entry.minValue; + } + }; + } else { + final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength); + LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue); + if (entry.gcd != 1) { + 
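        // Aside (not in the patch): decode order here is gcd, then delta, then
        // table, i.e. value = table[minValue + gcd * packed] when all three
        // apply; the writer presumably only emits compatible combinations
        // (a table-encoded field carrying gcd == 1 and minValue == 0).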
values = applyGcd(values, entry.gcd); } - return doc = (int) docIds.get(++index); - } - - @Override - public int advance(int target) throws IOException { - long loIndex = index; - long step = 1; - long hiIndex; - int hiDoc; - - // gallop forward by exponentially growing the interval - // in order to find an interval so that the target doc - // is in ]lo, hi]. Compared to a regular binary search, - // this optimizes the case that the caller performs many - // advance calls by small deltas - do { - hiIndex = index + step; - if (hiIndex >= docIDsLength) { - hiIndex = docIDsLength; - hiDoc = NO_MORE_DOCS; - break; - } - hiDoc = (int) docIds.get(hiIndex); - if (hiDoc >= target) { - break; - } - step <<= 1; - } while (true); - - // now binary search - while (loIndex + 1 < hiIndex) { - final long midIndex = (loIndex + 1 + hiIndex) >>> 1; - final int midDoc = (int) docIds.get(midIndex); - if (midDoc >= target) { - hiIndex = midIndex; - hiDoc = midDoc; - } else { - loIndex = midIndex; - } + if (entry.minValue != 0) { + values = applyDelta(values, entry.minValue); } - - index = (int) hiIndex; - return doc = hiDoc; - } - - @Override - public long longValue() { - assert index >= 0; - assert index < docIDsLength; - return values.get(index); - } - - @Override - public long cost() { - return docIDsLength; + if (entry.table != null) { + values = applyTable(values, entry.table); + } + return values; } } - static class SparseNumericDocValuesRandomAccessWrapper extends LongValues { - - final SparseNumericDocValues values; - final long missingValue; - - SparseNumericDocValuesRandomAccessWrapper(SparseNumericDocValues values, long missingValue) { - this.values = values; - this.missingValue = missingValue; - } - - @Override - public long get(long longIndex) { - final int index = Math.toIntExact(longIndex); - int doc = values.docID(); - if (doc >= index) { - values.reset(); + private LongValues applyDelta(LongValues values, long delta) { + return new LongValues() { + @Override + public long get(long index) { + return delta + values.get(index); } - assert values.docID() < index; - try { - doc = values.advance(index); - } catch (IOException e) { - throw new RuntimeException(e); - } - if (doc == index) { - return values.longValue(); - } else { - return missingValue; - } - } - + }; } - LegacyBinaryDocValues getLegacyBinary(FieldInfo field) throws IOException { - BinaryEntry bytes = binaries.get(field.name); - switch(bytes.format) { - case BINARY_FIXED_UNCOMPRESSED: - return getFixedBinary(field, bytes); - case BINARY_VARIABLE_UNCOMPRESSED: - return getVariableBinary(field, bytes); - case BINARY_PREFIX_COMPRESSED: - return getCompressedBinary(field, bytes); - default: - throw new AssertionError(); - } + private LongValues applyGcd(LongValues values, long gcd) { + return new LongValues() { + @Override + public long get(long index) { + return values.get(index) * gcd; + } + }; + } + + private LongValues applyTable(LongValues values, long[] table) { + return new LongValues() { + @Override + public long get(long index) { + return table[(int) values.get(index)]; + } + }; } @Override public BinaryDocValues getBinary(FieldInfo field) throws IOException { - BinaryEntry be = binaries.get(field.name); - DocIdSetIterator docsWithField = getDocsWithField(be.missingOffset, maxDoc); - LegacyBinaryDocValues values = getLegacyBinary(field); - return new BinaryDocValues() { - - @Override - public int nextDoc() throws IOException { - return docsWithField.nextDoc(); - } - - @Override - public int docID() { - return 
docsWithField.docID(); - } - - @Override - public long cost() { - return docsWithField.cost(); - } - - @Override - public int advance(int target) throws IOException { - return docsWithField.advance(target); - } - - @Override - public BytesRef binaryValue() { - return values.get(docsWithField.docID()); - } - }; - } - - private LegacyBinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { - final IndexInput data = this.data.slice("fixed-binary", bytes.offset, bytes.count * bytes.maxLength); - - final BytesRef term = new BytesRef(bytes.maxLength); - final byte[] buffer = term.bytes; - final int length = term.length = bytes.maxLength; - - return new LongBinaryDocValues() { - @Override - public BytesRef get(long id) { - try { - data.seek(id * length); - data.readBytes(buffer, 0, buffer.length); - return term; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - }; - } - - private LegacyBinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { - final RandomAccessInput addressesData = this.data.randomAccessSlice(bytes.addressesOffset, bytes.addressesEndOffset - bytes.addressesOffset); - final LongValues addresses = DirectMonotonicReader.getInstance(bytes.addressesMeta, addressesData); - - final IndexInput data = this.data.slice("var-binary", bytes.offset, bytes.addressesOffset - bytes.offset); - final BytesRef term = new BytesRef(Math.max(0, bytes.maxLength)); - final byte buffer[] = term.bytes; - - return new LongBinaryDocValues() { - @Override - public BytesRef get(long id) { - long startAddress = addresses.get(id); - long endAddress = addresses.get(id+1); - int length = (int) (endAddress - startAddress); - try { - data.seek(startAddress); - data.readBytes(buffer, 0, length); - term.length = length; - return term; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - }; - } - - /** returns an address instance for prefix-compressed binary values. */ - private synchronized MonotonicBlockPackedReader getIntervalInstance(FieldInfo field, BinaryEntry bytes) throws IOException { - MonotonicBlockPackedReader addresses = addressInstances.get(field.name); - if (addresses == null) { - data.seek(bytes.addressesOffset); - final long size = (bytes.count + INTERVAL_MASK) >>> INTERVAL_SHIFT; - addresses = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, size, false); - if (!merging) { - addressInstances.put(field.name, addresses); - ramBytesUsed.addAndGet(addresses.ramBytesUsed() + Integer.BYTES); - } + BinaryEntry entry = binaries.get(field.name); + if (entry.docsWithFieldOffset == -2) { + return DocValues.emptyBinary(); } - return addresses; - } - - /** returns a reverse lookup instance for prefix-compressed binary values. 
*/ - private synchronized ReverseTermsIndex getReverseIndexInstance(FieldInfo field, BinaryEntry bytes) throws IOException { - ReverseTermsIndex index = reverseIndexInstances.get(field.name); - if (index == null) { - index = new ReverseTermsIndex(); - data.seek(bytes.reverseIndexOffset); - long size = (bytes.count + REVERSE_INTERVAL_MASK) >>> REVERSE_INTERVAL_SHIFT; - index.termAddresses = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, size, false); - long dataSize = data.readVLong(); - PagedBytes pagedBytes = new PagedBytes(15); - pagedBytes.copy(data, dataSize); - index.terms = pagedBytes.freeze(true); - if (!merging) { - reverseIndexInstances.put(field.name, index); - ramBytesUsed.addAndGet(index.ramBytesUsed()); - } - } - return index; - } - - private LegacyBinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { - final MonotonicBlockPackedReader addresses = getIntervalInstance(field, bytes); - final ReverseTermsIndex index = getReverseIndexInstance(field, bytes); - assert addresses.size() > 0; // we don't have to handle empty case - IndexInput slice = data.slice("terms", bytes.offset, bytes.addressesOffset - bytes.offset); - return new CompressedBinaryDocValues(bytes, addresses, index, slice); - } - - @Override - public SortedDocValues getSorted(FieldInfo field) throws IOException { - final int valueCount = (int) binaries.get(field.name).count; - final LegacyBinaryDocValues binary = getLegacyBinary(field); - NumericEntry entry = ords.get(field.name); - final LongValues ordinals = getNumeric(entry); - if (entry.format == SPARSE_COMPRESSED) { - final SparseNumericDocValues sparseValues = ((SparseNumericDocValuesRandomAccessWrapper) ordinals).values; - return new SortedDocValues() { + IndexInput bytesSlice = data.slice("fixed-binary", entry.dataOffset, entry.dataLength); + BytesRefs bytesRefs; + if (entry.minLength == entry.maxLength) { + bytesRefs = new BytesRefs() { + BytesRef bytes = new BytesRef(new byte[entry.maxLength], 0, entry.maxLength); @Override - public int ordValue() { - return (int) sparseValues.longValue(); - } - - @Override - public BytesRef lookupOrd(int ord) { - return binary.get(ord); - } - - @Override - public int getValueCount() { - return valueCount; - } - - @Override - public int docID() { - return sparseValues.docID(); - } - - @Override - public int nextDoc() throws IOException { - return sparseValues.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - return sparseValues.advance(target); - } - - @Override - public long cost() { - return sparseValues.cost(); - } - - }; - } - return new SortedDocValues() { - private int docID = -1; - private int ord; - - @Override - public int docID() { - return docID; - } - - @Override - public int nextDoc() throws IOException { - assert docID != NO_MORE_DOCS; - while (true) { - docID++; - if (docID == maxDoc) { - docID = NO_MORE_DOCS; - break; - } - ord = (int) ordinals.get(docID); - if (ord != -1) { - break; - } - } - return docID; - } - - @Override - public int advance(int target) throws IOException { - if (target >= maxDoc) { - docID = NO_MORE_DOCS; - return docID; - } else { - docID = target-1; - return nextDoc(); - } - } - - @Override - public int ordValue() { - return ord; - } - - @Override - public long cost() { - // TODO - return 0; - } - - @Override - public BytesRef lookupOrd(int ord) { - return binary.get(ord); - } - - @Override - public int getValueCount() { - return valueCount; - } - - @Override - public int 
lookupTerm(BytesRef key) throws IOException { - if (binary instanceof CompressedBinaryDocValues) { - return (int) ((CompressedBinaryDocValues)binary).lookupTerm(key); - } else { - return super.lookupTerm(key); - } - } - - @Override - public TermsEnum termsEnum() { - if (binary instanceof CompressedBinaryDocValues) { - return ((CompressedBinaryDocValues)binary).getTermsEnum(); - } else { - return super.termsEnum(); - } - } - }; - } - - /** returns an address instance for sortedset ordinal lists */ - private LongValues getOrdIndexInstance(FieldInfo field, NumericEntry entry) throws IOException { - RandomAccessInput data = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset); - return DirectMonotonicReader.getInstance(entry.monotonicMeta, data); - } - - @Override - public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { - SortedSetEntry ss = sortedNumerics.get(field.name); - if (ss.format == SORTED_SINGLE_VALUED) { - NumericEntry numericEntry = numerics.get(field.name); - final LongValues values = getNumeric(numericEntry); - if (numericEntry.format == SPARSE_COMPRESSED) { - SparseNumericDocValues sparseValues = ((SparseNumericDocValuesRandomAccessWrapper) values).values; - return new SortedNumericDocValues() { - - @Override - public long nextValue() throws IOException { - return sparseValues.longValue(); - } - - @Override - public int docValueCount() { - return 1; - } - - @Override - public int docID() { - return sparseValues.docID(); - } - - @Override - public int nextDoc() throws IOException { - return sparseValues.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - return sparseValues.advance(target); - } - - @Override - public long cost() { - return sparseValues.cost(); - } - - }; - } - final DocIdSetIterator docsWithField = getDocsWithField(numericEntry.missingOffset, maxDoc); - - return new SortedNumericDocValues() { - - @Override - public int docID() { - return docsWithField.docID(); - } - - @Override - public int nextDoc() throws IOException { - return docsWithField.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - return docsWithField.advance(target); - } - - @Override - public long cost() { - return docsWithField.cost(); - } - - @Override - public int docValueCount() { - return 1; - } - - @Override - public long nextValue() { - return values.get(docsWithField.docID()); - } - }; - } else if (ss.format == SORTED_WITH_ADDRESSES) { - NumericEntry numericEntry = numerics.get(field.name); - final LongValues values = getNumeric(numericEntry); - final LongValues ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name)); - - return new SortedNumericDocValues() { - long startOffset; - long endOffset; - int docID = -1; - long upto; - - @Override - public int docID() { - return docID; - } - - @Override - public int nextDoc() { - while (true) { - docID++; - if (docID == maxDoc) { - docID = NO_MORE_DOCS; - return docID; - } - startOffset = ordIndex.get(docID); - endOffset = ordIndex.get(docID+1L); - if (endOffset > startOffset) { - break; - } - } - upto = startOffset; - return docID; - } - - @Override - public int advance(int target) { - if (target >= maxDoc) { - docID = NO_MORE_DOCS; - return docID; - } else { - docID = target-1; - return nextDoc(); - } - } - - @Override - public long cost() { - // TODO - return 0; - } - - @Override - public int docValueCount() { - return (int) (endOffset - startOffset); - } - - @Override - public long nextValue() { - return 
values.get(upto++); - } - }; - } else if (ss.format == SORTED_SET_TABLE) { - NumericEntry entry = ords.get(field.name); - final LongValues ordinals = getNumeric(entry); - - final long[] table = ss.table; - final int[] offsets = ss.tableOffsets; - return new SortedNumericDocValues() { - int startOffset; - int endOffset; - int docID = -1; - int upto; - - @Override - public int docID() { - return docID; - } - - @Override - public int nextDoc() { - while (true) { - docID++; - if (docID == maxDoc) { - docID = NO_MORE_DOCS; - return docID; - } - int ord = (int) ordinals.get(docID); - startOffset = offsets[ord]; - endOffset = offsets[ord+1]; - if (endOffset > startOffset) { - break; - } - } - upto = startOffset; - return docID; - } - - @Override - public int advance(int target) { - if (target >= maxDoc) { - docID = NO_MORE_DOCS; - return docID; - } else { - docID = target-1; - return nextDoc(); - } - } - - @Override - public long cost() { - // TODO - return 0; - } - - @Override - public int docValueCount() { - return endOffset - startOffset; - } - - @Override - public long nextValue() { - return table[upto++]; + public BytesRef get(int index) throws IOException { + bytesSlice.seek((long) index * bytes.length); + bytesSlice.readBytes(bytes.bytes, 0, bytes.length); + return bytes; } }; } else { - throw new AssertionError(); - } - } - - @Override - public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - SortedSetEntry ss = sortedSets.get(field.name); - switch (ss.format) { - case SORTED_SINGLE_VALUED: - return DocValues.singleton(getSorted(field)); - case SORTED_WITH_ADDRESSES: - return getSortedSetWithAddresses(field); - case SORTED_SET_TABLE: - return getSortedSetTable(field, ss); - default: - throw new AssertionError(); - } - } - - private SortedSetDocValues getSortedSetWithAddresses(FieldInfo field) throws IOException { - final long valueCount = binaries.get(field.name).count; - // we keep the byte[]s and list of ords on disk, these could be large - final LongBinaryDocValues binary = (LongBinaryDocValues) getLegacyBinary(field); - final LongValues ordinals = getNumeric(ords.get(field.name)); - // but the addresses to the ord stream are in RAM - final LongValues ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name)); - - return new LegacySortedSetDocValuesWrapper(new LegacySortedSetDocValues() { - long startOffset; - long offset; - long endOffset; - - @Override - public long nextOrd() { - if (offset == endOffset) { - return NO_MORE_ORDS; - } else { - long ord = ordinals.get(offset); - offset++; - return ord; - } - } - - @Override - public void setDocument(int docID) { - startOffset = offset = ordIndex.get(docID); - endOffset = ordIndex.get(docID+1L); - } - - @Override - public BytesRef lookupOrd(long ord) { - return binary.get(ord); - } - - @Override - public long getValueCount() { - return valueCount; - } - - @Override - public long lookupTerm(BytesRef key) { - if (binary instanceof CompressedBinaryDocValues) { - return ((CompressedBinaryDocValues)binary).lookupTerm(key); - } else { - return super.lookupTerm(key); - } - } - - @Override - public TermsEnum termsEnum() { - if (binary instanceof CompressedBinaryDocValues) { - return ((CompressedBinaryDocValues)binary).getTermsEnum(); - } else { - return super.termsEnum(); - } - } - }, maxDoc); - } - - private SortedSetDocValues getSortedSetTable(FieldInfo field, SortedSetEntry ss) throws IOException { - final long valueCount = binaries.get(field.name).count; - final LongBinaryDocValues binary = (LongBinaryDocValues) 
getLegacyBinary(field); - final NumericEntry ordinalsEntry = ords.get(field.name); - final LongValues ordinals = getNumeric(ordinalsEntry); - - final long[] table = ss.table; - final int[] offsets = ss.tableOffsets; - - return new LegacySortedSetDocValuesWrapper(new LegacySortedSetDocValues() { - - int offset, startOffset, endOffset; - - @Override - public void setDocument(int docID) { - final int ord = (int) ordinals.get(docID); - offset = startOffset = offsets[ord]; - endOffset = offsets[ord + 1]; - } - - @Override - public long nextOrd() { - if (offset == endOffset) { - return NO_MORE_ORDS; - } else { - return table[offset++]; - } - } - - @Override - public BytesRef lookupOrd(long ord) { - return binary.get(ord); - } - - @Override - public long getValueCount() { - return valueCount; - } - - @Override - public long lookupTerm(BytesRef key) { - if (binary instanceof CompressedBinaryDocValues) { - return ((CompressedBinaryDocValues) binary).lookupTerm(key); - } else { - return super.lookupTerm(key); - } - } - - @Override - public TermsEnum termsEnum() { - if (binary instanceof CompressedBinaryDocValues) { - return ((CompressedBinaryDocValues) binary).getTermsEnum(); - } else { - return super.termsEnum(); - } - } - }, maxDoc); - } - - private Bits getLiveBits(final long offset, final int count) throws IOException { - if (offset == ALL_MISSING) { - return new Bits.MatchNoBits(count); - } else if (offset == ALL_LIVE) { - return new Bits.MatchAllBits(count); - } else { - int length = (int) ((count + 63L) >>> 6); - final RandomAccessInput in = data.randomAccessSlice(offset, length << 3); - return new Bits() { - - int wordIndex = -1; - long word = 0; - + final RandomAccessInput addressesData = this.data.randomAccessSlice(entry.addressesOffset, entry.addressesLength); + final LongValues addresses = DirectMonotonicReader.getInstance(entry.addressesMeta, addressesData); + bytesRefs = new BytesRefs() { + BytesRef bytes = new BytesRef(entry.maxLength); @Override - public boolean get(int index) { - try { - int i = index >>> 6; - if (wordIndex != i) { - wordIndex = i; - word = in.readLong(i << 3); - } - return (word & (1L << index)) != 0; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - @Override - public int length() { - return count; + BytesRef get(int index) throws IOException { + long startOffset = addresses.get(index); + bytes.length = (int) (addresses.get(index + 1L) - startOffset); + bytesSlice.seek(startOffset); + bytesSlice.readBytes(bytes.bytes, 0, bytes.length); + return bytes; } }; } - } - private DocIdSetIterator getDocsWithField(final long offset, final int count) throws IOException { - if (offset == ALL_MISSING) { - return DocIdSetIterator.empty(); - } else if (offset == ALL_LIVE) { - return DocIdSetIterator.all(count); - } else { - int length = (int) ((count + 63L) >>> 6); - final RandomAccessInput in = data.randomAccessSlice(offset, length << 3); - return new DocIdSetIterator() { + if (entry.docsWithFieldOffset == -1) { + // dense + return new BinaryDocValues() { int doc = -1; - int wordIndex = -1; - long word = 0; - - private int nextSetBit(int index) throws IOException { - int i = index >>> 6; - - if (wordIndex != i) { - wordIndex = i; - word = in.readLong(i << 3); - } - long w = word >>> index; - if (w != 0) { - return index + Long.numberOfTrailingZeros(w); - } - - while (++i < length) { - wordIndex = i; - word = in.readLong(i << 3); - if (word != 0) { - return (i << 6) + Long.numberOfTrailingZeros(word); - } - } - - return DocIdSetIterator.NO_MORE_DOCS; - } 
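      // Aside (not in the patch): nextSetBit above is the classic packed-bitset
      // scan -- shift the current 64-bit word right by the bit offset, take the
      // lowest set bit if any remains, otherwise walk whole words until a
      // non-zero one is found. The new format replaces this ad-hoc docsWithField
      // iterator with IndexedDISI throughout.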
@Override public int nextDoc() throws IOException { - return advance(docID() + 1); + return advance(doc + 1); } @Override @@ -1422,395 +518,632 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close @Override public long cost() { - // TODO: what is the exact cardinality - return count; + return maxDoc; } @Override public int advance(int target) throws IOException { - if (target >= count) { + if (target >= maxDoc) { return doc = NO_MORE_DOCS; } - return doc = nextSetBit(target); + return doc = target; + } + + @Override + public BytesRef binaryValue() throws IOException { + return bytesRefs.get(doc); + } + }; + } else { + // sparse + final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numDocsWithField); + return new BinaryDocValues() { + + @Override + public int nextDoc() throws IOException { + return disi.nextDoc(); + } + + @Override + public int docID() { + return disi.docID(); + } + + @Override + public long cost() { + return disi.cost(); + } + + @Override + public int advance(int target) throws IOException { + return disi.advance(target); + } + + @Override + public BytesRef binaryValue() throws IOException { + return bytesRefs.get(disi.index()); } }; } } - private SparseNumericDocValues getSparseNumericDocValues(NumericEntry entry) throws IOException { - final RandomAccessInput docIdsData = this.data.randomAccessSlice(entry.missingOffset, entry.offset - entry.missingOffset); - final LongValues docIDs = DirectMonotonicReader.getInstance(entry.monotonicMeta, docIdsData); - final LongValues values = getNumeric(entry.nonMissingValues); // cannot be sparse - return new SparseNumericDocValues(Math.toIntExact(entry.numDocsWithValue), docIDs, values); + private static abstract class BytesRefs { + abstract BytesRef get(int index) throws IOException; } @Override - public synchronized DocValuesProducer getMergeInstance() throws IOException { - return new Lucene70DocValuesProducer(this); + public SortedDocValues getSorted(FieldInfo field) throws IOException { + SortedEntry entry = sorted.get(field.name); + return getSorted(entry); } - @Override - public void close() throws IOException { - data.close(); - } - - /** metadata entry for a numeric docvalues field */ - static class NumericEntry { - private NumericEntry() {} - /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */ - long missingOffset; - /** offset to the actual numeric values */ - public long offset; - /** end offset to the actual numeric values */ - public long endOffset; - /** bits per value used to pack the numeric values */ - public int bitsPerValue; - - int format; - /** count of values written */ - public long count; - - /** monotonic meta */ - public DirectMonotonicReader.Meta monotonicMeta; - - long minValue; - long gcd; - long table[]; - - /** for sparse compression */ - long numDocsWithValue; - NumericEntry nonMissingValues; - NumberType numberType; - - } - - /** metadata entry for a binary docvalues field */ - static class BinaryEntry { - private BinaryEntry() {} - /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */ - long missingOffset; - /** offset to the actual binary values */ - long offset; - - int format; - /** count of values written */ - public long count; - int minLength; - int maxLength; - /** offset to the addressing data that maps a value to its slice of the byte[] */ - public long addressesOffset, addressesEndOffset; - /** meta data for 
addresses */ - public DirectMonotonicReader.Meta addressesMeta; - /** offset to the reverse index */ - public long reverseIndexOffset; - /** packed ints version used to encode addressing information */ - public int packedIntsVersion; - /** packed ints blocksize */ - public int blockSize; - } - - /** metadata entry for a sorted-set docvalues field */ - static class SortedSetEntry { - private SortedSetEntry() {} - int format; - - long[] table; - int[] tableOffsets; - } - - // internally we compose complex dv (sorted/sortedset) from other ones - static abstract class LongBinaryDocValues extends LegacyBinaryDocValues { - @Override - public final BytesRef get(int docID) { - return get((long)docID); + private SortedDocValues getSorted(SortedEntry entry) throws IOException { + if (entry.docsWithFieldOffset == -2) { + return DocValues.emptySorted(); } - abstract BytesRef get(long id); - } - - // used for reverse lookup to a small range of blocks - static class ReverseTermsIndex implements Accountable { - public MonotonicBlockPackedReader termAddresses; - public PagedBytes.Reader terms; - - @Override - public long ramBytesUsed() { - return termAddresses.ramBytesUsed() + terms.ramBytesUsed(); + final LongValues ords; + if (entry.bitsPerValue == 0) { + ords = new LongValues() { + @Override + public long get(long index) { + return 0L; + } + }; + } else { + final RandomAccessInput slice = data.randomAccessSlice(entry.ordsOffset, entry.ordsLength); + ords = DirectReader.getInstance(slice, entry.bitsPerValue); } - @Override - public Collection getChildResources() { - List resources = new ArrayList<>(); - resources.add(Accountables.namedAccountable("term bytes", terms)); - resources.add(Accountables.namedAccountable("term addresses", termAddresses)); - return Collections.unmodifiableList(resources); - } + if (entry.docsWithFieldOffset == -1) { + // dense + return new BaseSortedDocValues(entry, data) { - @Override - public String toString() { - return getClass().getSimpleName() + "(size=" + termAddresses.size() + ")"; + int doc = -1; + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int docID() { + return doc; + } + + @Override + public long cost() { + return maxDoc; + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) { + return doc = NO_MORE_DOCS; + } + return doc = target; + } + + @Override + public int ordValue() { + return (int) ords.get(doc); + } + }; + } else { + // sparse + final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numDocsWithField); + return new BaseSortedDocValues(entry, data) { + + @Override + public int nextDoc() throws IOException { + return disi.nextDoc(); + } + + @Override + public int docID() { + return disi.docID(); + } + + @Override + public long cost() { + return disi.cost(); + } + + @Override + public int advance(int target) throws IOException { + return disi.advance(target); + } + + @Override + public int ordValue() { + return (int) ords.get(disi.index()); + } + }; } } - //in the compressed case, we add a few additional operations for - //more efficient reverse lookup and enumeration - static final class CompressedBinaryDocValues extends LongBinaryDocValues { - final long numValues; - final long numIndexValues; - final int maxTermLength; - final MonotonicBlockPackedReader addresses; + private static abstract class BaseSortedDocValues extends SortedDocValues { + + final SortedEntry entry; final IndexInput data; - final 
 
-  //in the compressed case, we add a few additional operations for
-  //more efficient reverse lookup and enumeration
-  static final class CompressedBinaryDocValues extends LongBinaryDocValues {
-    final long numValues;
-    final long numIndexValues;
-    final int maxTermLength;
-    final MonotonicBlockPackedReader addresses;
+  private static abstract class BaseSortedDocValues extends SortedDocValues {
+
+    final SortedEntry entry;
     final IndexInput data;
-    final CompressedBinaryTermsEnum termsEnum;
-    final PagedBytes.Reader reverseTerms;
-    final MonotonicBlockPackedReader reverseAddresses;
-    final long numReverseIndexValues;
+    final TermsEnum termsEnum;
 
-    public CompressedBinaryDocValues(BinaryEntry bytes, MonotonicBlockPackedReader addresses, ReverseTermsIndex index, IndexInput data) throws IOException {
-      this.maxTermLength = bytes.maxLength;
-      this.numValues = bytes.count;
-      this.addresses = addresses;
-      this.numIndexValues = addresses.size();
+    BaseSortedDocValues(SortedEntry entry, IndexInput data) throws IOException {
+      this.entry = entry;
       this.data = data;
-      this.reverseTerms = index.terms;
-      this.reverseAddresses = index.termAddresses;
-      this.numReverseIndexValues = reverseAddresses.size();
-      this.termsEnum = getTermsEnum(data);
+      this.termsEnum = termsEnum();
     }
 
     @Override
-    public BytesRef get(long id) {
-      try {
-        termsEnum.seekExact(id);
-        return termsEnum.term();
-      } catch (IOException e) {
-        throw new RuntimeException(e);
+    public int getValueCount() {
+      return Math.toIntExact(entry.termsDictSize);
+    }
+
+    @Override
+    public BytesRef lookupOrd(int ord) throws IOException {
+      termsEnum.seekExact(ord);
+      return termsEnum.term();
+    }
+
+    @Override
+    public int lookupTerm(BytesRef key) throws IOException {
+      SeekStatus status = termsEnum.seekCeil(key);
+      switch (status) {
+        case FOUND:
+          return Math.toIntExact(termsEnum.ord());
+        default:
+          return Math.toIntExact(-1L - termsEnum.ord());
       }
     }
 
-    long lookupTerm(BytesRef key) {
-      try {
-        switch (termsEnum.seekCeil(key)) {
-          case FOUND: return termsEnum.ord();
-          case NOT_FOUND: return -termsEnum.ord()-1;
-          default: return -numValues-1;
-        }
-      } catch (IOException bogus) {
-        throw new RuntimeException(bogus);
-      }
-    }
-
-    TermsEnum getTermsEnum() {
-      try {
-        return getTermsEnum(data.clone());
-      } catch (IOException e) {
-        throw new RuntimeException(e);
-      }
-    }
-
-    private CompressedBinaryTermsEnum getTermsEnum(IndexInput input) throws IOException {
-      return new CompressedBinaryTermsEnum(input);
-    }
-
-    class CompressedBinaryTermsEnum extends TermsEnum {
-      private long currentOrd = -1;
-      // offset to the start of the current block
-      private long currentBlockStart;
-      private final IndexInput input;
-      // delta from currentBlockStart to start of each term
-      private final int offsets[] = new int[INTERVAL_COUNT];
-      private final byte buffer[] = new byte[2*INTERVAL_COUNT-1];
-
-      private final BytesRef term = new BytesRef(maxTermLength);
-      private final BytesRef firstTerm = new BytesRef(maxTermLength);
-      private final BytesRef scratch = new BytesRef();
-
-      CompressedBinaryTermsEnum(IndexInput input) throws IOException {
-        this.input = input;
-        input.seek(0);
-      }
-
-      private void readHeader() throws IOException {
-        firstTerm.length = input.readVInt();
-        input.readBytes(firstTerm.bytes, 0, firstTerm.length);
-        input.readBytes(buffer, 0, INTERVAL_COUNT-1);
-        if (buffer[0] == -1) {
-          readShortAddresses();
-        } else {
-          readByteAddresses();
-        }
-        currentBlockStart = input.getFilePointer();
-      }
-
-      // read single byte addresses: each is delta - 2
-      // (shared prefix byte and length > 0 are both implicit)
-      private void readByteAddresses() throws IOException {
-        int addr = 0;
-        for (int i = 1; i < offsets.length; i++) {
-          addr += 2 + (buffer[i-1] & 0xFF);
-          offsets[i] = addr;
-        }
-      }
-
-      // read double byte addresses: each is delta - 2
-      // (shared prefix byte and length > 0 are both implicit)
-      private void readShortAddresses() throws IOException {
-        input.readBytes(buffer, INTERVAL_COUNT-1, INTERVAL_COUNT);
-        int addr = 0;
-        for (int i = 1; i < offsets.length; i++) {
-          int x = i<<1;
-          addr += 2 + ((buffer[x-1] << 8) | (buffer[x] & 0xFF));
-          offsets[i] = addr;
-        }
-      }
-
-      // set term to the first term
-      private void readFirstTerm() throws IOException {
-        term.length = firstTerm.length;
-        System.arraycopy(firstTerm.bytes, firstTerm.offset, term.bytes, 0, term.length);
-      }
-
-      // read term at offset, delta encoded from first term
-      private void readTerm(int offset) throws IOException {
-        int start = input.readByte() & 0xFF;
-        System.arraycopy(firstTerm.bytes, firstTerm.offset, term.bytes, 0, start);
-        int suffix = offsets[offset] - offsets[offset-1] - 1;
-        input.readBytes(term.bytes, start, suffix);
-        term.length = start + suffix;
-      }
-
-      @Override
-      public BytesRef next() throws IOException {
-        currentOrd++;
-        if (currentOrd >= numValues) {
-          return null;
-        } else {
-          int offset = (int) (currentOrd & INTERVAL_MASK);
-          if (offset == 0) {
-            // switch to next block
-            readHeader();
-            readFirstTerm();
-          } else {
-            readTerm(offset);
-          }
-          return term;
-        }
-      }
-
-      // binary search reverse index to find smaller
-      // range of blocks to search
-      long binarySearchIndex(BytesRef text) throws IOException {
-        long low = 0;
-        long high = numReverseIndexValues - 1;
-        while (low <= high) {
-          long mid = (low + high) >>> 1;
-          reverseTerms.fill(scratch, reverseAddresses.get(mid));
-          int cmp = scratch.compareTo(text);
-
-          if (cmp < 0) {
-            low = mid + 1;
-          } else if (cmp > 0) {
-            high = mid - 1;
-          } else {
-            return mid;
-          }
-        }
-        return high;
-      }
-
-      // binary search against first term in block range
-      // to find term's block
-      long binarySearchBlock(BytesRef text, long low, long high) throws IOException {
-        while (low <= high) {
-          long mid = (low + high) >>> 1;
-          input.seek(addresses.get(mid));
-          term.length = input.readVInt();
-          input.readBytes(term.bytes, 0, term.length);
-          int cmp = term.compareTo(text);
-
-          if (cmp < 0) {
-            low = mid + 1;
-          } else if (cmp > 0) {
-            high = mid - 1;
-          } else {
-            return mid;
-          }
-        }
-        return high;
-      }
-
-      @Override
-      public SeekStatus seekCeil(BytesRef text) throws IOException {
-        // locate block: narrow to block range with index, then search blocks
-        final long block;
-        long indexPos = binarySearchIndex(text);
-        if (indexPos < 0) {
-          block = 0;
-        } else {
-          long low = indexPos << BLOCK_INTERVAL_SHIFT;
-          long high = Math.min(numIndexValues - 1, low + BLOCK_INTERVAL_MASK);
-          block = Math.max(low, binarySearchBlock(text, low, high));
-        }
-
-        // position before block, then scan to term.
-        input.seek(addresses.get(block));
-        currentOrd = (block << INTERVAL_SHIFT) - 1;
-
-        while (next() != null) {
-          int cmp = term.compareTo(text);
-          if (cmp == 0) {
-            return SeekStatus.FOUND;
-          } else if (cmp > 0) {
-            return SeekStatus.NOT_FOUND;
-          }
-        }
-        return SeekStatus.END;
-      }
-
-      @Override
-      public void seekExact(long ord) throws IOException {
-        long block = ord >>> INTERVAL_SHIFT;
-        if (block != currentOrd >>> INTERVAL_SHIFT) {
-          // switch to different block
-          input.seek(addresses.get(block));
-          readHeader();
-        }
-
-        currentOrd = ord;
-
-        int offset = (int) (ord & INTERVAL_MASK);
-        if (offset == 0) {
-          readFirstTerm();
-        } else {
-          input.seek(currentBlockStart + offsets[offset-1]);
-          readTerm(offset);
-        }
-      }
-
-      @Override
-      public BytesRef term() throws IOException {
-        return term;
-      }
-
-      @Override
-      public long ord() throws IOException {
-        return currentOrd;
-      }
-
-      @Override
-      public int docFreq() throws IOException {
-        throw new UnsupportedOperationException();
-      }
-
-      @Override
-      public long totalTermFreq() throws IOException {
-        return -1;
-      }
-
-      @Override
-      public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
-        throw new UnsupportedOperationException();
-      }
-
+    @Override
+    public TermsEnum termsEnum() throws IOException {
+      return new TermsDict(entry, data);
     }
   }
+
+  private static abstract class BaseSortedSetDocValues extends SortedSetDocValues {
+
+    final SortedSetEntry entry;
+    final IndexInput data;
+    final TermsEnum termsEnum;
+
+    BaseSortedSetDocValues(SortedSetEntry entry, IndexInput data) throws IOException {
+      this.entry = entry;
+      this.data = data;
+      this.termsEnum = termsEnum();
+    }
+
+    @Override
+    public long getValueCount() {
+      return entry.termsDictSize;
+    }
+
+    @Override
+    public BytesRef lookupOrd(long ord) throws IOException {
+      termsEnum.seekExact(ord);
+      return termsEnum.term();
+    }
+
+    @Override
+    public long lookupTerm(BytesRef key) throws IOException {
+      SeekStatus status = termsEnum.seekCeil(key);
+      switch (status) {
+        case FOUND:
+          return termsEnum.ord();
+        default:
+          return -1L - termsEnum.ord();
+      }
+    }
+
+    @Override
+    public TermsEnum termsEnum() throws IOException {
+      return new TermsDict(entry, data);
+    }
+  }
+
+  private static class TermsDict extends TermsEnum {
+
+    final TermsDictEntry entry;
+    final LongValues blockAddresses;
+    final IndexInput bytes;
+    final long blockMask;
+    final LongValues indexAddresses;
+    final IndexInput indexBytes;
+    final BytesRef term;
+    long ord = -1;
+
+    TermsDict(TermsDictEntry entry, IndexInput data) throws IOException {
+      this.entry = entry;
+      RandomAccessInput addressesSlice = data.randomAccessSlice(entry.termsAddressesOffset, entry.termsAddressesLength);
+      blockAddresses = DirectMonotonicReader.getInstance(entry.termsAddressesMeta, addressesSlice);
+      bytes = data.slice("terms", entry.termsDataOffset, entry.termsDataLength);
+      blockMask = (1L << entry.termsDictBlockShift) - 1;
+      RandomAccessInput indexAddressesSlice = data.randomAccessSlice(entry.termsIndexAddressesOffset, entry.termsIndexAddressesLength);
+      indexAddresses = DirectMonotonicReader.getInstance(entry.termsIndexAddressesMeta, indexAddressesSlice);
+      indexBytes = data.slice("terms-index", entry.termsIndexOffset, entry.termsIndexLength);
+      term = new BytesRef(entry.maxTermLength);
+    }
+
+    @Override
+    public BytesRef next() throws IOException {
+      if (++ord >= entry.termsDictSize) {
+        return null;
+      }
+      if ((ord & blockMask) == 0L) {
+        term.length = bytes.readVInt();
+        bytes.readBytes(term.bytes, 0, term.length);
+      } else {
+        final int token = Byte.toUnsignedInt(bytes.readByte());
+        int prefixLength = token & 0x0F;
+        int suffixLength = 1 + (token >>> 4);
+        if (prefixLength == 15) {
+          prefixLength += bytes.readVInt();
+        }
+        if (suffixLength == 16) {
+          suffixLength += bytes.readVInt();
+        }
+        term.length = prefixLength + suffixLength;
+        bytes.readBytes(term.bytes, prefixLength, suffixLength);
+      }
+      return term;
+    }
+
+    @Override
+    public void seekExact(long ord) throws IOException {
+      if (ord < 0 || ord >= entry.termsDictSize) {
+        throw new IndexOutOfBoundsException();
+      }
+      final long blockIndex = ord >>> entry.termsDictBlockShift;
+      final long blockAddress = blockAddresses.get(blockIndex);
+      bytes.seek(blockAddress);
+      this.ord = (blockIndex << entry.termsDictBlockShift) - 1;
+      do {
+        next();
+      } while (this.ord < ord);
+    }
+
+    private BytesRef getTermFromIndex(long index) throws IOException {
+      assert index >= 0 && index <= (entry.termsDictSize - 1) >>> entry.termsDictIndexShift;
+      final long start = indexAddresses.get(index);
+      term.length = (int) (indexAddresses.get(index + 1) - start);
+      indexBytes.seek(start);
+      indexBytes.readBytes(term.bytes, 0, term.length);
+      return term;
+    }
+
+    private long seekTermsIndex(BytesRef text) throws IOException {
+      long lo = 0L;
+      long hi = (entry.termsDictSize - 1) >>> entry.termsDictIndexShift;
+      while (lo <= hi) {
+        final long mid = (lo + hi) >>> 1;
+        getTermFromIndex(mid);
+        final int cmp = term.compareTo(text);
+        if (cmp <= 0) {
+          lo = mid + 1;
+        } else {
+          hi = mid - 1;
+        }
+      }
+
+      assert hi < 0 || getTermFromIndex(hi).compareTo(text) <= 0;
+      assert hi == ((entry.termsDictSize - 1) >>> entry.termsDictIndexShift) || getTermFromIndex(hi + 1).compareTo(text) > 0;
+
+      return hi;
+    }
+
+    private BytesRef getFirstTermFromBlock(long block) throws IOException {
+      assert block >= 0 && block <= (entry.termsDictSize - 1) >>> entry.termsDictBlockShift;
+      final long blockAddress = blockAddresses.get(block);
+      bytes.seek(blockAddress);
+      term.length = bytes.readVInt();
+      bytes.readBytes(term.bytes, 0, term.length);
+      return term;
+    }
+
+    private long seekBlock(BytesRef text) throws IOException {
+      long index = seekTermsIndex(text);
+      if (index == -1L) {
+        return -1L;
+      }
+
+      long ordLo = index << entry.termsDictIndexShift;
+      long ordHi = Math.min(entry.termsDictSize, ordLo + (1L << entry.termsDictIndexShift)) - 1L;
+
+      long blockLo = ordLo >>> entry.termsDictBlockShift;
+      long blockHi = ordHi >>> entry.termsDictBlockShift;
+
+      while (blockLo <= blockHi) {
+        final long blockMid = (blockLo + blockHi) >>> 1;
+        getFirstTermFromBlock(blockMid);
+        final int cmp = term.compareTo(text);
+        if (cmp <= 0) {
+          blockLo = blockMid + 1;
+        } else {
+          blockHi = blockMid - 1;
+        }
+      }
+
+      assert blockHi < 0 || getFirstTermFromBlock(blockHi).compareTo(text) <= 0;
+      assert blockHi == ((entry.termsDictSize - 1) >>> entry.termsDictBlockShift) || getFirstTermFromBlock(blockHi + 1).compareTo(text) > 0;
+
+      return blockHi;
+    }
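
TermsDict.next() above decodes one prefix-compressed term per call: a single token byte packs the length of the prefix shared with the previous term into its low 4 bits and (suffix length - 1) into its high 4 bits, spilling into VInts when either field saturates. As a hedged writer-side sketch of the inverse operation (the real encoder lives in Lucene70DocValuesConsumer, which is not shown here; TermBlockWriter is a made-up name):

    import java.io.ByteArrayOutputStream;

    final class TermBlockWriter {

      // Lucene-style VInt: 7 bits per byte, high bit set on continuation bytes.
      static void writeVInt(ByteArrayOutputStream out, int v) {
        while ((v & ~0x7F) != 0) {
          out.write((v & 0x7F) | 0x80);
          v >>>= 7;
        }
        out.write(v);
      }

      // Delta-encodes 'term' against the previous term of the same block.
      // Assumes terms are distinct and sorted, so the suffix is never empty.
      static void writeTerm(ByteArrayOutputStream out, byte[] prev, byte[] term) {
        int prefix = 0;
        int max = Math.min(prev.length, term.length);
        while (prefix < max && prev[prefix] == term[prefix]) {
          prefix++;
        }
        int suffix = term.length - prefix;
        int token = Math.min(prefix, 15) | (Math.min(suffix - 1, 15) << 4);
        out.write(token);
        if (prefix >= 15) {
          writeVInt(out, prefix - 15);   // decoder: prefixLength += readVInt()
        }
        if (suffix >= 16) {
          writeVInt(out, suffix - 16);   // decoder: suffixLength += readVInt()
        }
        out.write(term, prefix, suffix); // only the non-shared bytes are stored
      }
    }
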
+    @Override
+    public SeekStatus seekCeil(BytesRef text) throws IOException {
+      final long block = seekBlock(text);
+      if (block == -1) {
+        // before the first term
+        seekExact(0L);
+        return SeekStatus.NOT_FOUND;
+      }
+      final long blockAddress = blockAddresses.get(block);
+      this.ord = block << entry.termsDictBlockShift;
+      bytes.seek(blockAddress);
+      term.length = bytes.readVInt();
+      bytes.readBytes(term.bytes, 0, term.length);
+      while (true) {
+        int cmp = term.compareTo(text);
+        if (cmp == 0) {
+          return SeekStatus.FOUND;
+        } else if (cmp > 0) {
+          return SeekStatus.NOT_FOUND;
+        }
+        if (next() == null) {
+          return SeekStatus.END;
+        }
+      }
+    }
+
+    @Override
+    public BytesRef term() throws IOException {
+      return term;
+    }
+
+    @Override
+    public long ord() throws IOException {
+      return ord;
+    }
+
+    @Override
+    public long totalTermFreq() throws IOException {
+      return -1L;
+    }
+
+    @Override
+    public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int docFreq() throws IOException {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+  @Override
+  public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
+    SortedNumericEntry entry = sortedNumerics.get(field.name);
+    if (entry.numValues == entry.numDocsWithField) {
+      return DocValues.singleton(getNumeric(entry));
+    }
+
+    final RandomAccessInput addressesInput = data.randomAccessSlice(entry.addressesOffset, entry.addressesLength);
+    final LongValues addresses = DirectMonotonicReader.getInstance(entry.addressesMeta, addressesInput);
+
+    final LongValues values = getNumericValues(entry);
+
+    if (entry.docsWithFieldOffset == -1) {
+      // dense
+      return new SortedNumericDocValues() {
+
+        int doc = -1;
+        long start, end;
+        int count;
+
+        @Override
+        public int nextDoc() throws IOException {
+          return advance(doc + 1);
+        }
+
+        @Override
+        public int docID() {
+          return doc;
+        }
+
+        @Override
+        public long cost() {
+          return maxDoc;
+        }
+
+        @Override
+        public int advance(int target) throws IOException {
+          if (target >= maxDoc) {
+            return doc = NO_MORE_DOCS;
+          }
+          start = addresses.get(target);
+          end = addresses.get(target + 1L);
+          count = (int) (end - start);
+          return doc = target;
+        }
+
+        @Override
+        public long nextValue() throws IOException {
+          return values.get(start++);
+        }
+
+        @Override
+        public int docValueCount() {
+          return count;
+        }
+      };
+    } else {
+      // sparse
+      final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numDocsWithField);
+      return new SortedNumericDocValues() {
+
+        boolean set;
+        long start, end;
+        int count;
+
+        @Override
+        public int nextDoc() throws IOException {
+          set = false;
+          return disi.nextDoc();
+        }
+
+        @Override
+        public int docID() {
+          return disi.docID();
+        }
+
+        @Override
+        public long cost() {
+          return disi.cost();
+        }
+
+        @Override
+        public int advance(int target) throws IOException {
+          set = false;
+          return disi.advance(target);
+        }
+
+        @Override
+        public long nextValue() throws IOException {
+          set();
+          return values.get(start++);
+        }
+
+        @Override
+        public int docValueCount() {
+          set();
+          return count;
+        }
+
+        private void set() {
+          if (set == false) {
+            final int index = disi.index();
+            start = addresses.get(index);
+            end = addresses.get(index + 1L);
+            count = (int) (end - start);
+            set = true;
+          }
+        }
+
+      };
+    }
+  }
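
getSortedNumeric above derives each document's value count from a monotonic addresses array: the values of slot i occupy [addresses(i), addresses(i+1)), so docValueCount is the difference of two adjacent offsets. Below is a minimal in-memory sketch of that arithmetic; MultiValues is a hypothetical name, and the real offsets come from a compressed DirectMonotonicReader rather than a long[].

    final class MultiValues {
      private final long[] addr;   // numDocsWithField + 1 non-decreasing offsets, addr[0] == 0
      private final long[] values; // all values, concatenated slot by slot

      MultiValues(long[] addr, long[] values) {
        this.addr = addr;
        this.values = values;
      }

      // e.g. addr = {0, 2, 5}: slot 0 holds 2 values, slot 1 holds 3.
      int docValueCount(int slot) {
        return (int) (addr[slot + 1] - addr[slot]);
      }

      long value(int slot, int idx) {
        return values[(int) addr[slot] + idx];
      }
    }
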
+  @Override
+  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
+    SortedSetEntry entry = sortedSets.get(field.name);
+    if (entry.singleValueEntry != null) {
+      return DocValues.singleton(getSorted(entry.singleValueEntry));
+    }
+
+    final RandomAccessInput slice = data.randomAccessSlice(entry.ordsOffset, entry.ordsLength);
+    final LongValues ords = DirectReader.getInstance(slice, entry.bitsPerValue);
+
+    final RandomAccessInput addressesInput = data.randomAccessSlice(entry.addressesOffset, entry.addressesLength);
+    final LongValues addresses = DirectMonotonicReader.getInstance(entry.addressesMeta, addressesInput);
+
+    if (entry.docsWithFieldOffset == -1) {
+      // dense
+      return new BaseSortedSetDocValues(entry, data) {
+
+        int doc = -1;
+        long start;
+        long end;
+
+        @Override
+        public int nextDoc() throws IOException {
+          return advance(doc + 1);
+        }
+
+        @Override
+        public int docID() {
+          return doc;
+        }
+
+        @Override
+        public long cost() {
+          return maxDoc;
+        }
+
+        @Override
+        public int advance(int target) throws IOException {
+          if (target >= maxDoc) {
+            return doc = NO_MORE_DOCS;
+          }
+          start = addresses.get(target);
+          end = addresses.get(target + 1L);
+          return doc = target;
+        }
+
+        @Override
+        public long nextOrd() throws IOException {
+          if (start == end) {
+            return NO_MORE_ORDS;
+          }
+          return ords.get(start++);
+        }
+
+      };
+    } else {
+      // sparse
+      final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numDocsWithField);
+      return new BaseSortedSetDocValues(entry, data) {
+
+        boolean set;
+        long start;
+        long end = 0;
+
+        @Override
+        public int nextDoc() throws IOException {
+          set = false;
+          return disi.nextDoc();
+        }
+
+        @Override
+        public int docID() {
+          return disi.docID();
+        }
+
+        @Override
+        public long cost() {
+          return disi.cost();
+        }
+
+        @Override
+        public int advance(int target) throws IOException {
+          set = false;
+          return disi.advance(target);
+        }
+
+        @Override
+        public long nextOrd() throws IOException {
+          if (set == false) {
+            final int index = disi.index();
+            final long start = addresses.get(index);
+            this.start = start + 1;
+            end = addresses.get(index + 1L);
+            set = true;
+            return ords.get(start);
+          } else if (start == end) {
+            return NO_MORE_ORDS;
+          } else {
+            return ords.get(start++);
+          }
+        }
+
+      };
+    }
+  }
+
+  @Override
+  public void checkIntegrity() throws IOException {
+    CodecUtil.checksumEntireFile(data);
+  }
+}
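
The remaining hunks in this patch mostly exist to let IOException propagate: termsEnum() and the getSorted* accessors are now declared to throw it, so implementations like the producer above no longer need to wrap checked exceptions in RuntimeException. A schematic before/after of that cleanup (Before, After, and open are illustrative names, not Lucene APIs):

    import java.io.IOException;

    class Before {
      Object termsEnum() { // interface did not declare IOException
        try {
          return open();
        } catch (IOException e) {
          throw new RuntimeException(e); // caller loses the checked type
        }
      }

      Object open() throws IOException {
        return new Object();
      }
    }

    class After {
      Object termsEnum() throws IOException { // declared, so no wrapping needed
        return open();
      }

      Object open() throws IOException {
        return new Object();
      }
    }
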
diff --git a/lucene/core/src/java/org/apache/lucene/index/EmptyDocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/index/EmptyDocValuesProducer.java
index a4b90493011..4e9f0e08f0d 100644
--- a/lucene/core/src/java/org/apache/lucene/index/EmptyDocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/index/EmptyDocValuesProducer.java
@@ -44,12 +44,12 @@ public abstract class EmptyDocValuesProducer extends DocValuesProducer {
   }
 
   @Override
-  public SortedNumericDocValues getSortedNumeric(FieldInfo field) {
+  public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
     throw new UnsupportedOperationException();
   }
 
   @Override
-  public SortedSetDocValues getSortedSet(FieldInfo field) {
+  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
     throw new UnsupportedOperationException();
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/index/LegacySortedSetDocValues.java b/lucene/core/src/java/org/apache/lucene/index/LegacySortedSetDocValues.java
index dae11792a58..0c6c809d920 100644
--- a/lucene/core/src/java/org/apache/lucene/index/LegacySortedSetDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/LegacySortedSetDocValues.java
@@ -17,6 +17,8 @@ package org.apache.lucene.index;
 
+import java.io.IOException;
+
 import org.apache.lucene.util.BytesRef;
 
 /**
@@ -103,7 +105,7 @@ public abstract class LegacySortedSetDocValues {
    * Returns a {@link TermsEnum} over the values.
    * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
    */
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     throw new UnsupportedOperationException();
   }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/index/SingletonSortedSetDocValues.java b/lucene/core/src/java/org/apache/lucene/index/SingletonSortedSetDocValues.java
index 225b6a6763f..cc7360e1d17 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SingletonSortedSetDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SingletonSortedSetDocValues.java
@@ -95,7 +95,7 @@ final class SingletonSortedSetDocValues extends SortedSetDocValues {
   }
 
   @Override
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     return in.termsEnum();
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java
index ee70a64a9e8..7ff084f2cb7 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java
@@ -104,7 +104,7 @@ public abstract class SortedDocValues extends BinaryDocValues {
    * Returns a {@link TermsEnum} over the values.
    * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
    */
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     return new SortedDocValuesTermsEnum(this);
   }
 }
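
SortedDocValues.termsEnum() above keeps its default implementation, a wrapper that walks the dictionary by ordinal using lookupOrd. A simplified sketch of that wrapper idea follows; OrdLookup and OrdTermsIterator are made-up stand-ins for the SortedDocValues API and SortedDocValuesTermsEnum, not their real signatures.

    import java.io.IOException;

    interface OrdLookup { // stand-in for the lookupOrd/getValueCount part of SortedDocValues
      int getValueCount();
      byte[] lookupOrd(int ord) throws IOException;
    }

    final class OrdTermsIterator {
      private final OrdLookup dict;
      private int ord = -1;

      OrdTermsIterator(OrdLookup dict) {
        this.dict = dict;
      }

      // Next term in ord (i.e. sorted) order, or null when the dictionary is exhausted.
      byte[] next() throws IOException {
        if (++ord >= dict.getValueCount()) {
          return null;
        }
        return dict.lookupOrd(ord);
      }

      long ord() {
        return ord;
      }
    }
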
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java
index e53a0e75a14..439843b6f3f 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java
@@ -98,7 +98,7 @@ public abstract class SortedSetDocValues extends DocIdSetIterator {
    * Returns a {@link TermsEnum} over the values.
    * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
    */
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     return new SortedSetDocValuesTermsEnum(this);
   }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java b/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java
index 0bf7a8ecd65..46afe0dd2b8 100644
--- a/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java
+++ b/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java
@@ -81,7 +81,7 @@ public final class DocValuesRewriteMethod extends MultiTermQuery.RewriteMethod {
         TermsEnum termsEnum = query.getTermsEnum(new Terms() {
 
           @Override
-          public TermsEnum iterator() {
+          public TermsEnum iterator() throws IOException {
             return fcsi.termsEnum();
           }
 
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/DirectMonotonicReader.java b/lucene/core/src/java/org/apache/lucene/util/packed/DirectMonotonicReader.java
index bdefdf367c8..676efcdee24 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/DirectMonotonicReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/DirectMonotonicReader.java
@@ -46,7 +46,6 @@ public final class DirectMonotonicReader {
   public static class Meta implements Accountable {
     private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(Meta.class);
 
-    final long numValues;
     final int blockShift;
     final int numBlocks;
     final long[] mins;
@@ -55,7 +54,6 @@ public final class DirectMonotonicReader {
     final long[] offsets;
 
     Meta(long numValues, int blockShift) {
-      this.numValues = numValues;
       this.blockShift = blockShift;
       long numBlocks = numValues >>> blockShift;
      if ((numBlocks << blockShift) < numValues) {
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java
index fae82e03835..5ad701ee6e0 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java
@@ -25,14 +25,13 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import java.util.TreeSet;
+import java.util.function.Supplier;
 
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.asserting.AssertingCodec;
-import org.apache.lucene.codecs.lucene70.Lucene70DocValuesProducer.SparseNumericDocValues;
-import org.apache.lucene.codecs.lucene70.Lucene70DocValuesProducer.SparseNumericDocValuesRandomAccessWrapper;
 import org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
@@ -62,7 +61,6 @@ import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum.SeekStatus;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMFile;
@@ -70,7 +68,6 @@ import org.apache.lucene.store.RAMInputStream;
 import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.LongValues;
 import org.apache.lucene.util.TestUtil;
 
 /**
@@ -123,7 +120,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
   public void testTermsEnumFixedWidth() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 10, 10);
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> TestUtil.randomSimpleString(random(), 10, 10));
     }
   }
 
@@ -131,7 +128,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
   public void testTermsEnumVariableWidth() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 1, 500);
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> TestUtil.randomSimpleString(random(), 1, 500));
     }
   }
 
@@ -139,7 +136,21 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
   public void testTermsEnumRandomMany() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), 1, 500);
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), () -> TestUtil.randomSimpleString(random(), 1, 500));
+    }
+  }
+
+  public void testTermsEnumLongSharedPrefixes() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> {
+        char[] chars = new char[random().nextInt(500)];
+        Arrays.fill(chars, 'a');
+        if (chars.length > 0) {
+          chars[random().nextInt(chars.length)] = 'b';
+        }
+        return new String(chars);
+      });
     }
   }
 
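
testTermsEnumLongSharedPrefixes above generates terms that agree on hundreds of leading 'a' bytes, which forces the 4-bit prefix and suffix fields of the terms-dictionary token (decoded in TermsDict.next() earlier in this patch) to saturate and fall back to their VInt escapes. A small worked example of the saturation arithmetic, assuming the token layout shown in that decoder:

    public class TokenDemo {
      public static void main(String[] args) {
        int prefix = 200; // bytes shared with the previous term
        int suffix = 30;  // new bytes in this term
        int token = Math.min(prefix, 15) | (Math.min(suffix - 1, 15) << 4);
        System.out.printf("token = 0x%02X%n", token);                // 0xFF: both fields saturated
        System.out.println("prefix escape VInt = " + (prefix - 15)); // 185
        System.out.println("suffix escape VInt = " + (suffix - 16)); // 14
      }
    }
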
@@ -269,7 +280,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
   // TODO: try to refactor this and some termsenum tests into the base class.
   // to do this we need to fix the test class to get a DVF not a Codec so we can setup
   // the postings format correctly.
-  private void doTestTermsEnumRandom(int numDocs, int minLength, int maxLength) throws Exception {
+  private void doTestTermsEnumRandom(int numDocs, Supplier<String> valuesProducer) throws Exception {
     Directory dir = newFSDirectory(createTempDir());
     IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
     conf.setMergeScheduler(new SerialMergeScheduler());
@@ -294,12 +305,11 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
       Document doc = new Document();
       Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
       doc.add(idField);
-      final int length = TestUtil.nextInt(random(), minLength, maxLength);
       int numValues = random().nextInt(17);
       // create a random list of strings
       List<String> values = new ArrayList<>();
       for (int v = 0; v < numValues; v++) {
-        values.add(TestUtil.randomSimpleString(random(), minLength, length));
+        values.add(valuesProducer.get());
       }
 
       // add in any order to the indexed field
@@ -429,92 +439,6 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
     }
   }
 
-  public void testSparseLongValues() throws IOException {
-    final int iters = atLeast(5);
-    for (int iter = 0; iter < iters; ++iter) {
-      final int numDocs = TestUtil.nextInt(random(), 0, 100);
-      final int[] docIds = new int[numDocs];
-      final long[] values = new long[numDocs];
-      final int maxDoc;
-      if (numDocs == 0) {
-        maxDoc = 1 + random().nextInt(10);
-      } else {
-        docIds[0] = random().nextInt(10);
-        for (int i = 1; i < docIds.length; ++i) {
-          docIds[i] = docIds[i - 1] + 1 + random().nextInt(100);
-        }
-        maxDoc = docIds[numDocs - 1] + 1 + random().nextInt(10);
-      }
-      for (int i = 0; i < values.length; ++i) {
-        values[i] = random().nextLong();
-      }
-      final long missingValue = random().nextLong();
-      final LongValues docIdsValues = new LongValues() {
-        @Override
-        public long get(long index) {
-          return docIds[Math.toIntExact(index)];
-        }
-      };
-      final LongValues valuesValues = new LongValues() {
-        @Override
-        public long get(long index) {
-          return values[Math.toIntExact(index)];
-        }
-      };
-      final SparseNumericDocValues sparseValues = new SparseNumericDocValues(numDocs, docIdsValues, valuesValues);
-
-      // sequential access
-      assertEquals(-1, sparseValues.docID());
-      for (int i = 0; i < docIds.length; ++i) {
-        assertEquals(docIds[i], sparseValues.nextDoc());
-      }
-      assertEquals(DocIdSetIterator.NO_MORE_DOCS, sparseValues.nextDoc());
-
-      // advance
-      for (int i = 0; i < 2000; ++i) {
-        final int target = TestUtil.nextInt(random(), 0, maxDoc);
-        int index = Arrays.binarySearch(docIds, target);
-        if (index < 0) {
-          index = -1 - index;
-        }
-        sparseValues.reset();
-        if (index > 0) {
-          assertEquals(docIds[index - 1], sparseValues.advance(Math.toIntExact(docIds[index - 1])));
-        }
-        if (index == docIds.length) {
-          assertEquals(DocIdSetIterator.NO_MORE_DOCS, sparseValues.advance(target));
-        } else {
-          assertEquals(docIds[index], sparseValues.advance(target));
-        }
-      }
-
-      final SparseNumericDocValuesRandomAccessWrapper raWrapper = new SparseNumericDocValuesRandomAccessWrapper(sparseValues, missingValue);
-
-      // random-access
-      for (int i = 0; i < 2000; ++i) {
-        final int docId = TestUtil.nextInt(random(), 0, maxDoc - 1);
-        final int idx = Arrays.binarySearch(docIds, docId);
-        final long value = raWrapper.get(docId);
-        if (idx >= 0) {
-          assertEquals(values[idx], value);
-        } else {
-          assertEquals(missingValue, value);
-        }
-      }
-
-      // sequential access
-      for (int docId = 0; docId < maxDoc; docId += random().nextInt(3)) {
-        final int idx = Arrays.binarySearch(docIds, docId);
-        final long value = raWrapper.get(docId);
-        if (idx >= 0) {
-          assertEquals(values[idx], value);
-        } else {
-          assertEquals(missingValue, value);
-        }
-      }
-    }
-  }
-
   @Slow
   public void testSortedSetAroundBlockSize() throws IOException {
     final int frontier = 1 << Lucene70DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;