mirror of https://github.com/apache/lucene.git

LUCENE-7489: Better sparsity support for Lucene70DocValuesFormat.

parent a4a314d160
commit 927fd51d64
@@ -29,6 +29,9 @@ Bug Fixes
 
 Improvements
 
+* LUCENE-7489: Better storage of sparse doc-values fields with the default
+  codec. (Adrien Grand)
+
 Optimizations
 
 * LUCENE-7416: BooleanQuery optimizes queries that have queries that occur both
@@ -928,7 +928,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable
     }
 
     @Override
-    public TermsEnum termsEnum() {
+    public TermsEnum termsEnum() throws IOException {
       if (binary instanceof CompressedBinaryDocValues) {
         return ((CompressedBinaryDocValues)binary).getTermsEnum();
       } else {
@@ -1233,7 +1233,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable
     }
 
     @Override
-    public TermsEnum termsEnum() {
+    public TermsEnum termsEnum() throws IOException {
       if (binary instanceof CompressedBinaryDocValues) {
        return ((CompressedBinaryDocValues)binary).getTermsEnum();
       } else {
@@ -1292,7 +1292,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable
     }
 
     @Override
-    public TermsEnum termsEnum() {
+    public TermsEnum termsEnum() throws IOException {
       if (binary instanceof CompressedBinaryDocValues) {
         return ((CompressedBinaryDocValues) binary).getTermsEnum();
       } else {
@@ -1490,12 +1490,8 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable
       }
     }
 
-    TermsEnum getTermsEnum() {
-      try {
-        return getTermsEnum(data.clone());
-      } catch (IOException e) {
-        throw new RuntimeException(e);
-      }
+    TermsEnum getTermsEnum() throws IOException {
+      return getTermsEnum(data.clone());
     }
 
     private CompressedBinaryTermsEnum getTermsEnum(IndexInput input) throws IOException {
@@ -361,7 +361,7 @@ public abstract class DocValuesConsumer implements Closeable {
     addSortedNumericField(mergeFieldInfo,
                           new EmptyDocValuesProducer() {
                             @Override
-                            public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) {
+                            public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) throws IOException {
                               if (fieldInfo != mergeFieldInfo) {
                                 throw new IllegalArgumentException("wrong FieldInfo");
                               }
@@ -375,11 +375,7 @@ public abstract class DocValuesConsumer implements Closeable {
       if (docValuesProducer != null) {
         FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
         if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) {
-          try {
-            values = docValuesProducer.getSortedNumeric(readerFieldInfo);
-          } catch (IOException ioe) {
-            throw new RuntimeException(ioe);
-          }
+          values = docValuesProducer.getSortedNumeric(readerFieldInfo);
         }
       }
       if (values == null) {
@@ -391,12 +387,7 @@ public abstract class DocValuesConsumer implements Closeable {
 
       final long finalCost = cost;
 
-      final DocIDMerger<SortedNumericDocValuesSub> docIDMerger;
-      try {
-        docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
-      } catch (IOException ioe) {
-        throw new RuntimeException(ioe);
-      }
+      final DocIDMerger<SortedNumericDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
 
       return new SortedNumericDocValues() {
@@ -521,7 +512,7 @@ public abstract class DocValuesConsumer implements Closeable {
     addSortedField(fieldInfo,
                    new EmptyDocValuesProducer() {
                      @Override
-                     public SortedDocValues getSorted(FieldInfo fieldInfoIn) {
+                     public SortedDocValues getSorted(FieldInfo fieldInfoIn) throws IOException {
                        if (fieldInfoIn != fieldInfo) {
                          throw new IllegalArgumentException("wrong FieldInfo");
                        }
@@ -536,11 +527,7 @@ public abstract class DocValuesConsumer implements Closeable {
       if (docValuesProducer != null) {
         FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name);
         if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) {
-          try {
-            values = docValuesProducer.getSorted(readerFieldInfo);
-          } catch (IOException ioe) {
-            throw new RuntimeException(ioe);
-          }
+          values = docValuesProducer.getSorted(readerFieldInfo);
         }
       }
       if (values == null) {
@@ -553,12 +540,7 @@ public abstract class DocValuesConsumer implements Closeable {
 
       final long finalCost = cost;
 
-      final DocIDMerger<SortedDocValuesSub> docIDMerger;
-      try {
-        docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
-      } catch (IOException ioe) {
-        throw new RuntimeException(ioe);
-      }
+      final DocIDMerger<SortedDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
 
       return new SortedDocValues() {
         private int docID = -1;
@@ -693,7 +675,7 @@ public abstract class DocValuesConsumer implements Closeable {
     addSortedSetField(mergeFieldInfo,
                       new EmptyDocValuesProducer() {
                         @Override
-                        public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) {
+                        public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
                           if (fieldInfo != mergeFieldInfo) {
                             throw new IllegalArgumentException("wrong FieldInfo");
                           }
@@ -709,11 +691,7 @@ public abstract class DocValuesConsumer implements Closeable {
       if (docValuesProducer != null) {
         FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
         if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) {
-          try {
-            values = docValuesProducer.getSortedSet(readerFieldInfo);
-          } catch (IOException ioe) {
-            throw new RuntimeException(ioe);
-          }
+          values = docValuesProducer.getSortedSet(readerFieldInfo);
         }
       }
       if (values == null) {
@@ -723,12 +701,7 @@ public abstract class DocValuesConsumer implements Closeable {
         subs.add(new SortedSetDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i)));
       }
 
-      final DocIDMerger<SortedSetDocValuesSub> docIDMerger;
-      try {
-        docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
-      } catch (IOException ioe) {
-        throw new RuntimeException(ioe);
-      }
+      final DocIDMerger<SortedSetDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
 
       final long finalCost = cost;
[File diff suppressed because it is too large]
@@ -23,39 +23,64 @@ import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.SmallFloat;
 import org.apache.lucene.util.packed.DirectWriter;
 
 /**
  * Lucene 7.0 DocValues format.
  * <p>
- * Encodes the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) with these strategies:
+ * Documents that have a value for the field are encoded in a way that it is always possible to
+ * know the ordinal of the current document in the set of documents that have a value. For instance,
+ * say the set of documents that have a value for the field is <tt>{1, 5, 6, 11}</tt>. When the
+ * iterator is on <tt>6</tt>, it knows that this is the 3rd item of the set. This way, values can
+ * be stored densely and accessed based on their index at search time. If all documents in a segment
+ * have a value for the field, the index is the same as the doc ID, so this case is encoded implicitly
+ * and is very fast at query time. On the other hand if some documents are missing a value for the
+ * field then the set of documents that have a value is encoded into blocks. All doc IDs that share
+ * the same upper 16 bits are encoded into the same block with the following strategies:
+ * <ul>
+ *   <li>SPARSE: This strategy is used when a block contains at most 4095 documents. The lower 16
+ *       bits of doc IDs are stored as {@link DataOutput#writeShort(short) shorts} while the upper
+ *       16 bits are given by the block ID.
+ *   <li>DENSE: This strategy is used when a block contains between 4096 and 65535 documents. The
+ *       lower bits of doc IDs are stored in a bit set. Advancing is performed using
+ *       {@link Long#numberOfTrailingZeros(long) ntz} operations while the index is computed by
+ *       accumulating the {@link Long#bitCount(long) bit counts} of the visited longs.
+ *   <li>ALL: This strategy is used when a block contains exactly 65536 documents, meaning that
+ *       the block is full. In that case doc IDs do not need to be stored explicitly. This is
+ *       typically faster than both SPARSE and DENSE which is a reason why it is preferable to have
+ *       all documents that have a value for a field using contiguous doc IDs, for instance by
+ *       using {@link IndexWriterConfig#setIndexSort(org.apache.lucene.search.Sort) index sorting}.
+ * </ul>
+ * <p>
+ * Then the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) are
+ * encoded using the following strategies:
  * <p>
  * {@link DocValuesType#NUMERIC NUMERIC}:
  * <ul>
  *   <li>Delta-compressed: per-document integers written as deltas from the minimum value,
  *       compressed with bitpacking. For more information, see {@link DirectWriter}.
  *   <li>Table-compressed: when the number of unique values is very small (< 256), and
  *       when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
  *       a lookup table is written instead. Each per-document entry is instead the ordinal
  *       to this table, and those ordinals are compressed with bitpacking ({@link DirectWriter}).
  *   <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest
  *       common denominator (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
  *   <li>Monotonic-compressed: when all numbers are monotonically increasing offsets, they are written
  *       as blocks of bitpacked integers, encoding the deviation from the expected delta.
- *   <li>Const-compressed: when there is only one possible non-missing value, only the missing
- *       bitset is encoded.
- *   <li>Sparse-compressed: only documents with a value are stored, and lookups are performed
- *       using binary search.
+ *   <li>Const-compressed: when there is only one possible value, no per-document data is needed and
+ *       this value is encoded alone.
 * </ul>
 * <p>
 * {@link DocValuesType#BINARY BINARY}:
 * <ul>
 *   <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
 *       Each document's value can be addressed directly with multiplication ({@code docID * length}).
 *   <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
 *       for each document. The addresses are written as Monotonic-compressed numerics.
 *   <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written
 *       completely and other values sharing prefixes. chunk addresses are written as Monotonic-compressed
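To make the DENSE strategy described in the javadoc above concrete, here is a minimal standalone sketch. The class and method names are invented for illustration; the real encoding and decoding added by this commit live in the Lucene70 consumer and producer whose diffs are suppressed on this page. One block covers 65536 candidate doc IDs, membership is a bit set of 1024 longs, advance() scans with Long.numberOfTrailingZeros, and index() accumulates Long.bitCount over the words already passed:

public class DenseBlockSketch {
  private final long[] words = new long[1024]; // 65536 bits, one per candidate doc in the block

  /** Marks the doc whose low 16 bits are {@code low} as present. */
  void add(int low) {
    words[low >>> 6] |= 1L << low; // long shifts use only the low 6 bits of 'low'
  }

  /** Smallest member >= target within the block, or -1 if the block is exhausted. */
  int advance(int target) {
    int w = target >>> 6;
    long word = words[w] & (-1L << target); // keep only bits >= target in this word
    while (true) {
      if (word != 0) {
        return (w << 6) | Long.numberOfTrailingZeros(word);
      }
      if (++w == words.length) {
        return -1;
      }
      word = words[w];
    }
  }

  /** Rank of a member: how many members precede it, i.e. its index in the set. */
  int index(int member) {
    int count = 0;
    for (int i = 0; i < (member >>> 6); ++i) {
      count += Long.bitCount(words[i]); // whole words before the member's word
    }
    return count + Long.bitCount(words[member >>> 6] & ((1L << member) - 1));
  }

  public static void main(String[] args) {
    DenseBlockSketch block = new DenseBlockSketch();
    for (int doc : new int[] {1, 5, 6, 11}) {
      block.add(doc);
    }
    int doc = block.advance(6);
    // prints "6 is item #3 of the set", matching the {1, 5, 6, 11} example in the javadoc
    System.out.println(doc + " is item #" + (block.index(doc) + 1) + " of the set");
  }
}

The same rank bookkeeping is what lets values be "stored densely and accessed based on their index at search time": the index returned here is the position of the document's value in the dense value stream.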
@@ -64,27 +89,21 @@ import org.apache.lucene.util.packed.DirectWriter;
  * <p>
  * {@link DocValuesType#SORTED SORTED}:
  * <ul>
- *   <li>Sorted: a mapping of ordinals to deduplicated terms is written as Binary,
+ *   <li>Sorted: a mapping of ordinals to deduplicated terms is written as Prefix-compressed Binary,
  *       along with the per-document ordinals written using one of the numeric strategies above.
  * </ul>
  * <p>
  * {@link DocValuesType#SORTED_SET SORTED_SET}:
  * <ul>
  *   <li>Single: if all documents have 0 or 1 value, then data are written like SORTED.
- *   <li>SortedSet table: when there are few unique sets of values (< 256) then each set is assigned
- *       an id, a lookup table is written and the mapping from document to set id is written using the
- *       numeric strategies above.
  *   <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary,
  *       an ordinal list and per-document index into this list are written using the numeric strategies
  *       above.
  * </ul>
  * <p>
  * {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}:
  * <ul>
  *   <li>Single: if all documents have 0 or 1 value, then data are written like NUMERIC.
- *   <li>SortedSet table: when there are few unique sets of values (< 256) then each set is assigned
- *       an id, a lookup table is written and the mapping from document to set id is written using the
- *       numeric strategies above.
  *   <li>SortedNumeric: a value list and per-document index into this list are written using the numeric
  *       strategies above.
  * </ul>
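The SORTED_SET layout above is easiest to see with plain arrays standing in for the compressed structures. This is a hypothetical in-memory model, not the commit's actual data structures: in the real format the term dictionary is written as Binary, and the addresses and ordinals use the numeric strategies above (the addresses being Monotonic-compressed). Ordinals index a sorted, deduplicated term dictionary, and each document that has a value owns one slice of a flat ordinal list:

public class SortedSetSketch {
  // ord -> term, sorted and deduplicated
  private final String[] termByOrd = {"apple", "banana", "cherry"};
  // concatenated ordinal lists of all documents that have a value
  private final long[] ords = {0, 2, 1, 0, 2};
  // slice boundaries: entry index's values are ords[addresses[index] .. addresses[index+1])
  private final long[] addresses = {0, 2, 3, 5};

  /** Prints the set of terms of the index-th document that has a value. */
  void printTerms(int index) {
    for (long a = addresses[index]; a < addresses[index + 1]; ++a) {
      System.out.println(termByOrd[(int) ords[(int) a]]);
    }
  }

  public static void main(String[] args) {
    new SortedSetSketch().printTerms(1); // slice [2, 3) -> prints "banana"
  }
}

Note that index here is the rank produced by the sparse doc-ID encoding described earlier, not the raw doc ID, which is what makes this layout work for sparse fields.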
@@ -112,72 +131,30 @@ public final class Lucene70DocValuesFormat extends DocValuesFormat {
   public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
     return new Lucene70DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
   }
 
-  static final String DATA_CODEC = "Lucene54DocValuesData";
+  static final String DATA_CODEC = "Lucene70DocValuesData";
   static final String DATA_EXTENSION = "dvd";
-  static final String META_CODEC = "Lucene54DocValuesMetadata";
+  static final String META_CODEC = "Lucene70DocValuesMetadata";
   static final String META_EXTENSION = "dvm";
   static final int VERSION_START = 0;
   static final int VERSION_CURRENT = VERSION_START;
 
   // indicates docvalues type
   static final byte NUMERIC = 0;
   static final byte BINARY = 1;
   static final byte SORTED = 2;
   static final byte SORTED_SET = 3;
   static final byte SORTED_NUMERIC = 4;
 
-  // address terms in blocks of 16 terms
-  static final int INTERVAL_SHIFT = 4;
-  static final int INTERVAL_COUNT = 1 << INTERVAL_SHIFT;
-  static final int INTERVAL_MASK = INTERVAL_COUNT - 1;
-
-  // build reverse index from every 1024th term
-  static final int REVERSE_INTERVAL_SHIFT = 10;
-  static final int REVERSE_INTERVAL_COUNT = 1 << REVERSE_INTERVAL_SHIFT;
-  static final int REVERSE_INTERVAL_MASK = REVERSE_INTERVAL_COUNT - 1;
-
-  // for conversion from reverse index to block
-  static final int BLOCK_INTERVAL_SHIFT = REVERSE_INTERVAL_SHIFT - INTERVAL_SHIFT;
-  static final int BLOCK_INTERVAL_COUNT = 1 << BLOCK_INTERVAL_SHIFT;
-  static final int BLOCK_INTERVAL_MASK = BLOCK_INTERVAL_COUNT - 1;
-
-  /** Compressed using packed blocks of ints. */
-  static final int DELTA_COMPRESSED = 0;
-  /** Compressed by computing the GCD. */
-  static final int GCD_COMPRESSED = 1;
-  /** Compressed by giving IDs to unique values. */
-  static final int TABLE_COMPRESSED = 2;
-  /** Compressed with monotonically increasing values */
-  static final int MONOTONIC_COMPRESSED = 3;
-  /** Compressed with constant value (uses only missing bitset) */
-  static final int CONST_COMPRESSED = 4;
-  /** Compressed with sparse arrays. */
-  static final int SPARSE_COMPRESSED = 5;
-
-  /** Uncompressed binary, written directly (fixed length). */
-  static final int BINARY_FIXED_UNCOMPRESSED = 0;
-  /** Uncompressed binary, written directly (variable length). */
-  static final int BINARY_VARIABLE_UNCOMPRESSED = 1;
-  /** Compressed binary with shared prefixes */
-  static final int BINARY_PREFIX_COMPRESSED = 2;
-
-  /** Standard storage for sorted set values with 1 level of indirection:
-   *  {@code docId -> address -> ord}. */
-  static final int SORTED_WITH_ADDRESSES = 0;
-  /** Single-valued sorted set values, encoded as sorted values, so no level
-   *  of indirection: {@code docId -> ord}. */
-  static final int SORTED_SINGLE_VALUED = 1;
-  /** Compressed giving IDs to unique sets of values:
-   *  {@code docId -> setId -> ords} */
-  static final int SORTED_SET_TABLE = 2;
-
-  /** placeholder for missing offset that means there are no missing values */
-  static final int ALL_LIVE = -1;
-  /** placeholder for missing offset that means all values are missing */
-  static final int ALL_MISSING = -2;
-
-  // addressing uses 16k blocks
-  static final int MONOTONIC_BLOCK_SIZE = 16384;
   static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16;
 
+  static final int TERMS_DICT_BLOCK_SHIFT = 4;
+  static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT;
+  static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1;
+
+  static final int TERMS_DICT_REVERSE_INDEX_SHIFT = 10;
+  static final int TERMS_DICT_REVERSE_INDEX_SIZE = 1 << TERMS_DICT_REVERSE_INDEX_SHIFT;
+  static final int TERMS_DICT_REVERSE_INDEX_MASK = TERMS_DICT_REVERSE_INDEX_SIZE - 1;
 }
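Shift/size/mask triples like the new TERMS_DICT_BLOCK_* constants are conventionally used as pure bit arithmetic. The following is illustrative only, mirroring this file's values rather than the producer's actual lookup code (that diff is suppressed below); per the javadoc, each block of 16 terms stores its first term in full and the remaining 15 share prefixes with it:

public class TermsDictBlockMath {
  static final int TERMS_DICT_BLOCK_SHIFT = 4;
  static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT; // 16 terms per block
  static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1;   // 0x0F

  public static void main(String[] args) {
    long ord = 37; // look up the term with this ordinal
    long block = ord >>> TERMS_DICT_BLOCK_SHIFT; // block 2 holds ords 32..47
    long within = ord & TERMS_DICT_BLOCK_MASK;   // 5 terms to decode past the block start
    System.out.println("block " + block + ", offset " + within);
  }
}

The TERMS_DICT_REVERSE_INDEX_* constants apply the same pattern at a coarser granularity (every 1024th term), which bounds how many blocks a term seek has to scan.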
[File diff suppressed because it is too large]
@@ -44,12 +44,12 @@ public abstract class EmptyDocValuesProducer extends DocValuesProducer {
   }
 
   @Override
-  public SortedNumericDocValues getSortedNumeric(FieldInfo field) {
+  public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
     throw new UnsupportedOperationException();
   }
 
   @Override
-  public SortedSetDocValues getSortedSet(FieldInfo field) {
+  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
     throw new UnsupportedOperationException();
   }
 
@@ -17,6 +17,8 @@
 package org.apache.lucene.index;
 
+import java.io.IOException;
+
 import org.apache.lucene.util.BytesRef;
 
 /**
@@ -103,7 +105,7 @@ public abstract class LegacySortedSetDocValues {
    * Returns a {@link TermsEnum} over the values.
    * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
    */
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     throw new UnsupportedOperationException();
   }
 }
@@ -95,7 +95,7 @@ final class SingletonSortedSetDocValues extends SortedSetDocValues {
   }
 
   @Override
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     return in.termsEnum();
   }
 
@@ -104,7 +104,7 @@ public abstract class SortedDocValues extends BinaryDocValues {
    * Returns a {@link TermsEnum} over the values.
    * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
    */
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     return new SortedDocValuesTermsEnum(this);
   }
 }
@@ -98,7 +98,7 @@ public abstract class SortedSetDocValues extends DocIdSetIterator {
    * Returns a {@link TermsEnum} over the values.
    * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
    */
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     return new SortedSetDocValuesTermsEnum(this);
   }
 }
@@ -81,7 +81,7 @@ public final class DocValuesRewriteMethod extends MultiTermQuery.RewriteMethod {
     TermsEnum termsEnum = query.getTermsEnum(new Terms() {
 
       @Override
-      public TermsEnum iterator() {
+      public TermsEnum iterator() throws IOException {
         return fcsi.termsEnum();
       }
 
@@ -46,7 +46,6 @@ public final class DirectMonotonicReader {
   public static class Meta implements Accountable {
     private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(Meta.class);
 
-    final long numValues;
     final int blockShift;
     final int numBlocks;
     final long[] mins;
@@ -55,7 +54,6 @@ public final class DirectMonotonicReader {
     final long[] offsets;
 
     Meta(long numValues, int blockShift) {
-      this.numValues = numValues;
       this.blockShift = blockShift;
       long numBlocks = numValues >>> blockShift;
       if ((numBlocks << blockShift) < numValues) {
@@ -25,14 +25,13 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import java.util.TreeSet;
+import java.util.function.Supplier;
 
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.asserting.AssertingCodec;
-import org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat;
+import org.apache.lucene.codecs.lucene70.Lucene70DocValuesProducer.SparseNumericDocValues;
+import org.apache.lucene.codecs.lucene70.Lucene70DocValuesProducer.SparseNumericDocValuesRandomAccessWrapper;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
@@ -62,7 +61,6 @@ import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum.SeekStatus;
+import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMFile;
@@ -70,7 +68,6 @@ import org.apache.lucene.store.RAMInputStream;
 import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.LongValues;
 import org.apache.lucene.util.TestUtil;
 
 /**
@@ -123,7 +120,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
   public void testTermsEnumFixedWidth() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 10, 10);
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> TestUtil.randomSimpleString(random(), 10, 10));
     }
   }
 
@@ -131,7 +128,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
   public void testTermsEnumVariableWidth() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 1, 500);
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> TestUtil.randomSimpleString(random(), 1, 500));
     }
   }
 
@@ -139,7 +136,21 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
   public void testTermsEnumRandomMany() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), 1, 500);
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), () -> TestUtil.randomSimpleString(random(), 1, 500));
     }
   }
 
+  public void testTermsEnumLongSharedPrefixes() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> {
+        char[] chars = new char[random().nextInt(500)];
+        Arrays.fill(chars, 'a');
+        if (chars.length > 0) {
+          chars[random().nextInt(chars.length)] = 'b';
+        }
+        return new String(chars);
+      });
+    }
+  }
+
@@ -269,7 +280,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
   // TODO: try to refactor this and some termsenum tests into the base class.
   // to do this we need to fix the test class to get a DVF not a Codec so we can setup
   // the postings format correctly.
-  private void doTestTermsEnumRandom(int numDocs, int minLength, int maxLength) throws Exception {
+  private void doTestTermsEnumRandom(int numDocs, Supplier<String> valuesProducer) throws Exception {
     Directory dir = newFSDirectory(createTempDir());
     IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
     conf.setMergeScheduler(new SerialMergeScheduler());
@@ -294,12 +305,11 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
       Document doc = new Document();
       Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
       doc.add(idField);
-      final int length = TestUtil.nextInt(random(), minLength, maxLength);
       int numValues = random().nextInt(17);
       // create a random list of strings
       List<String> values = new ArrayList<>();
       for (int v = 0; v < numValues; v++) {
-        values.add(TestUtil.randomSimpleString(random(), minLength, length));
+        values.add(valuesProducer.get());
       }
 
       // add in any order to the indexed field
@@ -429,6 +439,92 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
     }
   }
 
+  public void testSparseLongValues() throws IOException {
+    final int iters = atLeast(5);
+    for (int iter = 0; iter < iters; ++iter) {
+      final int numDocs = TestUtil.nextInt(random(), 0, 100);
+      final int[] docIds = new int[numDocs];
+      final long[] values = new long[numDocs];
+      final int maxDoc;
+      if (numDocs == 0) {
+        maxDoc = 1 + random().nextInt(10);
+      } else {
+        docIds[0] = random().nextInt(10);
+        for (int i = 1; i < docIds.length; ++i) {
+          docIds[i] = docIds[i - 1] + 1 + random().nextInt(100);
+        }
+        maxDoc = docIds[numDocs - 1] + 1 + random().nextInt(10);
+      }
+      for (int i = 0; i < values.length; ++i) {
+        values[i] = random().nextLong();
+      }
+      final long missingValue = random().nextLong();
+      final LongValues docIdsValues = new LongValues() {
+        @Override
+        public long get(long index) {
+          return docIds[Math.toIntExact(index)];
+        }
+      };
+      final LongValues valuesValues = new LongValues() {
+        @Override
+        public long get(long index) {
+          return values[Math.toIntExact(index)];
+        }
+      };
+      final SparseNumericDocValues sparseValues = new SparseNumericDocValues(numDocs, docIdsValues, valuesValues);
+
+      // sequential access
+      assertEquals(-1, sparseValues.docID());
+      for (int i = 0; i < docIds.length; ++i) {
+        assertEquals(docIds[i], sparseValues.nextDoc());
+      }
+      assertEquals(DocIdSetIterator.NO_MORE_DOCS, sparseValues.nextDoc());
+
+      // advance
+      for (int i = 0; i < 2000; ++i) {
+        final int target = TestUtil.nextInt(random(), 0, maxDoc);
+        int index = Arrays.binarySearch(docIds, target);
+        if (index < 0) {
+          index = -1 - index;
+        }
+        sparseValues.reset();
+        if (index > 0) {
+          assertEquals(docIds[index - 1], sparseValues.advance(Math.toIntExact(docIds[index - 1])));
+        }
+        if (index == docIds.length) {
+          assertEquals(DocIdSetIterator.NO_MORE_DOCS, sparseValues.advance(target));
+        } else {
+          assertEquals(docIds[index], sparseValues.advance(target));
+        }
+      }
+
+      final SparseNumericDocValuesRandomAccessWrapper raWrapper = new SparseNumericDocValuesRandomAccessWrapper(sparseValues, missingValue);
+
+      // random-access
+      for (int i = 0; i < 2000; ++i) {
+        final int docId = TestUtil.nextInt(random(), 0, maxDoc - 1);
+        final int idx = Arrays.binarySearch(docIds, docId);
+        final long value = raWrapper.get(docId);
+        if (idx >= 0) {
+          assertEquals(values[idx], value);
+        } else {
+          assertEquals(missingValue, value);
+        }
+      }
+
+      // sequential access
+      for (int docId = 0; docId < maxDoc; docId += random().nextInt(3)) {
+        final int idx = Arrays.binarySearch(docIds, docId);
+        final long value = raWrapper.get(docId);
+        if (idx >= 0) {
+          assertEquals(values[idx], value);
+        } else {
+          assertEquals(missingValue, value);
+        }
+      }
+    }
+  }
+
   @Slow
   public void testSortedSetAroundBlockSize() throws IOException {
     final int frontier = 1 << Lucene70DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;