LUCENE-7489: Better sparsity support for Lucene70DocValuesFormat.

Adrien Grand 2016-10-12 12:55:16 +02:00
parent a4a314d160
commit 927fd51d64
14 changed files with 1476 additions and 2545 deletions


@@ -29,6 +29,9 @@ Bug Fixes
 Improvements
 
+* LUCENE-7489: Better storage of sparse doc-values fields with the default
+  codec. (Adrien Grand)
+
 Optimizations
 
 * LUCENE-7416: BooleanQuery optimizes queries that have queries that occur both


@@ -928,7 +928,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable {
   }
 
   @Override
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     if (binary instanceof CompressedBinaryDocValues) {
       return ((CompressedBinaryDocValues)binary).getTermsEnum();
     } else {
@@ -1233,7 +1233,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable {
   }
 
   @Override
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     if (binary instanceof CompressedBinaryDocValues) {
       return ((CompressedBinaryDocValues)binary).getTermsEnum();
     } else {
@@ -1292,7 +1292,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable {
   }
 
   @Override
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     if (binary instanceof CompressedBinaryDocValues) {
       return ((CompressedBinaryDocValues) binary).getTermsEnum();
     } else {
@@ -1490,12 +1490,8 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable {
     }
   }
 
-  TermsEnum getTermsEnum() {
-    try {
-      return getTermsEnum(data.clone());
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
+  TermsEnum getTermsEnum() throws IOException {
+    return getTermsEnum(data.clone());
   }
 
   private CompressedBinaryTermsEnum getTermsEnum(IndexInput input) throws IOException {
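This hunk shows the pattern that repeats across the commit: once termsEnum() declares throws IOException, helpers like getTermsEnum() can let the checked exception propagate instead of wrapping it in a RuntimeException. A minimal sketch of the before/after contract, with hypothetical names rather than Lucene's API:

    import java.io.BufferedReader;
    import java.io.IOException;

    abstract class TermsSource {
      // The abstract method now declares the checked exception, so
      // implementations no longer need a try/catch that rethrows
      // IOException as RuntimeException.
      abstract String next() throws IOException;
    }

    final class ReaderTermsSource extends TermsSource {
      private final BufferedReader reader;

      ReaderTermsSource(BufferedReader reader) {
        this.reader = reader;
      }

      @Override
      String next() throws IOException {
        return reader.readLine(); // propagates directly, no wrapping
      }
    }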


@@ -361,7 +361,7 @@ public abstract class DocValuesConsumer implements Closeable {
     addSortedNumericField(mergeFieldInfo,
         new EmptyDocValuesProducer() {
           @Override
-          public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) {
+          public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) throws IOException {
            if (fieldInfo != mergeFieldInfo) {
              throw new IllegalArgumentException("wrong FieldInfo");
            }
@@ -375,11 +375,7 @@ public abstract class DocValuesConsumer implements Closeable {
            if (docValuesProducer != null) {
              FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
              if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) {
-               try {
-                 values = docValuesProducer.getSortedNumeric(readerFieldInfo);
-               } catch (IOException ioe) {
-                 throw new RuntimeException(ioe);
-               }
+               values = docValuesProducer.getSortedNumeric(readerFieldInfo);
              }
            }
            if (values == null) {
@@ -391,12 +387,7 @@ public abstract class DocValuesConsumer implements Closeable {
 
            final long finalCost = cost;
 
-           final DocIDMerger<SortedNumericDocValuesSub> docIDMerger;
-           try {
-             docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
-           } catch (IOException ioe) {
-             throw new RuntimeException(ioe);
-           }
+           final DocIDMerger<SortedNumericDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
 
            return new SortedNumericDocValues() {
@@ -521,7 +512,7 @@ public abstract class DocValuesConsumer implements Closeable {
     addSortedField(fieldInfo,
         new EmptyDocValuesProducer() {
           @Override
-          public SortedDocValues getSorted(FieldInfo fieldInfoIn) {
+          public SortedDocValues getSorted(FieldInfo fieldInfoIn) throws IOException {
            if (fieldInfoIn != fieldInfo) {
              throw new IllegalArgumentException("wrong FieldInfo");
            }
@@ -536,11 +527,7 @@ public abstract class DocValuesConsumer implements Closeable {
            if (docValuesProducer != null) {
              FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name);
              if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) {
-               try {
-                 values = docValuesProducer.getSorted(readerFieldInfo);
-               } catch (IOException ioe) {
-                 throw new RuntimeException(ioe);
-               }
+               values = docValuesProducer.getSorted(readerFieldInfo);
              }
            }
            if (values == null) {
@@ -553,12 +540,7 @@ public abstract class DocValuesConsumer implements Closeable {
 
            final long finalCost = cost;
 
-           final DocIDMerger<SortedDocValuesSub> docIDMerger;
-           try {
-             docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
-           } catch (IOException ioe) {
-             throw new RuntimeException(ioe);
-           }
+           final DocIDMerger<SortedDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
 
            return new SortedDocValues() {
              private int docID = -1;
@@ -693,7 +675,7 @@ public abstract class DocValuesConsumer implements Closeable {
     addSortedSetField(mergeFieldInfo,
         new EmptyDocValuesProducer() {
           @Override
-          public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) {
+          public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
            if (fieldInfo != mergeFieldInfo) {
              throw new IllegalArgumentException("wrong FieldInfo");
            }
@@ -709,11 +691,7 @@ public abstract class DocValuesConsumer implements Closeable {
            if (docValuesProducer != null) {
              FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
              if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) {
-               try {
-                 values = docValuesProducer.getSortedSet(readerFieldInfo);
-               } catch (IOException ioe) {
-                 throw new RuntimeException(ioe);
-               }
+               values = docValuesProducer.getSortedSet(readerFieldInfo);
              }
            }
            if (values == null) {
@@ -723,12 +701,7 @@ public abstract class DocValuesConsumer implements Closeable {
              subs.add(new SortedSetDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i)));
            }
 
-           final DocIDMerger<SortedSetDocValuesSub> docIDMerger;
-           try {
-             docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
-           } catch (IOException ioe) {
-             throw new RuntimeException(ioe);
-           }
+           final DocIDMerger<SortedSetDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
 
            final long finalCost = cost;
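All three merge methods above now build a DocIDMerger directly and let its IOException propagate. For intuition, here is a rough, self-contained sketch of the k-way doc-ID merge service that DocIDMerger provides when an index sort may interleave documents from different segments; names and types here are illustrative, not Lucene's API:

    import java.util.Comparator;
    import java.util.List;
    import java.util.PriorityQueue;

    final class MergedDocIdStream {
      /** One source segment; mapped doc IDs come back in increasing order. */
      interface Sub {
        /** Returns the next mapped doc ID, or -1 when exhausted. */
        int nextMappedDoc();
      }

      private final PriorityQueue<int[]> queue; // entries are {currentDoc, subIndex}
      private final List<Sub> subs;

      MergedDocIdStream(List<Sub> subs) {
        this.subs = subs;
        this.queue = new PriorityQueue<>(Comparator.comparingInt((int[] e) -> e[0]));
        for (int i = 0; i < subs.size(); i++) {
          int doc = subs.get(i).nextMappedDoc();
          if (doc != -1) {
            queue.add(new int[] {doc, i});
          }
        }
      }

      /** Returns the next merged doc ID in increasing order, or -1 when done. */
      int nextDoc() {
        int[] top = queue.poll();
        if (top == null) {
          return -1;
        }
        // Refill from the sub we just consumed so its next doc competes too.
        int next = subs.get(top[1]).nextMappedDoc();
        if (next != -1) {
          queue.add(new int[] {next, top[1]});
        }
        return top[0];
      }
    }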


@@ -23,39 +23,64 @@ import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.SmallFloat;
 import org.apache.lucene.util.packed.DirectWriter;
 
 /**
 * Lucene 7.0 DocValues format.
 * <p>
-* Encodes the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) with these strategies:
+* Documents that have a value for the field are encoded in a way that it is always possible to
+* know the ordinal of the current document in the set of documents that have a value. For instance,
+* say the set of documents that have a value for the field is <tt>{1, 5, 6, 11}</tt>. When the
+* iterator is on <tt>6</tt>, it knows that this is the 3rd item of the set. This way, values can
+* be stored densely and accessed based on their index at search time. If all documents in a segment
+* have a value for the field, the index is the same as the doc ID, so this case is encoded implicitly
+* and is very fast at query time. On the other hand if some documents are missing a value for the
+* field then the set of documents that have a value is encoded into blocks. All doc IDs that share
+* the same upper 16 bits are encoded into the same block with the following strategies:
+* <ul>
+*   <li>SPARSE: This strategy is used when a block contains at most 4095 documents. The lower 16
+*       bits of doc IDs are stored as {@link DataOutput#writeShort(short) shorts} while the upper
+*       16 bits are given by the block ID.
+*   <li>DENSE: This strategy is used when a block contains between 4096 and 65535 documents. The
+*       lower bits of doc IDs are stored in a bit set. Advancing is performed using
+*       {@link Long#numberOfTrailingZeros(long) ntz} operations while the index is computed by
+*       accumulating the {@link Long#bitCount(long) bit counts} of the visited longs.
+*   <li>ALL: This strategy is used when a block contains exactly 65536 documents, meaning that
+*       the block is full. In that case doc IDs do not need to be stored explicitly. This is
+*       typically faster than both SPARSE and DENSE which is a reason why it is preferable to have
+*       all documents that have a value for a field using contiguous doc IDs, for instance by
+*       using {@link IndexWriterConfig#setIndexSort(org.apache.lucene.search.Sort) index sorting}.
+* </ul>
+* <p>
+* Then the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) are
+* encoded using the following strategies:
 * <p>
 * {@link DocValuesType#NUMERIC NUMERIC}:
 * <ul>
 *   <li>Delta-compressed: per-document integers written as deltas from the minimum value,
 *       compressed with bitpacking. For more information, see {@link DirectWriter}.
 *   <li>Table-compressed: when the number of unique values is very small (&lt; 256), and
-*       when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
-*       a lookup table is written instead. Each per-document entry is instead the ordinal
-*       to this table, and those ordinals are compressed with bitpacking ({@link DirectWriter}).
+*       when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
+*       a lookup table is written instead. Each per-document entry is instead the ordinal
+*       to this table, and those ordinals are compressed with bitpacking ({@link DirectWriter}).
 *   <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest
 *       common denominator (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
 *   <li>Monotonic-compressed: when all numbers are monotonically increasing offsets, they are written
 *       as blocks of bitpacked integers, encoding the deviation from the expected delta.
-*   <li>Const-compressed: when there is only one possible non-missing value, only the missing
-*       bitset is encoded.
-*   <li>Sparse-compressed: only documents with a value are stored, and lookups are performed
-*       using binary search.
+*   <li>Const-compressed: when there is only one possible value, no per-document data is needed and
+*       this value is encoded alone.
 * </ul>
 * <p>
 * {@link DocValuesType#BINARY BINARY}:
 * <ul>
 *   <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
-*       Each document's value can be addressed directly with multiplication ({@code docID * length}).
-*   <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
+*       Each document's value can be addressed directly with multiplication ({@code docID * length}).
+*   <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
 *       for each document. The addresses are written as Monotonic-compressed numerics.
 *   <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written
 *       completely and other values sharing prefixes. chunk addresses are written as Monotonic-compressed
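As an aside on the GCD-compressed strategy listed above: values that share a common divisor (dates at day granularity, prices in cents, and so on) are stored as small quotients relative to the minimum. A hedged sketch of the idea, not the actual Lucene70 writer, and ignoring overflow when values span the full long range:

    import java.math.BigInteger;

    final class GcdEncodingSketch {
      /** Returns {min, gcd} so that every value can be stored as (value - min) / gcd. */
      static long[] plan(long[] values) {
        long min = Long.MAX_VALUE;
        for (long v : values) {
          min = Math.min(min, v);
        }
        long gcd = 0;
        for (long v : values) {
          // gcd(0, x) == x, so the first delta seeds the computation
          gcd = BigInteger.valueOf(gcd).gcd(BigInteger.valueOf(v - min)).longValueExact();
        }
        return new long[] {min, gcd == 0 ? 1 : gcd};
      }

      static long encode(long value, long min, long gcd) {
        return (value - min) / gcd; // small quotient, cheap to bitpack
      }

      static long decode(long quotient, long min, long gcd) {
        return quotient * gcd + min;
      }
    }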
@@ -64,27 +89,21 @@ import org.apache.lucene.util.packed.DirectWriter;
 * <p>
 * {@link DocValuesType#SORTED SORTED}:
 * <ul>
-*   <li>Sorted: a mapping of ordinals to deduplicated terms is written as Binary,
+*   <li>Sorted: a mapping of ordinals to deduplicated terms is written as Prefix-compressed Binary,
 *       along with the per-document ordinals written using one of the numeric strategies above.
 * </ul>
 * <p>
 * {@link DocValuesType#SORTED_SET SORTED_SET}:
 * <ul>
 *   <li>Single: if all documents have 0 or 1 value, then data are written like SORTED.
-*   <li>SortedSet table: when there are few unique sets of values (&lt; 256) then each set is assigned
-*       an id, a lookup table is written and the mapping from document to set id is written using the
-*       numeric strategies above.
-*   <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary,
-*       an ordinal list and per-document index into this list are written using the numeric strategies
+*   <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary,
+*       an ordinal list and per-document index into this list are written using the numeric strategies
 *       above.
 * </ul>
 * <p>
 * {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}:
 * <ul>
 *   <li>Single: if all documents have 0 or 1 value, then data are written like NUMERIC.
-*   <li>SortedSet table: when there are few unique sets of values (&lt; 256) then each set is assigned
-*       an id, a lookup table is written and the mapping from document to set id is written using the
-*       numeric strategies above.
 *   <li>SortedNumeric: a value list and per-document index into this list are written using the numeric
 *       strategies above.
 * </ul>
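The "value list and per-document index" layout used by SORTED_NUMERIC (and, with ordinals, SORTED_SET) can be pictured as a flat array plus a monotonically increasing address array, which is why those addresses compress well as Monotonic-compressed numerics. An illustrative sketch, not the on-disk format:

    import java.util.Arrays;

    final class MultiValueLayoutSketch {
      final long[] values; // all documents' values, concatenated in doc ID order
      final int[] starts;  // starts[doc]..starts[doc + 1] delimit doc's values

      MultiValueLayoutSketch(long[] values, int[] starts) {
        this.values = values;
        this.starts = starts;
      }

      long[] valuesFor(int doc) {
        return Arrays.copyOfRange(values, starts[doc], starts[doc + 1]);
      }

      int count(int doc) {
        return starts[doc + 1] - starts[doc];
      }
    }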
@@ -112,72 +131,30 @@ public final class Lucene70DocValuesFormat extends DocValuesFormat {
   public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
     return new Lucene70DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
   }
 
-  static final String DATA_CODEC = "Lucene54DocValuesData";
+  static final String DATA_CODEC = "Lucene70DocValuesData";
   static final String DATA_EXTENSION = "dvd";
-  static final String META_CODEC = "Lucene54DocValuesMetadata";
+  static final String META_CODEC = "Lucene70DocValuesMetadata";
   static final String META_EXTENSION = "dvm";
   static final int VERSION_START = 0;
   static final int VERSION_CURRENT = VERSION_START;
 
   // indicates docvalues type
   static final byte NUMERIC = 0;
   static final byte BINARY = 1;
   static final byte SORTED = 2;
   static final byte SORTED_SET = 3;
   static final byte SORTED_NUMERIC = 4;
 
-  // address terms in blocks of 16 terms
-  static final int INTERVAL_SHIFT = 4;
-  static final int INTERVAL_COUNT = 1 << INTERVAL_SHIFT;
-  static final int INTERVAL_MASK = INTERVAL_COUNT - 1;
-
-  // build reverse index from every 1024th term
-  static final int REVERSE_INTERVAL_SHIFT = 10;
-  static final int REVERSE_INTERVAL_COUNT = 1 << REVERSE_INTERVAL_SHIFT;
-  static final int REVERSE_INTERVAL_MASK = REVERSE_INTERVAL_COUNT - 1;
-
-  // for conversion from reverse index to block
-  static final int BLOCK_INTERVAL_SHIFT = REVERSE_INTERVAL_SHIFT - INTERVAL_SHIFT;
-  static final int BLOCK_INTERVAL_COUNT = 1 << BLOCK_INTERVAL_SHIFT;
-  static final int BLOCK_INTERVAL_MASK = BLOCK_INTERVAL_COUNT - 1;
-
-  /** Compressed using packed blocks of ints. */
-  static final int DELTA_COMPRESSED = 0;
-  /** Compressed by computing the GCD. */
-  static final int GCD_COMPRESSED = 1;
-  /** Compressed by giving IDs to unique values. */
-  static final int TABLE_COMPRESSED = 2;
-  /** Compressed with monotonically increasing values */
-  static final int MONOTONIC_COMPRESSED = 3;
-  /** Compressed with constant value (uses only missing bitset) */
-  static final int CONST_COMPRESSED = 4;
-  /** Compressed with sparse arrays. */
-  static final int SPARSE_COMPRESSED = 5;
-
-  /** Uncompressed binary, written directly (fixed length). */
-  static final int BINARY_FIXED_UNCOMPRESSED = 0;
-  /** Uncompressed binary, written directly (variable length). */
-  static final int BINARY_VARIABLE_UNCOMPRESSED = 1;
-  /** Compressed binary with shared prefixes */
-  static final int BINARY_PREFIX_COMPRESSED = 2;
-
-  /** Standard storage for sorted set values with 1 level of indirection:
-   *  {@code docId -> address -> ord}. */
-  static final int SORTED_WITH_ADDRESSES = 0;
-  /** Single-valued sorted set values, encoded as sorted values, so no level
-   *  of indirection: {@code docId -> ord}. */
-  static final int SORTED_SINGLE_VALUED = 1;
-  /** Compressed giving IDs to unique sets of values:
-   *  {@code docId -> setId -> ords} */
-  static final int SORTED_SET_TABLE = 2;
-
-  /** placeholder for missing offset that means there are no missing values */
-  static final int ALL_LIVE = -1;
-  /** placeholder for missing offset that means all values are missing */
-  static final int ALL_MISSING = -2;
-
-  // addressing uses 16k blocks
-  static final int MONOTONIC_BLOCK_SIZE = 16384;
   static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16;
+
+  static final int TERMS_DICT_BLOCK_SHIFT = 4;
+  static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT;
+  static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1;
+
+  static final int TERMS_DICT_REVERSE_INDEX_SHIFT = 10;
+  static final int TERMS_DICT_REVERSE_INDEX_SIZE = 1 << TERMS_DICT_REVERSE_INDEX_SHIFT;
+  static final int TERMS_DICT_REVERSE_INDEX_MASK = TERMS_DICT_REVERSE_INDEX_SIZE - 1;
 }
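The SPARSE/DENSE/ALL thresholds from the class javadoc, plus the bitCount-based index computation that DENSE blocks rely on, can be sketched as follows. This is a simplified illustration of the documented strategy, not the actual encoder:

    final class BlockEncodingSketch {
      enum Encoding { SPARSE, DENSE, ALL }

      /** docsInBlock = number of documents with a value among one block of 65536 doc IDs. */
      static Encoding choose(int docsInBlock) {
        if (docsInBlock <= 4095) {
          return Encoding.SPARSE; // lower 16 bits of each doc ID stored as shorts
        } else if (docsInBlock < 65536) {
          return Encoding.DENSE;  // lower 16 bits stored in a 65536-bit set
        } else {
          return Encoding.ALL;    // block is full, doc IDs are implicit
        }
      }

      /**
       * DENSE index-within-set lookup: the index of a document in the set of
       * documents that have a value is the number of set bits before it,
       * accumulated word by word with Long.bitCount.
       */
      static int indexOf(long[] bits, int docIdLow16) {
        int word = docIdLow16 >>> 6;
        int index = 0;
        for (int i = 0; i < word; ++i) {
          index += Long.bitCount(bits[i]);
        }
        return index + Long.bitCount(bits[word] & ((1L << (docIdLow16 & 63)) - 1));
      }
    }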


@@ -44,12 +44,12 @@ public abstract class EmptyDocValuesProducer extends DocValuesProducer {
   }
 
   @Override
-  public SortedNumericDocValues getSortedNumeric(FieldInfo field) {
+  public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
     throw new UnsupportedOperationException();
   }
 
   @Override
-  public SortedSetDocValues getSortedSet(FieldInfo field) {
+  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
     throw new UnsupportedOperationException();
   }


@@ -17,6 +17,8 @@
 package org.apache.lucene.index;
 
+import java.io.IOException;
+
 import org.apache.lucene.util.BytesRef;
 
 /**
@@ -103,7 +105,7 @@ public abstract class LegacySortedSetDocValues {
    * Returns a {@link TermsEnum} over the values.
    * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
    */
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     throw new UnsupportedOperationException();
   }
 }


@@ -95,7 +95,7 @@ final class SingletonSortedSetDocValues extends SortedSetDocValues {
   }
 
   @Override
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     return in.termsEnum();
   }


@@ -104,7 +104,7 @@ public abstract class SortedDocValues extends BinaryDocValues {
    * Returns a {@link TermsEnum} over the values.
    * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
    */
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     return new SortedDocValuesTermsEnum(this);
   }
 }


@@ -98,7 +98,7 @@ public abstract class SortedSetDocValues extends DocIdSetIterator {
    * Returns a {@link TermsEnum} over the values.
    * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
    */
-  public TermsEnum termsEnum() {
+  public TermsEnum termsEnum() throws IOException {
     return new SortedSetDocValuesTermsEnum(this);
   }
 }


@@ -81,7 +81,7 @@ public final class DocValuesRewriteMethod extends MultiTermQuery.RewriteMethod {
     TermsEnum termsEnum = query.getTermsEnum(new Terms() {
 
       @Override
-      public TermsEnum iterator() {
+      public TermsEnum iterator() throws IOException {
         return fcsi.termsEnum();
       }


@@ -46,7 +46,6 @@ public final class DirectMonotonicReader {
   public static class Meta implements Accountable {
     private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(Meta.class);
 
     final long numValues;
-    final int blockShift;
     final int numBlocks;
     final long[] mins;
@@ -55,7 +54,6 @@ public final class DirectMonotonicReader {
     final long[] offsets;
 
     Meta(long numValues, int blockShift) {
       this.numValues = numValues;
-      this.blockShift = blockShift;
       long numBlocks = numValues >>> blockShift;
       if ((numBlocks << blockShift) < numValues) {
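The constructor rounds numValues / 2^blockShift up to the next whole block. A sketch of that ceil-division-by-power-of-two idiom, isolated for clarity and assuming the branch above increments the block count:

    final class BlockMath {
      // ceil(numValues / 2^blockShift) without floating point
      static long numBlocks(long numValues, int blockShift) {
        long numBlocks = numValues >>> blockShift;
        if ((numBlocks << blockShift) < numValues) {
          ++numBlocks;
        }
        return numBlocks;
      }
    }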


@@ -25,14 +25,13 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import java.util.TreeSet;
+import java.util.function.Supplier;
 
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.asserting.AssertingCodec;
-import org.apache.lucene.codecs.lucene70.Lucene70DocValuesProducer.SparseNumericDocValues;
-import org.apache.lucene.codecs.lucene70.Lucene70DocValuesProducer.SparseNumericDocValuesRandomAccessWrapper;
 import org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
@@ -62,7 +61,6 @@ import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum.SeekStatus;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMFile;
@@ -70,7 +68,6 @@ import org.apache.lucene.store.RAMInputStream;
 import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.LongValues;
 import org.apache.lucene.util.TestUtil;
 
 /**
@@ -123,7 +120,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
   public void testTermsEnumFixedWidth() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 10, 10);
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> TestUtil.randomSimpleString(random(), 10, 10));
     }
   }
@@ -131,7 +128,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
   public void testTermsEnumVariableWidth() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), 1, 500);
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> TestUtil.randomSimpleString(random(), 1, 500));
     }
   }
@@ -139,7 +136,21 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
   public void testTermsEnumRandomMany() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), 1, 500);
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), () -> TestUtil.randomSimpleString(random(), 1, 500));
     }
   }
 
+  public void testTermsEnumLongSharedPrefixes() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> {
+        char[] chars = new char[random().nextInt(500)];
+        Arrays.fill(chars, 'a');
+        if (chars.length > 0) {
+          chars[random().nextInt(chars.length)] = 'b';
+        }
+        return new String(chars);
+      });
+    }
+  }
@@ -269,7 +280,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
   // TODO: try to refactor this and some termsenum tests into the base class.
   // to do this we need to fix the test class to get a DVF not a Codec so we can setup
   // the postings format correctly.
-  private void doTestTermsEnumRandom(int numDocs, int minLength, int maxLength) throws Exception {
+  private void doTestTermsEnumRandom(int numDocs, Supplier<String> valuesProducer) throws Exception {
     Directory dir = newFSDirectory(createTempDir());
     IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
     conf.setMergeScheduler(new SerialMergeScheduler());
@@ -294,12 +305,11 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
       Document doc = new Document();
       Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
       doc.add(idField);
-      final int length = TestUtil.nextInt(random(), minLength, maxLength);
       int numValues = random().nextInt(17);
       // create a random list of strings
       List<String> values = new ArrayList<>();
       for (int v = 0; v < numValues; v++) {
-        values.add(TestUtil.randomSimpleString(random(), minLength, length));
+        values.add(valuesProducer.get());
       }
 
       // add in any order to the indexed field
@@ -429,92 +439,6 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
     }
   }
 
-  public void testSparseLongValues() throws IOException {
-    final int iters = atLeast(5);
-    for (int iter = 0; iter < iters; ++iter) {
-      final int numDocs = TestUtil.nextInt(random(), 0, 100);
-      final int[] docIds = new int[numDocs];
-      final long[] values = new long[numDocs];
-      final int maxDoc;
-      if (numDocs == 0) {
-        maxDoc = 1 + random().nextInt(10);
-      } else {
-        docIds[0] = random().nextInt(10);
-        for (int i = 1; i < docIds.length; ++i) {
-          docIds[i] = docIds[i - 1] + 1 + random().nextInt(100);
-        }
-        maxDoc = docIds[numDocs - 1] + 1 + random().nextInt(10);
-      }
-      for (int i = 0; i < values.length; ++i) {
-        values[i] = random().nextLong();
-      }
-      final long missingValue = random().nextLong();
-      final LongValues docIdsValues = new LongValues() {
-        @Override
-        public long get(long index) {
-          return docIds[Math.toIntExact(index)];
-        }
-      };
-      final LongValues valuesValues = new LongValues() {
-        @Override
-        public long get(long index) {
-          return values[Math.toIntExact(index)];
-        }
-      };
-      final SparseNumericDocValues sparseValues = new SparseNumericDocValues(numDocs, docIdsValues, valuesValues);
-
-      // sequential access
-      assertEquals(-1, sparseValues.docID());
-      for (int i = 0; i < docIds.length; ++i) {
-        assertEquals(docIds[i], sparseValues.nextDoc());
-      }
-      assertEquals(DocIdSetIterator.NO_MORE_DOCS, sparseValues.nextDoc());
-
-      // advance
-      for (int i = 0; i < 2000; ++i) {
-        final int target = TestUtil.nextInt(random(), 0, maxDoc);
-        int index = Arrays.binarySearch(docIds, target);
-        if (index < 0) {
-          index = -1 - index;
-        }
-        sparseValues.reset();
-        if (index > 0) {
-          assertEquals(docIds[index - 1], sparseValues.advance(Math.toIntExact(docIds[index - 1])));
-        }
-        if (index == docIds.length) {
-          assertEquals(DocIdSetIterator.NO_MORE_DOCS, sparseValues.advance(target));
-        } else {
-          assertEquals(docIds[index], sparseValues.advance(target));
-        }
-      }
-
-      final SparseNumericDocValuesRandomAccessWrapper raWrapper = new SparseNumericDocValuesRandomAccessWrapper(sparseValues, missingValue);
-
-      // random-access
-      for (int i = 0; i < 2000; ++i) {
-        final int docId = TestUtil.nextInt(random(), 0, maxDoc - 1);
-        final int idx = Arrays.binarySearch(docIds, docId);
-        final long value = raWrapper.get(docId);
-        if (idx >= 0) {
-          assertEquals(values[idx], value);
-        } else {
-          assertEquals(missingValue, value);
-        }
-      }
-
-      // sequential access
-      for (int docId = 0; docId < maxDoc; docId += random().nextInt(3)) {
-        final int idx = Arrays.binarySearch(docIds, docId);
-        final long value = raWrapper.get(docId);
-        if (idx >= 0) {
-          assertEquals(values[idx], value);
-        } else {
-          assertEquals(missingValue, value);
-        }
-      }
-    }
-  }
-
   @Slow
   public void testSortedSetAroundBlockSize() throws IOException {
     final int frontier = 1 << Lucene70DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;