LUCENE-6668: Added table encoding to sorted set/numeric doc values.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1692058 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2015-07-21 07:36:38 +00:00
parent 983d9efe76
commit 6a481c0a58
6 changed files with 429 additions and 57 deletions

View File

@ -352,6 +352,10 @@ Optimizations
and TermsQuery. This should especially help when there are lots of small and TermsQuery. This should especially help when there are lots of small
postings lists. (Adrien Grand, Mike McCandless) postings lists. (Adrien Grand, Mike McCandless)
* LUCENE-6668: Optimized storage for sorted set and sorted numeric doc values
in the case that there are few unique sets of values.
(Adrien Grand, Robert Muir)
Build Build
* LUCENE-6518: Don't report false thread leaks from IBM J9 * LUCENE-6518: Don't report false thread leaks from IBM J9

View File

@ -23,6 +23,11 @@ import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesConsumer;
@ -34,6 +39,7 @@ import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.MathUtil; import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.PagedBytes; import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.PagedBytes.PagedBytesDataInput; import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
@ -463,11 +469,22 @@ class Lucene50DocValuesConsumer extends DocValuesConsumer implements Closeable {
// The field is single-valued, we can encode it as NUMERIC // The field is single-valued, we can encode it as NUMERIC
addNumericField(field, singletonView(docToValueCount, values, null)); addNumericField(field, singletonView(docToValueCount, values, null));
} else { } else {
meta.writeVInt(SORTED_WITH_ADDRESSES); final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToValueCount, values);
// write the stream of values as a numeric field if (uniqueValueSets != null) {
addNumericField(field, values, true); meta.writeVInt(SORTED_SET_TABLE);
// write the doc -> ord count as a absolute index to the stream
addAddresses(field, docToValueCount); // write the set_id -> values mapping
writeDictionary(uniqueValueSets);
// write the doc -> set_id as a numeric field
addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), false);
} else {
meta.writeVInt(SORTED_WITH_ADDRESSES);
// write the stream of values as a numeric field
addNumericField(field, values, true);
// write the doc -> ord count as an absolute index to the stream
addAddresses(field, docToValueCount);
}
} }
} }
@ -481,20 +498,120 @@ class Lucene50DocValuesConsumer extends DocValuesConsumer implements Closeable {
// The field is single-valued, we can encode it as SORTED // The field is single-valued, we can encode it as SORTED
addSortedField(field, values, singletonView(docToOrdCount, ords, -1L)); addSortedField(field, values, singletonView(docToOrdCount, ords, -1L));
} else { } else {
meta.writeVInt(SORTED_WITH_ADDRESSES); final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToOrdCount, ords);
if (uniqueValueSets != null) {
meta.writeVInt(SORTED_SET_TABLE);
// write the ord -> byte[] as a binary field // write the set_id -> ords mapping
addTermsDict(field, values); writeDictionary(uniqueValueSets);
// write the stream of ords as a numeric field // write the ord -> byte[] as a binary field
// NOTE: we could return an iterator that delta-encodes these within a doc addTermsDict(field, values);
addNumericField(field, ords, false);
// write the doc -> ord count as a absolute index to the stream // write the doc -> set_id as a numeric field
addAddresses(field, docToOrdCount); addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), false);
} else {
meta.writeVInt(SORTED_WITH_ADDRESSES);
// write the ord -> byte[] as a binary field
addTermsDict(field, values);
// write the stream of ords as a numeric field
// NOTE: we could return an iterator that delta-encodes these within a doc
addNumericField(field, ords, false);
// write the doc -> ord count as a absolute index to the stream
addAddresses(field, docToOrdCount);
}
} }
} }
/**
 * Collects the distinct per-document value sets, or returns {@code null} when
 * table encoding does not apply: a single document carries more than 256
 * values, or the dictionary of unique sets would exceed 256 values in total.
 */
private SortedSet<LongsRef> uniqueValueSets(Iterable<Number> docToValueCount, Iterable<Number> values) {
  final Set<LongsRef> distinct = new HashSet<>();
  final LongsRef scratch = new LongsRef(256); // reused lookup key, copied only on insert
  final Iterator<Number> countIt = docToValueCount.iterator();
  final Iterator<Number> valueIt = values.iterator();
  int dictionarySize = 0;
  while (countIt.hasNext()) {
    final int count = countIt.next().intValue();
    if (count > 256) {
      // one document alone has too many values for table encoding
      return null;
    }
    scratch.length = count;
    for (int i = 0; i < count; ++i) {
      scratch.longs[i] = valueIt.next().longValue();
    }
    if (distinct.contains(scratch) == false) {
      dictionarySize += count;
      if (dictionarySize > 256) {
        // the dictionary would grow beyond the supported size
        return null;
      }
      distinct.add(new LongsRef(Arrays.copyOf(scratch.longs, count), 0, count));
    }
  }
  assert valueIt.hasNext() == false;
  // sort the sets so that set ids are assigned deterministically
  return new TreeSet<>(distinct);
}
/**
 * Writes the lookup table of unique value sets to the metadata stream:
 * the total number of values, the concatenated values of every set (in
 * set-id order), the number of sets, and finally the length of each set.
 */
private void writeDictionary(SortedSet<LongsRef> uniqueValueSets) throws IOException {
  int totalLength = 0;
  for (LongsRef set : uniqueValueSets) {
    totalLength += set.length;
  }
  meta.writeInt(totalLength);
  // concatenated values, set-id order
  for (LongsRef set : uniqueValueSets) {
    final int end = set.offset + set.length;
    for (int i = set.offset; i < end; ++i) {
      meta.writeLong(set.longs[i]);
    }
  }
  // per-set lengths let the reader rebuild set boundaries
  meta.writeInt(uniqueValueSets.size());
  for (LongsRef set : uniqueValueSets) {
    meta.writeInt(set.length);
  }
}
/**
 * Returns a view that maps every document to the id of its value set.
 * Set ids are the ordinals of the sets within the sorted dictionary, so the
 * numbering here must match the order used by {@code writeDictionary}.
 */
private Iterable<Number> docToSetId(SortedSet<LongsRef> uniqueValueSets, Iterable<Number> docToValueCount, Iterable<Number> values) {
  final Map<LongsRef, Integer> idByValueSet = new HashMap<>();
  int nextId = 0;
  for (LongsRef valueSet : uniqueValueSets) {
    idByValueSet.put(valueSet, nextId++);
  }
  assert nextId == uniqueValueSets.size();
  return new Iterable<Number>() {
    @Override
    public Iterator<Number> iterator() {
      final Iterator<Number> countIt = docToValueCount.iterator();
      final Iterator<Number> valueIt = values.iterator();
      // reused lookup key; never stored, so mutation between lookups is safe
      final LongsRef scratch = new LongsRef(256);
      return new Iterator<Number>() {
        @Override
        public boolean hasNext() {
          return countIt.hasNext();
        }
        @Override
        public Number next() {
          final int count = countIt.next().intValue();
          scratch.length = count;
          for (int i = 0; i < count; ++i) {
            scratch.longs[i] = valueIt.next().longValue();
          }
          final Integer id = idByValueSet.get(scratch);
          assert id != null; // every set was registered by uniqueValueSets
          return id;
        }
      };
    }
  };
}
// writes addressing information as MONOTONIC_COMPRESSED integer // writes addressing information as MONOTONIC_COMPRESSED integer
private void addAddresses(FieldInfo field, Iterable<Number> values) throws IOException { private void addAddresses(FieldInfo field, Iterable<Number> values) throws IOException {
meta.writeVInt(field.number); meta.writeVInt(field.number);

View File

@ -72,13 +72,21 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
* <p> * <p>
* {@link DocValuesType#SORTED_SET SORTED_SET}: * {@link DocValuesType#SORTED_SET SORTED_SET}:
* <ul> * <ul>
* <li>Single: if all documents have 0 or 1 value, then data are written like SORTED.
* <li>SortedSet table: when there are few unique sets of values (&lt; 256) then each set is assigned
* an id, a lookup table is written and the mapping from document to set id is written using the
* numeric strategies above.
* <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary, * <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary,
* an ordinal list and per-document index into this list are written using the numeric strategies * an ordinal list and per-document index into this list are written using the numeric strategies
* above. * above.
* </ul> * </ul>
* <p> * <p>
* {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}: * {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}:
* <ul> * <ul>
* <li>Single: if all documents have 0 or 1 value, then data are written like NUMERIC.
* <li>SortedSet table: when there are few unique sets of values (&lt; 256) then each set is assigned
* an id, a lookup table is written and the mapping from document to set id is written using the
* numeric strategies above.
* <li>SortedNumeric: a value list and per-document index into this list are written using the numeric * <li>SortedNumeric: a value list and per-document index into this list are written using the numeric
* strategies above. * strategies above.
* </ul> * </ul>
@ -108,21 +116,24 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
* <li>PrefixBinaryEntry --&gt; BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li> * <li>PrefixBinaryEntry --&gt; BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li>
* <li>BinaryHeader --&gt; FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li> * <li>BinaryHeader --&gt; FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li>
* <li>SortedEntry --&gt; FieldNumber,EntryType,BinaryEntry,NumericEntry</li> * <li>SortedEntry --&gt; FieldNumber,EntryType,BinaryEntry,NumericEntry</li>
* <li>SortedSetEntry --&gt; EntryType,BinaryEntry,NumericEntry,NumericEntry</li> * <li>SortedSetEntry --&gt; SingleSortedSetEntry | AddressesSortedSetEntry | TableSortedSetEntry</li>
* <li>SortedNumericEntry --&gt; EntryType,NumericEntry,NumericEntry</li> * <li>SingleSortedSetEntry --&gt; SetHeader,SortedEntry</li>
* <li>AddressesSortedSetEntry --&gt; SetHeader,BinaryEntry,NumericEntry,NumericEntry</li>
* <li>TableSortedSetEntry --&gt; SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,BinaryEntry,NumericEntry</li>
* <li>SetHeader --&gt; FieldNumber,EntryType,SetType</li>
* <li>SortedNumericEntry --&gt; SingleSortedNumericEntry | AddressesSortedNumericEntry | TableSortedNumericEntry</li>
 * <li>SingleSortedNumericEntry --&gt; SetHeader,NumericEntry</li>
* <li>AddressesSortedNumericEntry --&gt; SetHeader,NumericEntry,NumericEntry</li>
* <li>TableSortedNumericEntry --&gt; SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,NumericEntry</li>
* <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --&gt; {@link DataOutput#writeVInt VInt}</li> * <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>EntryType,CompressionType --&gt; {@link DataOutput#writeByte Byte}</li> * <li>EntryType,CompressionType --&gt; {@link DataOutput#writeByte Byte}</li>
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li> * <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
* <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --&gt; {@link DataOutput#writeLong Int64}</li> * <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --&gt; {@link DataOutput#writeLong Int64}</li>
* <li>TableSize,BitsPerValue --&gt; {@link DataOutput#writeVInt vInt}</li> * <li>TableSize,BitsPerValue,TotalTableLength --&gt; {@link DataOutput#writeVInt vInt}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li> * <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul> * </ul>
* <p>Sorted fields have two entries: a BinaryEntry with the value metadata, * <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
* and an ordinary NumericEntry for the document-to-ord metadata.</p> * and an ordinary NumericEntry for the document-to-ord metadata.</p>
* <p>SortedSet fields have three entries: a BinaryEntry with the value metadata,
* and two NumericEntries for the document-to-ord-index and ordinal list metadata.</p>
* <p>SortedNumeric fields have two entries: A NumericEntry with the value metadata,
* and a numeric entry with the document-to-value index.</p>
* <p>FieldNumber of -1 indicates the end of metadata.</p> * <p>FieldNumber of -1 indicates the end of metadata.</p>
* <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p> * <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p>
* <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p> * <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
@ -144,6 +155,15 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
* <li>1 --&gt; variable-width. An address for each value is stored. * <li>1 --&gt; variable-width. An address for each value is stored.
* <li>2 --&gt; prefix-compressed. An address to the start of every interval'th value is stored. * <li>2 --&gt; prefix-compressed. An address to the start of every interval'th value is stored.
* </ul> * </ul>
* <p>SetType indicates how SortedSet and SortedNumeric values will be stored:
* <ul>
* <li>0 --&gt; with addresses. There are two numeric entries: a first one from document to start
* offset, and a second one from offset to ord/value.
* <li>1 --&gt; single-valued. Used when all documents have at most one value and is encoded like
* a regular Sorted/Numeric entry.
* <li>2 --&gt; table-encoded. A lookup table of unique sets of values is written, followed by a
* numeric entry that maps each document to an ordinal in this table.
* </ul>
* <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values. * <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
* If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length). * If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
* Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize) * Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
@ -187,7 +207,8 @@ public final class Lucene50DocValuesFormat extends DocValuesFormat {
static final String META_CODEC = "Lucene50DocValuesMetadata"; static final String META_CODEC = "Lucene50DocValuesMetadata";
static final String META_EXTENSION = "dvm"; static final String META_EXTENSION = "dvm";
static final int VERSION_START = 0; static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START; static final int VERSION_SORTEDSET_TABLE = 1;
static final int VERSION_CURRENT = VERSION_SORTEDSET_TABLE;
// indicates docvalues type // indicates docvalues type
static final byte NUMERIC = 0; static final byte NUMERIC = 0;
@ -235,6 +256,9 @@ public final class Lucene50DocValuesFormat extends DocValuesFormat {
/** Single-valued sorted set values, encoded as sorted values, so no level /** Single-valued sorted set values, encoded as sorted values, so no level
* of indirection: {@code docId -> ord}. */ * of indirection: {@code docId -> ord}. */
static final int SORTED_SINGLE_VALUED = 1; static final int SORTED_SINGLE_VALUED = 1;
/** Compressed giving IDs to unique sets of values:
* {@code docId -> setId -> ords} */
static final int SORTED_SET_TABLE = 2;
/** placeholder for missing offset that means there are no missing values */ /** placeholder for missing offset that means there are no missing values */
static final int ALL_LIVE = -1; static final int ALL_LIVE = -1;

View File

@ -206,6 +206,28 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable {
ordIndexes.put(info.name, n2); ordIndexes.put(info.name, n2);
} }
/**
 * Reads the metadata of a table-encoded sorted set field: a binary entry
 * (the terms dictionary) followed by a numeric entry that maps each document
 * to a set id.
 *
 * @throws CorruptIndexException if the field number or entry-type markers do
 *         not match what the producer wrote
 */
private void readSortedSetFieldWithTable(FieldInfo info, IndexInput meta) throws IOException {
  final String corruptMessage = "sortedset entry for field: " + info.name + " is corrupt";
  if (meta.readVInt() != info.number) {
    throw new CorruptIndexException(corruptMessage, meta);
  }
  if (meta.readByte() != Lucene50DocValuesFormat.BINARY) {
    throw new CorruptIndexException(corruptMessage, meta);
  }
  binaries.put(info.name, readBinaryEntry(meta));
  if (meta.readVInt() != info.number) {
    throw new CorruptIndexException(corruptMessage, meta);
  }
  if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
    throw new CorruptIndexException(corruptMessage, meta);
  }
  ords.put(info.name, readNumericEntry(meta));
}
private int readFields(IndexInput meta, FieldInfos infos) throws IOException { private int readFields(IndexInput meta, FieldInfos infos) throws IOException {
int numFields = 0; int numFields = 0;
int fieldNumber = meta.readVInt(); int fieldNumber = meta.readVInt();
@ -229,6 +251,8 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable {
sortedSets.put(info.name, ss); sortedSets.put(info.name, ss);
if (ss.format == SORTED_WITH_ADDRESSES) { if (ss.format == SORTED_WITH_ADDRESSES) {
readSortedSetFieldWithAddresses(info, meta); readSortedSetFieldWithAddresses(info, meta);
} else if (ss.format == SORTED_SET_TABLE) {
readSortedSetFieldWithTable(info, meta);
} else if (ss.format == SORTED_SINGLE_VALUED) { } else if (ss.format == SORTED_SINGLE_VALUED) {
if (meta.readVInt() != fieldNumber) { if (meta.readVInt() != fieldNumber) {
throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
@ -243,14 +267,14 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable {
} else if (type == Lucene50DocValuesFormat.SORTED_NUMERIC) { } else if (type == Lucene50DocValuesFormat.SORTED_NUMERIC) {
SortedSetEntry ss = readSortedSetEntry(meta); SortedSetEntry ss = readSortedSetEntry(meta);
sortedNumerics.put(info.name, ss); sortedNumerics.put(info.name, ss);
if (meta.readVInt() != fieldNumber) {
throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
}
if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
}
numerics.put(info.name, readNumericEntry(meta));
if (ss.format == SORTED_WITH_ADDRESSES) { if (ss.format == SORTED_WITH_ADDRESSES) {
if (meta.readVInt() != fieldNumber) {
throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
}
if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
}
numerics.put(info.name, readNumericEntry(meta));
if (meta.readVInt() != fieldNumber) { if (meta.readVInt() != fieldNumber) {
throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
} }
@ -259,7 +283,24 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable {
} }
NumericEntry ordIndex = readNumericEntry(meta); NumericEntry ordIndex = readNumericEntry(meta);
ordIndexes.put(info.name, ordIndex); ordIndexes.put(info.name, ordIndex);
} else if (ss.format != SORTED_SINGLE_VALUED) { } else if (ss.format == SORTED_SET_TABLE) {
if (meta.readVInt() != info.number) {
throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
}
if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
}
NumericEntry n = readNumericEntry(meta);
ords.put(info.name, n);
} else if (ss.format == SORTED_SINGLE_VALUED) {
if (meta.readVInt() != fieldNumber) {
throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
}
if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
}
numerics.put(info.name, readNumericEntry(meta));
} else {
throw new AssertionError(); throw new AssertionError();
} }
} else { } else {
@ -346,7 +387,24 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable {
SortedSetEntry readSortedSetEntry(IndexInput meta) throws IOException { SortedSetEntry readSortedSetEntry(IndexInput meta) throws IOException {
SortedSetEntry entry = new SortedSetEntry(); SortedSetEntry entry = new SortedSetEntry();
entry.format = meta.readVInt(); entry.format = meta.readVInt();
if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) { if (entry.format == SORTED_SET_TABLE) {
final int totalTableLength = meta.readInt();
if (totalTableLength > 256) {
throw new CorruptIndexException("SORTED_SET_TABLE cannot have more than 256 values in its dictionary, got=" + totalTableLength, meta);
}
entry.table = new long[totalTableLength];
for (int i = 0; i < totalTableLength; ++i) {
entry.table[i] = meta.readLong();
}
final int tableSize = meta.readInt();
if (tableSize > totalTableLength + 1) { // +1 because of the empty set
throw new CorruptIndexException("SORTED_SET_TABLE cannot have more set ids than ords in its dictionary, got " + totalTableLength + " ords and " + tableSize + " sets", meta);
}
entry.tableOffsets = new int[tableSize + 1];
for (int i = 1; i < entry.tableOffsets.length; ++i) {
entry.tableOffsets[i] = entry.tableOffsets[i - 1] + meta.readInt();
}
} else if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) {
throw new CorruptIndexException("Unknown format: " + entry.format, meta); throw new CorruptIndexException("Unknown format: " + entry.format, meta);
} }
return entry; return entry;
@ -611,12 +669,14 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable {
@Override @Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
SortedSetEntry ss = sortedNumerics.get(field.name); SortedSetEntry ss = sortedNumerics.get(field.name);
NumericEntry numericEntry = numerics.get(field.name);
final LongValues values = getNumeric(numericEntry);
if (ss.format == SORTED_SINGLE_VALUED) { if (ss.format == SORTED_SINGLE_VALUED) {
NumericEntry numericEntry = numerics.get(field.name);
final LongValues values = getNumeric(numericEntry);
final Bits docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc); final Bits docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc);
return DocValues.singleton(values, docsWithField); return DocValues.singleton(values, docsWithField);
} else if (ss.format == SORTED_WITH_ADDRESSES) { } else if (ss.format == SORTED_WITH_ADDRESSES) {
NumericEntry numericEntry = numerics.get(field.name);
final LongValues values = getNumeric(numericEntry);
final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name)); final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name));
return new SortedNumericDocValues() { return new SortedNumericDocValues() {
@ -639,6 +699,33 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable {
return (int) (endOffset - startOffset); return (int) (endOffset - startOffset);
} }
}; };
} else if (ss.format == SORTED_SET_TABLE) {
NumericEntry entry = ords.get(field.name);
final LongValues ordinals = getNumeric(entry);
final long[] table = ss.table;
final int[] offsets = ss.tableOffsets;
return new SortedNumericDocValues() {
int startOffset;
int endOffset;
@Override
public void setDocument(int doc) {
final int ord = (int) ordinals.get(doc);
startOffset = offsets[ord];
endOffset = offsets[ord + 1];
}
@Override
public long valueAt(int index) {
return table[startOffset + index];
}
@Override
public int count() {
return endOffset - startOffset;
}
};
} else { } else {
throw new AssertionError(); throw new AssertionError();
} }
@ -647,13 +734,20 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable {
@Override @Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetEntry ss = sortedSets.get(field.name); SortedSetEntry ss = sortedSets.get(field.name);
if (ss.format == SORTED_SINGLE_VALUED) { switch (ss.format) {
final SortedDocValues values = getSorted(field); case SORTED_SINGLE_VALUED:
return DocValues.singleton(values); final SortedDocValues values = getSorted(field);
} else if (ss.format != SORTED_WITH_ADDRESSES) { return DocValues.singleton(values);
throw new AssertionError(); case SORTED_WITH_ADDRESSES:
return getSortedSetWithAddresses(field);
case SORTED_SET_TABLE:
return getSortedSetTable(field, ss);
default:
throw new AssertionError();
} }
}
private SortedSetDocValues getSortedSetWithAddresses(FieldInfo field) throws IOException {
final long valueCount = binaries.get(field.name).count; final long valueCount = binaries.get(field.name).count;
// we keep the byte[]s and list of ords on disk, these could be large // we keep the byte[]s and list of ords on disk, these could be large
final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
@ -722,7 +816,76 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable {
} }
}; };
} }
// Returns a view over a table-encoded sorted set field: each document stores a
// set id, which indexes into an in-memory lookup table of ord sets
// (ss.table holds the concatenated ords, ss.tableOffsets the set boundaries).
private SortedSetDocValues getSortedSetTable(FieldInfo field, SortedSetEntry ss) throws IOException {
final long valueCount = binaries.get(field.name).count;
// terms dictionary (ord -> byte[]), kept on disk
final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
// doc -> set id mapping, stored as a numeric field
final LongValues ordinals = getNumeric(ords.get(field.name));
final long[] table = ss.table;
final int[] offsets = ss.tableOffsets;
return new RandomAccessOrds() {
// window into `table` for the current document: [startOffset, endOffset);
// `offset` is the cursor used by nextOrd()
int offset, startOffset, endOffset;
@Override
public void setDocument(int docID) {
final int ord = (int) ordinals.get(docID);
offset = startOffset = offsets[ord];
endOffset = offsets[ord + 1];
}
@Override
public long ordAt(int index) {
return table[startOffset + index];
}
@Override
public long nextOrd() {
if (offset == endOffset) {
return NO_MORE_ORDS;
} else {
return table[offset++];
}
}
@Override
public int cardinality() {
return endOffset - startOffset;
}
@Override
public BytesRef lookupOrd(long ord) {
return binary.get(ord);
}
@Override
public long getValueCount() {
return valueCount;
}
@Override
public long lookupTerm(BytesRef key) {
// delegate to the compressed dictionary when available, which can binary
// search without materializing every term
if (binary instanceof CompressedBinaryDocValues) {
return ((CompressedBinaryDocValues) binary).lookupTerm(key);
} else {
return super.lookupTerm(key);
}
}
@Override
public TermsEnum termsEnum() {
if (binary instanceof CompressedBinaryDocValues) {
return ((CompressedBinaryDocValues) binary).getTermsEnum();
} else {
return super.termsEnum();
}
}
};
}
private Bits getLiveBits(final long offset, final int count) throws IOException { private Bits getLiveBits(final long offset, final int count) throws IOException {
if (offset == ALL_MISSING) { if (offset == ALL_MISSING) {
return new Bits.MatchNoBits(count); return new Bits.MatchNoBits(count);
@ -831,6 +994,9 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable {
static class SortedSetEntry { static class SortedSetEntry {
private SortedSetEntry() {} private SortedSetEntry() {}
int format; int format;
long[] table;
int[] tableOffsets;
} }
// internally we compose complex dv (sorted/sortedset) from other ones // internally we compose complex dv (sorted/sortedset) from other ones

View File

@ -64,7 +64,7 @@ public class TestLucene50DocValuesFormat extends BaseCompressingDocValuesFormatT
public void testSortedSetVariableLengthBigVsStoredFields() throws Exception { public void testSortedSetVariableLengthBigVsStoredFields() throws Exception {
int numIterations = atLeast(1); int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) { for (int i = 0; i < numIterations; i++) {
doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16); doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16, 100);
} }
} }
@ -72,7 +72,7 @@ public class TestLucene50DocValuesFormat extends BaseCompressingDocValuesFormatT
public void testSortedSetVariableLengthManyVsStoredFields() throws Exception { public void testSortedSetVariableLengthManyVsStoredFields() throws Exception {
int numIterations = atLeast(1); int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) { for (int i = 0; i < numIterations; i++) {
doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16); doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16, 100);
} }
} }

View File

@ -24,6 +24,7 @@ import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
@ -62,6 +63,8 @@ import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.TestUtil;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS; import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
/** /**
@ -1940,29 +1943,30 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
directory.close(); directory.close();
} }
protected void doTestSortedSetVsStoredFields(int numDocs, int minLength, int maxLength, int maxValuesPerDoc) throws Exception { protected void doTestSortedSetVsStoredFields(int numDocs, int minLength, int maxLength, int maxValuesPerDoc, int maxUniqueValues) throws Exception {
Directory dir = newFSDirectory(createTempDir("dvduel")); Directory dir = newFSDirectory(createTempDir("dvduel"));
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
Set<String> valueSet = new HashSet<String>();
for (int i = 0; i < 10000 && valueSet.size() < maxUniqueValues; ++i) {
final int length = TestUtil.nextInt(random(), minLength, maxLength);
valueSet.add(TestUtil.randomSimpleString(random(), length));
}
String[] uniqueValues = valueSet.toArray(new String[0]);
// index some docs // index some docs
for (int i = 0; i < numDocs; i++) { for (int i = 0; i < numDocs; i++) {
Document doc = new Document(); Document doc = new Document();
Field idField = new StringField("id", Integer.toString(i), Field.Store.NO); Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
doc.add(idField); doc.add(idField);
final int length;
if (minLength == maxLength) {
length = minLength; // fixed length
} else {
length = TestUtil.nextInt(random(), minLength, maxLength);
}
int numValues = TestUtil.nextInt(random(), 0, maxValuesPerDoc); int numValues = TestUtil.nextInt(random(), 0, maxValuesPerDoc);
// create a random set of strings // create a random set of strings
Set<String> values = new TreeSet<>(); Set<String> values = new TreeSet<>();
for (int v = 0; v < numValues; v++) { for (int v = 0; v < numValues; v++) {
values.add(TestUtil.randomSimpleString(random(), length)); values.add(RandomPicks.randomFrom(random(), uniqueValues));
} }
// add ordered to the stored field // add ordered to the stored field
for (String v : values) { for (String v : values) {
doc.add(new StoredField("stored", v)); doc.add(new StoredField("stored", v));
@ -2041,7 +2045,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
int numIterations = atLeast(1); int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) { for (int i = 0; i < numIterations; i++) {
int fixedLength = TestUtil.nextInt(random(), 1, 10); int fixedLength = TestUtil.nextInt(random(), 1, 10);
doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16); doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16, 100);
} }
} }
@ -2107,12 +2111,37 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
); );
} }
} }
public void testSortedNumericsFewUniqueSetsVsStoredFields() throws Exception {
assumeTrue("Codec does not support SORTED_NUMERIC", codecSupportsSortedNumeric());
final long[] values = new long[TestUtil.nextInt(random(), 2, 6)];
for (int i = 0; i < values.length; ++i) {
values[i] = random().nextLong();
}
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
doTestSortedNumericsVsStoredFields(
new LongProducer() {
@Override
long next() {
return TestUtil.nextLong(random(), 0, 6);
}
},
new LongProducer() {
@Override
long next() {
return values[random().nextInt(values.length)];
}
}
);
}
}
public void testSortedSetVariableLengthVsStoredFields() throws Exception { public void testSortedSetVariableLengthVsStoredFields() throws Exception {
assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
int numIterations = atLeast(1); int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) { for (int i = 0; i < numIterations; i++) {
doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16); doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16, 100);
} }
} }
@ -2121,7 +2150,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
int numIterations = atLeast(1); int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) { for (int i = 0; i < numIterations; i++) {
int fixedLength = TestUtil.nextInt(random(), 1, 10); int fixedLength = TestUtil.nextInt(random(), 1, 10);
doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1); doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1, 100);
} }
} }
@ -2129,7 +2158,39 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
int numIterations = atLeast(1); int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) { for (int i = 0; i < numIterations; i++) {
doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1); doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1, 100);
}
}
public void testSortedSetFixedLengthFewUniqueSetsVsStoredFields() throws Exception {
assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
doTestSortedSetVsStoredFields(atLeast(300), 10, 10, 6, 6);
}
}
public void testSortedSetVariableLengthFewUniqueSetsVsStoredFields() throws Exception {
assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 6, 6);
}
}
public void testSortedSetVariableLengthManyValuesPerDocVsStoredFields() throws Exception {
assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
doTestSortedSetVsStoredFields(atLeast(20), 1, 10, 500, 1000);
}
}
public void testSortedSetFixedLengthManyValuesPerDocVsStoredFields() throws Exception {
assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
doTestSortedSetVsStoredFields(atLeast(20), 10, 10, 500, 1000);
} }
} }