From 6a481c0a5821b99e8a70fde79783541766830bb6 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 21 Jul 2015 07:36:38 +0000 Subject: [PATCH] LUCENE-6668: Added table encoding to sorted set/numeric doc values. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1692058 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 + .../lucene50/Lucene50DocValuesConsumer.java | 145 +++++++++++-- .../lucene50/Lucene50DocValuesFormat.java | 42 +++- .../lucene50/Lucene50DocValuesProducer.java | 200 ++++++++++++++++-- .../lucene50/TestLucene50DocValuesFormat.java | 4 +- .../index/BaseDocValuesFormatTestCase.java | 91 ++++++-- 6 files changed, 429 insertions(+), 57 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 93756131afa..d8111456c4d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -352,6 +352,10 @@ Optimizations and TermsQuery. This should especially help when there are lots of small postings lists. (Adrien Grand, Mike McCandless) +* LUCENE-6668: Optimized storage for sorted set and sorted numeric doc values + in the case that there are few unique sets of values. + (Adrien Grand, Robert Muir) + Build * LUCENE-6518: Don't report false thread leaks from IBM J9 diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java index ab23cdd5e62..7c8782259f3 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java @@ -23,6 +23,11 @@ import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesConsumer; @@ -34,6 +39,7 @@ import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LongsRef; import org.apache.lucene.util.MathUtil; import org.apache.lucene.util.PagedBytes; import org.apache.lucene.util.PagedBytes.PagedBytesDataInput; @@ -463,11 +469,22 @@ class Lucene50DocValuesConsumer extends DocValuesConsumer implements Closeable { // The field is single-valued, we can encode it as NUMERIC addNumericField(field, singletonView(docToValueCount, values, null)); } else { - meta.writeVInt(SORTED_WITH_ADDRESSES); - // write the stream of values as a numeric field - addNumericField(field, values, true); - // write the doc -> ord count as a absolute index to the stream - addAddresses(field, docToValueCount); + final SortedSet uniqueValueSets = uniqueValueSets(docToValueCount, values); + if (uniqueValueSets != null) { + meta.writeVInt(SORTED_SET_TABLE); + + // write the set_id -> values mapping + writeDictionary(uniqueValueSets); + + // write the doc -> set_id as a numeric field + addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), false); + } else { + meta.writeVInt(SORTED_WITH_ADDRESSES); + // write the stream of values as a numeric field + addNumericField(field, values, true); + // write the doc -> ord count as a absolute index to the stream + addAddresses(field, docToValueCount); + } } } @@ -481,20 +498,120 @@ class Lucene50DocValuesConsumer extends DocValuesConsumer implements 
Closeable { // The field is single-valued, we can encode it as SORTED addSortedField(field, values, singletonView(docToOrdCount, ords, -1L)); } else { - meta.writeVInt(SORTED_WITH_ADDRESSES); + final SortedSet uniqueValueSets = uniqueValueSets(docToOrdCount, ords); + if (uniqueValueSets != null) { + meta.writeVInt(SORTED_SET_TABLE); - // write the ord -> byte[] as a binary field - addTermsDict(field, values); + // write the set_id -> ords mapping + writeDictionary(uniqueValueSets); - // write the stream of ords as a numeric field - // NOTE: we could return an iterator that delta-encodes these within a doc - addNumericField(field, ords, false); + // write the ord -> byte[] as a binary field + addTermsDict(field, values); - // write the doc -> ord count as a absolute index to the stream - addAddresses(field, docToOrdCount); + // write the doc -> set_id as a numeric field + addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), false); + } else { + meta.writeVInt(SORTED_WITH_ADDRESSES); + + // write the ord -> byte[] as a binary field + addTermsDict(field, values); + + // write the stream of ords as a numeric field + // NOTE: we could return an iterator that delta-encodes these within a doc + addNumericField(field, ords, false); + + // write the doc -> ord count as a absolute index to the stream + addAddresses(field, docToOrdCount); + } } } - + + private SortedSet uniqueValueSets(Iterable docToValueCount, Iterable values) { + Set uniqueValueSet = new HashSet<>(); + LongsRef docValues = new LongsRef(256); + + Iterator valueCountIterator = docToValueCount.iterator(); + Iterator valueIterator = values.iterator(); + int totalDictSize = 0; + while (valueCountIterator.hasNext()) { + docValues.length = valueCountIterator.next().intValue(); + if (docValues.length > 256) { + return null; + } + for (int i = 0; i < docValues.length; ++i) { + docValues.longs[i] = valueIterator.next().longValue(); + } + if (uniqueValueSet.contains(docValues)) { + continue; + } + totalDictSize += docValues.length; + if (totalDictSize > 256) { + return null; + } + uniqueValueSet.add(new LongsRef(Arrays.copyOf(docValues.longs, docValues.length), 0, docValues.length)); + } + assert valueIterator.hasNext() == false; + return new TreeSet<>(uniqueValueSet); + } + + private void writeDictionary(SortedSet uniqueValueSets) throws IOException { + int lengthSum = 0; + for (LongsRef longs : uniqueValueSets) { + lengthSum += longs.length; + } + + meta.writeInt(lengthSum); + for (LongsRef valueSet : uniqueValueSets) { + for (int i = 0; i < valueSet.length; ++i) { + meta.writeLong(valueSet.longs[valueSet.offset + i]); + } + } + + meta.writeInt(uniqueValueSets.size()); + for (LongsRef valueSet : uniqueValueSets) { + meta.writeInt(valueSet.length); + } + } + + private Iterable docToSetId(SortedSet uniqueValueSets, Iterable docToValueCount, Iterable values) { + final Map setIds = new HashMap<>(); + int i = 0; + for (LongsRef set : uniqueValueSets) { + setIds.put(set, i++); + } + assert i == uniqueValueSets.size(); + + return new Iterable() { + + @Override + public Iterator iterator() { + final Iterator valueCountIterator = docToValueCount.iterator(); + final Iterator valueIterator = values.iterator(); + final LongsRef docValues = new LongsRef(256); + return new Iterator() { + + @Override + public boolean hasNext() { + return valueCountIterator.hasNext(); + } + + @Override + public Number next() { + docValues.length = valueCountIterator.next().intValue(); + for (int i = 0; i < docValues.length; ++i) { + docValues.longs[i] = 
valueIterator.next().longValue(); + } + final Integer id = setIds.get(docValues); + assert id != null; + return id; + } + + }; + + } + }; + } + // writes addressing information as MONOTONIC_COMPRESSED integer private void addAddresses(FieldInfo field, Iterable values) throws IOException { meta.writeVInt(field.number); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java index 18d7c69b6ac..0220a1f83d9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java @@ -72,13 +72,21 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; *

 * {@link DocValuesType#SORTED_SET SORTED_SET}:
+ *   • Single: if all documents have 0 or 1 value, then data are written like SORTED.
+ *   • SortedSet table: when there are few unique sets of values (< 256) then each set is assigned
+ *     an id, a lookup table is written and the mapping from document to set id is written using the
+ *     numeric strategies above.
 *   • SortedSet: a mapping of ordinals to deduplicated terms is written as Binary,
 *     an ordinal list and per-document index into this list are written using the numeric strategies
 *     above.
 * {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}:
+ *   • Single: if all documents have 0 or 1 value, then data are written like NUMERIC.
+ *   • SortedSet table: when there are few unique sets of values (< 256) then each set is assigned
+ *     an id, a lookup table is written and the mapping from document to set id is written using the
+ *     numeric strategies above.
 *   • SortedNumeric: a value list and per-document index into this list are written using the numeric
 *     strategies above.
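The "SortedSet table" strategy above boils down to three structures: a concatenated table of the unique value sets, their per-set lengths, and a small per-document set id. The following is a minimal, self-contained sketch of that idea, not the codec's actual classes; every name here (SetTableSketch, docs, docToSetId, ...) is illustrative only.

    // Illustrative only: an in-memory model of the "SortedSet table" strategy above.
    // The real consumer streams these structures into the .dvm/.dvd files.
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    class SetTableSketch {
      public static void main(String[] args) {
        // per-document sorted sets of ords/values; only a few unique combinations exist
        long[][] docs = { {1, 4}, {}, {1, 4}, {7}, {} };

        // dictionary: each unique set gets a small id in first-seen order
        Map<List<Long>, Integer> setIds = new LinkedHashMap<>();
        for (long[] set : docs) {
          setIds.putIfAbsent(asList(set), setIds.size());
        }

        // what would be written: the concatenated table, per-set lengths, doc -> set id
        List<Long> table = new ArrayList<>();
        List<Integer> lengths = new ArrayList<>();
        for (List<Long> set : setIds.keySet()) {
          table.addAll(set);
          lengths.add(set.size());
        }
        int[] docToSetId = new int[docs.length];
        for (int doc = 0; doc < docs.length; doc++) {
          docToSetId[doc] = setIds.get(asList(docs[doc]));
        }
        System.out.println("table=" + table + " lengths=" + lengths
            + " docToSetId=" + Arrays.toString(docToSetId));
      }

      private static List<Long> asList(long[] values) {
        List<Long> list = new ArrayList<>();
        for (long v : values) {
          list.add(v);
        }
        return list;
      }
    }

For the five example documents only three unique sets exist, so the shared table holds three values and each document stores nothing but a single small id.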
@@ -108,21 +116,24 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
 *   • PrefixBinaryEntry --> BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize
 *   • BinaryHeader --> FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset
 *   • SortedEntry --> FieldNumber,EntryType,BinaryEntry,NumericEntry
- *   • SortedSetEntry --> EntryType,BinaryEntry,NumericEntry,NumericEntry
- *   • SortedNumericEntry --> EntryType,NumericEntry,NumericEntry
+ *   • SortedSetEntry --> SingleSortedSetEntry | AddressesSortedSetEntry | TableSortedSetEntry
+ *   • SingleSortedSetEntry --> SetHeader,SortedEntry
+ *   • AddressesSortedSetEntry --> SetHeader,BinaryEntry,NumericEntry,NumericEntry
+ *   • TableSortedSetEntry --> SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}^TotalTableLength,TableSize,{@link DataOutput#writeInt Int32}^TableSize,BinaryEntry,NumericEntry
+ *   • SetHeader --> FieldNumber,EntryType,SetType
+ *   • SortedNumericEntry --> SingleSortedNumericEntry | AddressesSortedNumericEntry | TableSortedNumericEntry
+ *   • SingleSortedNumericEntry --> SetHeader,NumericEntry
+ *   • AddressesSortedNumericEntry --> SetHeader,NumericEntry,NumericEntry
+ *   • TableSortedNumericEntry --> SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}^TotalTableLength,TableSize,{@link DataOutput#writeInt Int32}^TableSize,NumericEntry
 *   • FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}
 *   • EntryType,CompressionType --> {@link DataOutput#writeByte Byte}
 *   • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
 *   • MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --> {@link DataOutput#writeLong Int64}
- *   • TableSize,BitsPerValue --> {@link DataOutput#writeVInt vInt}
+ *   • TableSize,BitsPerValue,TotalTableLength --> {@link DataOutput#writeVInt vInt}
 *   • Footer --> {@link CodecUtil#writeFooter CodecFooter}
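The TableSortedSetEntry and TableSortedNumericEntry productions above are exactly the shape that the consumer's writeDictionary emits into the metadata stream: the total table length, the concatenated Int64 values, the number of sets, and one length per set. Below is a hedged sketch of that layout using plain java.io streams in place of Lucene's meta output, so only the field order matches the real format; the class and variable names are illustrative.

    // Sketch of the dictionary layout behind TableSortedSetEntry/TableSortedNumericEntry.
    // java.io.DataOutput stands in for the codec's meta output; only the field order
    // (TotalTableLength, values, TableSize, per-set lengths) mirrors the real format.
    import java.io.ByteArrayOutputStream;
    import java.io.DataOutput;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.util.Arrays;
    import java.util.List;

    class DictionaryLayoutSketch {
      static void writeDictionary(DataOutput meta, List<long[]> uniqueValueSets) throws IOException {
        int lengthSum = 0;
        for (long[] set : uniqueValueSets) {
          lengthSum += set.length;
        }
        meta.writeInt(lengthSum);                 // TotalTableLength
        for (long[] set : uniqueValueSets) {
          for (long value : set) {
            meta.writeLong(value);                // one Int64 per table entry
          }
        }
        meta.writeInt(uniqueValueSets.size());    // TableSize
        for (long[] set : uniqueValueSets) {
          meta.writeInt(set.length);              // per-set length, prefix-summed by the reader
        }
      }

      public static void main(String[] args) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        writeDictionary(new DataOutputStream(bytes),
            Arrays.asList(new long[0], new long[] {1, 4}, new long[] {7}));
        System.out.println("dictionary bytes in this sketch: " + bytes.size());
      }
    }

For the empty set plus {1, 4} and {7} this is 4 + 3*8 + 4 + 3*4 = 44 bytes in the sketch's encoding; the key property is that the dictionary is written once per field, while each document only references it by id.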

 *   Sorted fields have two entries: a BinaryEntry with the value metadata,
 *   and an ordinary NumericEntry for the document-to-ord metadata.
- *   SortedSet fields have three entries: a BinaryEntry with the value metadata,
- *   and two NumericEntries for the document-to-ord-index and ordinal list metadata.
- *   SortedNumeric fields have two entries: A NumericEntry with the value metadata,
- *   and a numeric entry with the document-to-value index.
 *   FieldNumber of -1 indicates the end of metadata.
 *   EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)
 *   DataOffset is the pointer to the start of the data in the DocValues data (.dvd)
@@ -144,6 +155,15 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
 *   • 1 --> variable-width. An address for each value is stored.
 *   • 2 --> prefix-compressed. An address to the start of every interval'th value is stored.
+ *   SetType indicates how SortedSet and SortedNumeric values will be stored:
+ *     • 0 --> with addresses. There are two numeric entries: a first one from document to start
+ *       offset, and a second one from offset to ord/value.
+ *     • 1 --> single-valued. Used when all documents have at most one value and is encoded like
+ *       a regular Sorted/Numeric entry.
+ *     • 2 --> table-encoded. A lookup table of unique sets of values is written, followed by a
+ *       numeric entry that maps each document to an ordinal in this table.
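At read time, SetType 2 is resolved through two small arrays: the dictionary values (table) and prefix-summed offsets into it, with the per-document numeric field supplying the set id. The sketch below mirrors what the SORTED_SET_TABLE branches of getSortedNumeric and getSortedSetTable do, but with hard-coded arrays and made-up names rather than the producer's entries.

    // Illustrative lookup for a table-encoded (SetType = 2) field: the set id selects a
    // slice of the shared table, table[offsets[id] .. offsets[id + 1]).
    class TableLookupSketch {
      public static void main(String[] args) {
        long[] table = {1, 4, 7};           // concatenated unique sets: {}, {1, 4}, {7}
        int[] offsets = {0, 0, 2, 3};       // prefix sums of the per-set lengths 0, 2, 1
        int[] docToSetId = {1, 0, 1, 2, 0}; // what the per-document numeric field stores

        for (int doc = 0; doc < docToSetId.length; doc++) {
          int setId = docToSetId[doc];
          int start = offsets[setId];
          int end = offsets[setId + 1];
          StringBuilder values = new StringBuilder();
          for (int i = start; i < end; i++) {
            values.append(table[i]).append(' ');
          }
          System.out.println("doc " + doc + ": count=" + (end - start)
              + " values=[" + values.toString().trim() + "]");
        }
      }
    }

cardinality() and count() fall out of the offset difference, and nextOrd() simply walks the selected slice of the shared table.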

    MinLength and MaxLength represent the min and max byte[] value lengths for Binary values. * If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length). * Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize) @@ -187,7 +207,8 @@ public final class Lucene50DocValuesFormat extends DocValuesFormat { static final String META_CODEC = "Lucene50DocValuesMetadata"; static final String META_EXTENSION = "dvm"; static final int VERSION_START = 0; - static final int VERSION_CURRENT = VERSION_START; + static final int VERSION_SORTEDSET_TABLE = 1; + static final int VERSION_CURRENT = VERSION_SORTEDSET_TABLE; // indicates docvalues type static final byte NUMERIC = 0; @@ -235,6 +256,9 @@ public final class Lucene50DocValuesFormat extends DocValuesFormat { /** Single-valued sorted set values, encoded as sorted values, so no level * of indirection: {@code docId -> ord}. */ static final int SORTED_SINGLE_VALUED = 1; + /** Compressed giving IDs to unique sets of values: + * {@code docId -> setId -> ords} */ + static final int SORTED_SET_TABLE = 2; /** placeholder for missing offset that means there are no missing values */ static final int ALL_LIVE = -1; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java index 3e6b16a59b6..596034bbc8d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java @@ -206,6 +206,28 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable { ordIndexes.put(info.name, n2); } + private void readSortedSetFieldWithTable(FieldInfo info, IndexInput meta) throws IOException { + // sortedset table = binary + ordset table + ordset index + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene50DocValuesFormat.BINARY) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + + BinaryEntry b = readBinaryEntry(meta); + binaries.put(info.name, b); + + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + NumericEntry n = readNumericEntry(meta); + ords.put(info.name, n); + } + private int readFields(IndexInput meta, FieldInfos infos) throws IOException { int numFields = 0; int fieldNumber = meta.readVInt(); @@ -229,6 +251,8 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable { sortedSets.put(info.name, ss); if (ss.format == SORTED_WITH_ADDRESSES) { readSortedSetFieldWithAddresses(info, meta); + } else if (ss.format == SORTED_SET_TABLE) { + readSortedSetFieldWithTable(info, meta); } else if (ss.format == SORTED_SINGLE_VALUED) { if (meta.readVInt() != fieldNumber) { throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); @@ -243,14 +267,14 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable { } else if (type == Lucene50DocValuesFormat.SORTED_NUMERIC) { SortedSetEntry ss = 
readSortedSetEntry(meta); sortedNumerics.put(info.name, ss); - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); - } - if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); - } - numerics.put(info.name, readNumericEntry(meta)); if (ss.format == SORTED_WITH_ADDRESSES) { + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + numerics.put(info.name, readNumericEntry(meta)); if (meta.readVInt() != fieldNumber) { throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); } @@ -259,7 +283,24 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable { } NumericEntry ordIndex = readNumericEntry(meta); ordIndexes.put(info.name, ordIndex); - } else if (ss.format != SORTED_SINGLE_VALUED) { + } else if (ss.format == SORTED_SET_TABLE) { + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + NumericEntry n = readNumericEntry(meta); + ords.put(info.name, n); + } else if (ss.format == SORTED_SINGLE_VALUED) { + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + numerics.put(info.name, readNumericEntry(meta)); + } else { throw new AssertionError(); } } else { @@ -346,7 +387,24 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable { SortedSetEntry readSortedSetEntry(IndexInput meta) throws IOException { SortedSetEntry entry = new SortedSetEntry(); entry.format = meta.readVInt(); - if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) { + if (entry.format == SORTED_SET_TABLE) { + final int totalTableLength = meta.readInt(); + if (totalTableLength > 256) { + throw new CorruptIndexException("SORTED_SET_TABLE cannot have more than 256 values in its dictionary, got=" + totalTableLength, meta); + } + entry.table = new long[totalTableLength]; + for (int i = 0; i < totalTableLength; ++i) { + entry.table[i] = meta.readLong(); + } + final int tableSize = meta.readInt(); + if (tableSize > totalTableLength + 1) { // +1 because of the empty set + throw new CorruptIndexException("SORTED_SET_TABLE cannot have more set ids than ords in its dictionary, got " + totalTableLength + " ords and " + tableSize + " sets", meta); + } + entry.tableOffsets = new int[tableSize + 1]; + for (int i = 1; i < entry.tableOffsets.length; ++i) { + entry.tableOffsets[i] = entry.tableOffsets[i - 1] + meta.readInt(); + } + } else if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) { throw new CorruptIndexException("Unknown format: " + entry.format, meta); } return entry; @@ -611,12 +669,14 @@ class Lucene50DocValuesProducer 
extends DocValuesProducer implements Closeable { @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { SortedSetEntry ss = sortedNumerics.get(field.name); - NumericEntry numericEntry = numerics.get(field.name); - final LongValues values = getNumeric(numericEntry); if (ss.format == SORTED_SINGLE_VALUED) { + NumericEntry numericEntry = numerics.get(field.name); + final LongValues values = getNumeric(numericEntry); final Bits docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc); return DocValues.singleton(values, docsWithField); } else if (ss.format == SORTED_WITH_ADDRESSES) { + NumericEntry numericEntry = numerics.get(field.name); + final LongValues values = getNumeric(numericEntry); final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name)); return new SortedNumericDocValues() { @@ -639,6 +699,33 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable { return (int) (endOffset - startOffset); } }; + } else if (ss.format == SORTED_SET_TABLE) { + NumericEntry entry = ords.get(field.name); + final LongValues ordinals = getNumeric(entry); + + final long[] table = ss.table; + final int[] offsets = ss.tableOffsets; + return new SortedNumericDocValues() { + int startOffset; + int endOffset; + + @Override + public void setDocument(int doc) { + final int ord = (int) ordinals.get(doc); + startOffset = offsets[ord]; + endOffset = offsets[ord + 1]; + } + + @Override + public long valueAt(int index) { + return table[startOffset + index]; + } + + @Override + public int count() { + return endOffset - startOffset; + } + }; } else { throw new AssertionError(); } @@ -647,13 +734,20 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable { @Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { SortedSetEntry ss = sortedSets.get(field.name); - if (ss.format == SORTED_SINGLE_VALUED) { - final SortedDocValues values = getSorted(field); - return DocValues.singleton(values); - } else if (ss.format != SORTED_WITH_ADDRESSES) { - throw new AssertionError(); + switch (ss.format) { + case SORTED_SINGLE_VALUED: + final SortedDocValues values = getSorted(field); + return DocValues.singleton(values); + case SORTED_WITH_ADDRESSES: + return getSortedSetWithAddresses(field); + case SORTED_SET_TABLE: + return getSortedSetTable(field, ss); + default: + throw new AssertionError(); } + } + private SortedSetDocValues getSortedSetWithAddresses(FieldInfo field) throws IOException { final long valueCount = binaries.get(field.name).count; // we keep the byte[]s and list of ords on disk, these could be large final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); @@ -722,7 +816,76 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable { } }; } - + + private SortedSetDocValues getSortedSetTable(FieldInfo field, SortedSetEntry ss) throws IOException { + final long valueCount = binaries.get(field.name).count; + final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); + final LongValues ordinals = getNumeric(ords.get(field.name)); + + final long[] table = ss.table; + final int[] offsets = ss.tableOffsets; + + return new RandomAccessOrds() { + + int offset, startOffset, endOffset; + + @Override + public void setDocument(int docID) { + final int ord = (int) ordinals.get(docID); + offset = startOffset = offsets[ord]; + endOffset = offsets[ord + 1]; + } + + @Override + public long ordAt(int index) { + 
return table[startOffset + index]; + } + + @Override + public long nextOrd() { + if (offset == endOffset) { + return NO_MORE_ORDS; + } else { + return table[offset++]; + } + } + + @Override + public int cardinality() { + return endOffset - startOffset; + } + + @Override + public BytesRef lookupOrd(long ord) { + return binary.get(ord); + } + + @Override + public long getValueCount() { + return valueCount; + } + + @Override + public long lookupTerm(BytesRef key) { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues) binary).lookupTerm(key); + } else { + return super.lookupTerm(key); + } + } + + @Override + public TermsEnum termsEnum() { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues) binary).getTermsEnum(); + } else { + return super.termsEnum(); + } + } + + }; + } + private Bits getLiveBits(final long offset, final int count) throws IOException { if (offset == ALL_MISSING) { return new Bits.MatchNoBits(count); @@ -831,6 +994,9 @@ class Lucene50DocValuesProducer extends DocValuesProducer implements Closeable { static class SortedSetEntry { private SortedSetEntry() {} int format; + + long[] table; + int[] tableOffsets; } // internally we compose complex dv (sorted/sortedset) from other ones diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java index 7adfe1ba486..0112ecb431e 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java @@ -64,7 +64,7 @@ public class TestLucene50DocValuesFormat extends BaseCompressingDocValuesFormatT public void testSortedSetVariableLengthBigVsStoredFields() throws Exception { int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { - doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16); + doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16, 100); } } @@ -72,7 +72,7 @@ public class TestLucene50DocValuesFormat extends BaseCompressingDocValuesFormatT public void testSortedSetVariableLengthManyVsStoredFields() throws Exception { int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { - doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16); + doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16, 100); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index c9041a678d0..4abe979e29f 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -24,6 +24,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -62,6 +63,8 @@ import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.TestUtil; +import com.carrotsearch.randomizedtesting.generators.RandomPicks; + import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS; /** @@ -1940,29 +1943,30 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes 
directory.close(); } - protected void doTestSortedSetVsStoredFields(int numDocs, int minLength, int maxLength, int maxValuesPerDoc) throws Exception { + protected void doTestSortedSetVsStoredFields(int numDocs, int minLength, int maxLength, int maxValuesPerDoc, int maxUniqueValues) throws Exception { Directory dir = newFSDirectory(createTempDir("dvduel")); IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); - + + Set valueSet = new HashSet(); + for (int i = 0; i < 10000 && valueSet.size() < maxUniqueValues; ++i) { + final int length = TestUtil.nextInt(random(), minLength, maxLength); + valueSet.add(TestUtil.randomSimpleString(random(), length)); + } + String[] uniqueValues = valueSet.toArray(new String[0]); + // index some docs for (int i = 0; i < numDocs; i++) { Document doc = new Document(); Field idField = new StringField("id", Integer.toString(i), Field.Store.NO); doc.add(idField); - final int length; - if (minLength == maxLength) { - length = minLength; // fixed length - } else { - length = TestUtil.nextInt(random(), minLength, maxLength); - } int numValues = TestUtil.nextInt(random(), 0, maxValuesPerDoc); // create a random set of strings Set values = new TreeSet<>(); for (int v = 0; v < numValues; v++) { - values.add(TestUtil.randomSimpleString(random(), length)); + values.add(RandomPicks.randomFrom(random(), uniqueValues)); } - + // add ordered to the stored field for (String v : values) { doc.add(new StoredField("stored", v)); @@ -2041,7 +2045,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { int fixedLength = TestUtil.nextInt(random(), 1, 10); - doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16); + doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16, 100); } } @@ -2107,12 +2111,37 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes ); } } - + + public void testSortedNumericsFewUniqueSetsVsStoredFields() throws Exception { + assumeTrue("Codec does not support SORTED_NUMERIC", codecSupportsSortedNumeric()); + final long[] values = new long[TestUtil.nextInt(random(), 2, 6)]; + for (int i = 0; i < values.length; ++i) { + values[i] = random().nextLong(); + } + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestSortedNumericsVsStoredFields( + new LongProducer() { + @Override + long next() { + return TestUtil.nextLong(random(), 0, 6); + } + }, + new LongProducer() { + @Override + long next() { + return values[random().nextInt(values.length)]; + } + } + ); + } + } + public void testSortedSetVariableLengthVsStoredFields() throws Exception { assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { - doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16); + doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16, 100); } } @@ -2121,7 +2150,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { int fixedLength = TestUtil.nextInt(random(), 1, 10); - doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1); + doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1, 100); } } @@ -2129,7 +2158,39 @@ public abstract class BaseDocValuesFormatTestCase extends 
BaseIndexFileFormatTes assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { - doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1); + doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1, 100); + } + } + + public void testSortedSetFixedLengthFewUniqueSetsVsStoredFields() throws Exception { + assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestSortedSetVsStoredFields(atLeast(300), 10, 10, 6, 6); + } + } + + public void testSortedSetVariableLengthFewUniqueSetsVsStoredFields() throws Exception { + assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 6, 6); + } + } + + public void testSortedSetVariableLengthManyValuesPerDocVsStoredFields() throws Exception { + assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestSortedSetVsStoredFields(atLeast(20), 1, 10, 500, 1000); + } + } + + public void testSortedSetFixedLengthManyValuesPerDocVsStoredFields() throws Exception { + assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestSortedSetVsStoredFields(atLeast(20), 10, 10, 500, 1000); } }
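The modified tests feed doTestSortedSetVsStoredFields a bounded pool of unique values (the new maxUniqueValues argument) so that the table encoding is actually exercised. Outside the test framework, the kind of field this optimization targets looks roughly like the hedged example below; it is not part of the patch, and it assumes lucene-core plus lucene-analyzers-common (for StandardAnalyzer) on the classpath.

    // Not part of the patch: a multi-valued doc values field drawn from a tiny vocabulary,
    // so only a handful of unique value sets exist across all documents. Whether the
    // SORTED_SET_TABLE encoding is picked remains an internal decision of the consumer.
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.SortedSetDocValuesField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.BytesRef;

    public class FewUniqueSetsExample {
      public static void main(String[] args) throws Exception {
        String[][] docTags = { {"red", "small"}, {"red", "small"}, {"blue"}, {"red", "small"} };
        try (IndexWriter writer = new IndexWriter(new RAMDirectory(),
            new IndexWriterConfig(new StandardAnalyzer()))) {
          for (String[] tags : docTags) {
            Document doc = new Document();
            for (String tag : tags) {
              doc.add(new SortedSetDocValuesField("tags", new BytesRef(tag)));
            }
            writer.addDocument(doc);
          }
          // Only two unique value sets exist here ({red, small} and {blue}), far below the
          // 256-value dictionary limit enforced by uniqueValueSets() in the consumer.
        }
      }
    }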