mirror of https://github.com/apache/lucene.git
LUCENE-9663: Add compression to terms dict from SortedSet/Sorted DocValues.
Closes #2302
parent 227ef3b397
commit 5856c0f176
@@ -186,6 +186,9 @@ Improvements

* LUCENE-9674: Implement faster advance on VectorValues using binary search.
  (Anand Kotriwal, Mike Sokolov)

* LUCENE-9663: Adding compression to terms dict from SortedSet/Sorted DocValues.
  (Jaison Bi via Bruno Roustant)

Bug fixes

* LUCENE-9686: Fix read past EOF handling in DirectIODirectory. (Zach Chen,
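The new terms-dictionary compression is opt-in: it only applies when the codec is configured for BEST_COMPRESSION, which is how the tests added later in this commit set things up. A minimal indexing sketch under that assumption (the field name and directory choice are illustrative, not part of this change):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.util.BytesRef;

public class CompressedTermsDictExample {
  public static void main(String[] args) throws Exception {
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
    // BEST_COMPRESSION turns on the LZ4 terms-dict compression added here;
    // BEST_SPEED keeps the previous uncompressed block layout.
    config.setCodec(new Lucene90Codec(Lucene90Codec.Mode.BEST_COMPRESSION));
    try (IndexWriter writer = new IndexWriter(new ByteBuffersDirectory(), config)) {
      Document doc = new Document();
      doc.add(new SortedSetDocValuesField("category", new BytesRef("books")));
      writer.addDocument(doc);
      writer.commit();
    }
  }
}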
@@ -43,9 +43,11 @@ import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.ByteBuffersIndexOutput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
@@ -66,6 +68,7 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close
  IndexOutput data, meta;
  final int maxDoc;
  private final SegmentWriteState state;
  private byte[] termsDictBuffer;

  /** expert: Creates a new writer */
  public Lucene80DocValuesConsumer(
@@ -77,6 +80,9 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close
      Lucene80DocValuesFormat.Mode mode)
      throws IOException {
    this.mode = mode;
    if (Lucene80DocValuesFormat.Mode.BEST_COMPRESSION == this.mode) {
      this.termsDictBuffer = new byte[1 << 14];
    }
    boolean success = false;
    try {
      this.state = state;
@@ -736,15 +742,26 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close
  private void addTermsDict(SortedSetDocValues values) throws IOException {
    final long size = values.getValueCount();
    meta.writeVLong(size);
    meta.writeInt(Lucene80DocValuesFormat.TERMS_DICT_BLOCK_SHIFT);
    boolean compress =
        Lucene80DocValuesFormat.Mode.BEST_COMPRESSION == mode
            && values.getValueCount()
                > Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD;
    int code, blockMask, shift;
    if (compress) {
      code = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_CODE;
      blockMask = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_MASK;
      shift = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT;
    } else {
      code = shift = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_SHIFT;
      blockMask = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_MASK;
    }

    meta.writeInt(code);
    meta.writeInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
    ByteBuffersDataOutput addressBuffer = new ByteBuffersDataOutput();
    ByteBuffersIndexOutput addressOutput =
        new ByteBuffersIndexOutput(addressBuffer, "temp", "temp");
    meta.writeInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
    long numBlocks =
        (size + Lucene80DocValuesFormat.TERMS_DICT_BLOCK_MASK)
            >>> Lucene80DocValuesFormat.TERMS_DICT_BLOCK_SHIFT;
    long numBlocks = (size + blockMask) >>> shift;
    DirectMonotonicWriter writer =
        DirectMonotonicWriter.getInstance(
            meta, addressOutput, numBlocks, DIRECT_MONOTONIC_BLOCK_SHIFT);
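Compression only kicks in for BEST_COMPRESSION segments whose field has more than TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD (32) unique terms, and it switches the dictionary to 64-term LZ4 blocks (1 << TERMS_DICT_BLOCK_LZ4_SHIFT) instead of the smaller default block size, which also shrinks the number of block-start addresses recorded by the DirectMonotonicWriter. A small worked example of the block math above (the term count is illustrative):

// Worked example of the block math in addTermsDict; the term count is made up.
public class TermsDictBlockMath {
  public static void main(String[] args) {
    long size = 1000;                  // unique terms in the field (example value)
    int shift = 6;                     // TERMS_DICT_BLOCK_LZ4_SHIFT, used when compress == true
    int blockMask = (1 << shift) - 1;  // TERMS_DICT_BLOCK_LZ4_MASK = 63
    long numBlocks = (size + blockMask) >>> shift;
    // ceil(1000 / 64) = 16 block-start addresses go to the DirectMonotonicWriter
    System.out.println(numBlocks);
  }
}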
@@ -752,10 +769,24 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close
    BytesRefBuilder previous = new BytesRefBuilder();
    long ord = 0;
    long start = data.getFilePointer();
    int maxLength = 0;
    int maxLength = 0, maxBlockLength = 0;
    TermsEnum iterator = values.termsEnum();

    LZ4.FastCompressionHashTable ht = null;
    ByteArrayDataOutput bufferedOutput = null;
    if (compress) {
      ht = new LZ4.FastCompressionHashTable();
      bufferedOutput = new ByteArrayDataOutput(termsDictBuffer);
    }

    for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
      if ((ord & Lucene80DocValuesFormat.TERMS_DICT_BLOCK_MASK) == 0) {
      if ((ord & blockMask) == 0) {
        if (compress && bufferedOutput.getPosition() > 0) {
          maxBlockLength =
              Math.max(maxBlockLength, compressAndGetTermsDictBlockLength(bufferedOutput, ht));
          bufferedOutput.reset(termsDictBuffer);
        }

        writer.add(data.getFilePointer() - start);
        data.writeVInt(term.length);
        data.writeBytes(term.bytes, term.offset, term.length);
@@ -763,22 +794,40 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close
        final int prefixLength = StringHelper.bytesDifference(previous.get(), term);
        final int suffixLength = term.length - prefixLength;
        assert suffixLength > 0; // terms are unique

        data.writeByte((byte) (Math.min(prefixLength, 15) | (Math.min(15, suffixLength - 1) << 4)));
        DataOutput blockOutput;
        if (compress) {
          // Will write (suffixLength + 1 byte + 2 vint) bytes. Grow the buffer in need.
          bufferedOutput = maybeGrowBuffer(bufferedOutput, suffixLength + 11);
          blockOutput = bufferedOutput;
        } else {
          blockOutput = data;
        }
        blockOutput.writeByte(
            (byte) (Math.min(prefixLength, 15) | (Math.min(15, suffixLength - 1) << 4)));
        if (prefixLength >= 15) {
          data.writeVInt(prefixLength - 15);
          blockOutput.writeVInt(prefixLength - 15);
        }
        if (suffixLength >= 16) {
          data.writeVInt(suffixLength - 16);
          blockOutput.writeVInt(suffixLength - 16);
        }
        data.writeBytes(term.bytes, term.offset + prefixLength, term.length - prefixLength);
        blockOutput.writeBytes(term.bytes, term.offset + prefixLength, suffixLength);
      }
      maxLength = Math.max(maxLength, term.length);
      previous.copyBytes(term);
      ++ord;
    }
    // Compress and write out the last block
    if (compress && bufferedOutput.getPosition() > 0) {
      maxBlockLength =
          Math.max(maxBlockLength, compressAndGetTermsDictBlockLength(bufferedOutput, ht));
    }

    writer.finish();
    meta.writeInt(maxLength);
    if (compress) {
      // Write one more int for storing max block length. For compressed terms dict only.
      meta.writeInt(maxBlockLength);
    }
    meta.writeLong(start);
    meta.writeLong(data.getFilePointer() - start);
    start = data.getFilePointer();
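The per-term entry format is unchanged: a token byte whose low nibble is the capped shared-prefix length and whose high nibble is the capped suffix length minus one, followed by optional VInt overflows and the raw suffix bytes. Only the destination changes, from data to the in-memory block buffer that later gets LZ4-compressed. A round-trip sketch of that entry encoding (the terms and buffer size are illustrative, not part of the commit):

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;

public class TermEntryTokenExample {
  public static void main(String[] args) throws Exception {
    // previous term = "lucene_compress", current term = "lucene_document" (illustrative)
    byte[] term = "lucene_document".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    int prefixLength = 7;                           // shared prefix "lucene_"
    int suffixLength = term.length - prefixLength;  // 8 ("document")

    // Encode: token byte, optional VInt overflows, then the raw suffix bytes.
    byte[] block = new byte[64];
    ByteArrayDataOutput out = new ByteArrayDataOutput(block);
    out.writeByte((byte) (Math.min(prefixLength, 15) | (Math.min(15, suffixLength - 1) << 4)));
    if (prefixLength >= 15) out.writeVInt(prefixLength - 15);
    if (suffixLength >= 16) out.writeVInt(suffixLength - 16);
    out.writeBytes(term, prefixLength, suffixLength);

    // Decode, mirroring TermsDict.next() in the producer: rebuild both lengths from the token.
    ByteArrayDataInput in = new ByteArrayDataInput(block, 0, out.getPosition());
    int token = Byte.toUnsignedInt(in.readByte());
    int decodedPrefix = token & 0x0F;
    int decodedSuffix = 1 + (token >>> 4);
    if (decodedPrefix == 15) decodedPrefix += in.readVInt();
    if (decodedSuffix == 16) decodedSuffix += in.readVInt();
    System.out.println(decodedPrefix + " + " + decodedSuffix);  // 7 + 8
  }
}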
@@ -790,6 +839,27 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close
    writeTermsIndex(values);
  }

  private int compressAndGetTermsDictBlockLength(
      ByteArrayDataOutput bufferedOutput, LZ4.FastCompressionHashTable ht) throws IOException {
    int uncompressedLength = bufferedOutput.getPosition();
    data.writeVInt(uncompressedLength);
    long before = data.getFilePointer();
    LZ4.compress(termsDictBuffer, 0, uncompressedLength, data, ht);
    int compressedLength = (int) (data.getFilePointer() - before);
    // Block length will be used for creating buffer for decompression, one corner case is that
    // compressed length might be bigger than un-compressed length, so just return the bigger one.
    return Math.max(uncompressedLength, compressedLength);
  }

  private ByteArrayDataOutput maybeGrowBuffer(ByteArrayDataOutput bufferedOutput, int termLength) {
    int pos = bufferedOutput.getPosition(), originalLength = termsDictBuffer.length;
    if (pos + termLength >= originalLength - 1) {
      termsDictBuffer = ArrayUtil.grow(termsDictBuffer, originalLength + termLength);
      bufferedOutput = new ByteArrayDataOutput(termsDictBuffer, pos, termsDictBuffer.length - pos);
    }
    return bufferedOutput;
  }

  private void writeTermsIndex(SortedSetDocValues values) throws IOException {
    final long size = values.getValueCount();
    meta.writeInt(Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_SHIFT);
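Each flushed block therefore lands on disk as a VInt uncompressed length followed by the raw LZ4 payload, which is exactly what decompressBlock() in the producer reverses. A self-contained round-trip sketch, assuming Lucene's LZ4 helper in org.apache.lucene.util.compress and an illustrative payload:

import java.nio.charset.StandardCharsets;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.compress.LZ4;

public class TermsDictBlockRoundTrip {
  public static void main(String[] args) throws Exception {
    byte[] block = "suffix1suffix2suffix3suffix1suffix2suffix3".getBytes(StandardCharsets.UTF_8);

    // Writer side: VInt uncompressed length, then the LZ4 payload.
    byte[] file = new byte[1024];
    ByteArrayDataOutput data = new ByteArrayDataOutput(file);
    data.writeVInt(block.length);
    LZ4.compress(block, 0, block.length, data, new LZ4.FastCompressionHashTable());

    // Reader side: read the length, then decompress into a reusable buffer
    // (sized from maxBlockLength plus LZ4_DECOMPRESSOR_PADDING in TermsDict).
    ByteArrayDataInput in = new ByteArrayDataInput(file, 0, data.getPosition());
    int uncompressedLength = in.readVInt();
    byte[] restored = new byte[uncompressedLength + 7];
    LZ4.decompress(in, uncompressedLength, restored, 0);
    System.out.println(new String(restored, 0, uncompressedLength, StandardCharsets.UTF_8));
  }
}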
@@ -203,6 +203,15 @@ public final class Lucene80DocValuesFormat extends DocValuesFormat {
  static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT;
  static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1;

  static final int TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD = 32;
  static final int TERMS_DICT_BLOCK_LZ4_SHIFT = 6;
  static final int TERMS_DICT_BLOCK_LZ4_SIZE = 1 << TERMS_DICT_BLOCK_LZ4_SHIFT;
  static final int TERMS_DICT_BLOCK_LZ4_MASK = TERMS_DICT_BLOCK_LZ4_SIZE - 1;
  static final int TERMS_DICT_COMPRESSOR_LZ4_CODE = 1;
  // Writing a special code so we know this is a LZ4-compressed block.
  static final int TERMS_DICT_BLOCK_LZ4_CODE =
      TERMS_DICT_BLOCK_LZ4_SHIFT << 16 | TERMS_DICT_COMPRESSOR_LZ4_CODE;

  static final int TERMS_DICT_REVERSE_INDEX_SHIFT = 10;
  static final int TERMS_DICT_REVERSE_INDEX_SIZE = 1 << TERMS_DICT_REVERSE_INDEX_SHIFT;
  static final int TERMS_DICT_REVERSE_INDEX_MASK = TERMS_DICT_REVERSE_INDEX_SIZE - 1;
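TERMS_DICT_BLOCK_LZ4_CODE is written into the metadata slot that previously always held the block shift; because 6 << 16 | 1 = 393217 can never be a plausible shift value, readTermDict() in the producer can tell compressed segments apart from older uncompressed ones. A small sketch of that dispatch (the uncompressed shift value 4 is assumed here to stand in for the existing TERMS_DICT_BLOCK_SHIFT):

// Sketch of how the on-disk "code" field distinguishes the two layouts; the
// constants mirror the ones above, the read logic mirrors readTermDict().
public class TermsDictCodeExample {
  static final int TERMS_DICT_BLOCK_LZ4_SHIFT = 6;
  static final int TERMS_DICT_COMPRESSOR_LZ4_CODE = 1;
  static final int TERMS_DICT_BLOCK_LZ4_CODE =
      TERMS_DICT_BLOCK_LZ4_SHIFT << 16 | TERMS_DICT_COMPRESSOR_LZ4_CODE; // 393217

  public static void main(String[] args) {
    for (int code : new int[] {4, TERMS_DICT_BLOCK_LZ4_CODE}) {
      boolean compressed = code == TERMS_DICT_BLOCK_LZ4_CODE;
      // Uncompressed segments store the block shift directly; compressed ones
      // store the sentinel code and imply the LZ4 block shift.
      int blockShift = compressed ? TERMS_DICT_BLOCK_LZ4_SHIFT : code;
      System.out.println(code + " -> compressed=" + compressed + ", blockShift=" + blockShift);
    }
  }
}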
@@ -38,7 +38,9 @@ import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.util.BytesRef;
@@ -285,12 +287,24 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close

  private static void readTermDict(IndexInput meta, TermsDictEntry entry) throws IOException {
    entry.termsDictSize = meta.readVLong();
    entry.termsDictBlockShift = meta.readInt();
    int termsDictBlockCode = meta.readInt();
    if (Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_CODE == termsDictBlockCode) {
      // This is a LZ4 compressed block.
      entry.compressed = true;
      entry.termsDictBlockShift = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT;
    } else {
      entry.termsDictBlockShift = termsDictBlockCode;
    }

    final int blockShift = meta.readInt();
    final long addressesSize =
        (entry.termsDictSize + (1L << entry.termsDictBlockShift) - 1) >>> entry.termsDictBlockShift;
    entry.termsAddressesMeta = DirectMonotonicReader.loadMeta(meta, addressesSize, blockShift);
    entry.maxTermLength = meta.readInt();
    // Read one more int for compressed term dict.
    if (entry.compressed) {
      entry.maxBlockLength = meta.readInt();
    }
    entry.termsDataOffset = meta.readLong();
    entry.termsDataLength = meta.readLong();
    entry.termsAddressesOffset = meta.readLong();
@@ -375,6 +389,9 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close
    long termsIndexLength;
    long termsIndexAddressesOffset;
    long termsIndexAddressesLength;

    boolean compressed;
    int maxBlockLength;
  }

  private static class SortedEntry extends TermsDictEntry {
@@ -1149,6 +1166,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close
  }

  private static class TermsDict extends BaseTermsEnum {
    static final int LZ4_DECOMPRESSOR_PADDING = 7;

    final TermsDictEntry entry;
    final LongValues blockAddresses;
@@ -1159,6 +1177,11 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close
    final BytesRef term;
    long ord = -1;

    BytesRef blockBuffer = null;
    ByteArrayDataInput blockInput = null;
    long currentCompressedBlockStart = -1;
    long currentCompressedBlockEnd = -1;

    TermsDict(TermsDictEntry entry, IndexInput data) throws IOException {
      this.entry = entry;
      RandomAccessInput addressesSlice =
@@ -1172,6 +1195,12 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close
          DirectMonotonicReader.getInstance(entry.termsIndexAddressesMeta, indexAddressesSlice);
      indexBytes = data.slice("terms-index", entry.termsIndexOffset, entry.termsIndexLength);
      term = new BytesRef(entry.maxTermLength);

      if (entry.compressed) {
        // add 7 padding bytes can help decompression run faster.
        int bufferSize = entry.maxBlockLength + LZ4_DECOMPRESSOR_PADDING;
        blockBuffer = new BytesRef(new byte[bufferSize], 0, bufferSize);
      }
    }

    @Override
@@ -1179,21 +1208,27 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close
      if (++ord >= entry.termsDictSize) {
        return null;
      }

      if ((ord & blockMask) == 0L) {
        term.length = bytes.readVInt();
        bytes.readBytes(term.bytes, 0, term.length);
        if (this.entry.compressed) {
          decompressBlock();
        } else {
          term.length = bytes.readVInt();
          bytes.readBytes(term.bytes, 0, term.length);
        }
      } else {
        final int token = Byte.toUnsignedInt(bytes.readByte());
        DataInput input = this.entry.compressed ? blockInput : bytes;
        final int token = Byte.toUnsignedInt(input.readByte());
        int prefixLength = token & 0x0F;
        int suffixLength = 1 + (token >>> 4);
        if (prefixLength == 15) {
          prefixLength += bytes.readVInt();
          prefixLength += input.readVInt();
        }
        if (suffixLength == 16) {
          suffixLength += bytes.readVInt();
          suffixLength += input.readVInt();
        }
        term.length = prefixLength + suffixLength;
        bytes.readBytes(term.bytes, prefixLength, suffixLength);
        input.readBytes(term.bytes, prefixLength, suffixLength);
      }
      return term;
    }
@@ -1292,8 +1327,13 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close
      final long blockAddress = blockAddresses.get(block);
      this.ord = block << entry.termsDictBlockShift;
      bytes.seek(blockAddress);
      term.length = bytes.readVInt();
      bytes.readBytes(term.bytes, 0, term.length);
      if (this.entry.compressed) {
        decompressBlock();
      } else {
        term.length = bytes.readVInt();
        bytes.readBytes(term.bytes, 0, term.length);
      }

      while (true) {
        int cmp = term.compareTo(text);
        if (cmp == 0) {
@@ -1307,6 +1347,30 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close
      }
    }

    private void decompressBlock() throws IOException {
      // The first term is kept uncompressed, so no need to decompress block if only
      // look up the first term when doing seek block.
      term.length = bytes.readVInt();
      bytes.readBytes(term.bytes, 0, term.length);
      long offset = bytes.getFilePointer();
      if (offset < entry.termsDataLength - 1) {
        // Avoid decompress again if we are reading a same block.
        if (currentCompressedBlockStart != offset) {
          int decompressLength = bytes.readVInt();
          // Decompress the remaining of current block
          LZ4.decompress(bytes, decompressLength, blockBuffer.bytes, 0);
          currentCompressedBlockStart = offset;
          currentCompressedBlockEnd = bytes.getFilePointer();
        } else {
          // Skip decompression but need to re-seek to block end.
          bytes.seek(currentCompressedBlockEnd);
        }

        // Reset the buffer.
        blockInput = new ByteArrayDataInput(blockBuffer.bytes, 0, blockBuffer.length);
      }
    }

    @Override
    public BytesRef term() throws IOException {
      return term;
@@ -0,0 +1,314 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene80;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

public class TestDocValuesCompression extends LuceneTestCase {
  private final Codec bestSpeed = new Lucene90Codec(Lucene90Codec.Mode.BEST_SPEED);
  private final Codec bestCompression = new Lucene90Codec(Lucene90Codec.Mode.BEST_COMPRESSION);

  public void testTermsDictCompressionForLowCardinalityFields() throws IOException {
    final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD - 1;
    Set<String> valuesSet = new HashSet<>();
    for (int i = 0; i < CARDINALITY; ++i) {
      final int length = TestUtil.nextInt(random(), 10, 30);
      String value = TestUtil.randomSimpleString(random(), length);
      valuesSet.add(value);
    }

    List<String> values = new ArrayList<>(valuesSet);
    long sizeForBestSpeed = writeAndGetDocValueFileSize(bestSpeed, values);
    long sizeForBestCompression = writeAndGetDocValueFileSize(bestCompression, values);

    // Ensure terms dict data was not compressed for low-cardinality fields.
    assertEquals(sizeForBestSpeed, sizeForBestCompression);
  }

  public void testTermsDictCompressionForHighCardinalityFields() throws IOException {
    final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD << 1;
    Set<String> valuesSet = new HashSet<>();
    for (int i = 0; i < CARDINALITY; ++i) {
      final int length = TestUtil.nextInt(random(), 10, 30);
      String value = TestUtil.randomSimpleString(random(), length);
      // Add common suffix for better compression ratio.
      valuesSet.add(value + "_CommonPartBetterForCompression");
    }

    List<String> values = new ArrayList<>(valuesSet);
    long sizeForBestSpeed = writeAndGetDocValueFileSize(bestSpeed, values);
    long sizeForBestCompression = writeAndGetDocValueFileSize(bestCompression, values);

    // Compression happened.
    assertTrue(sizeForBestCompression < sizeForBestSpeed);
  }

  public void testReseekAfterSkipDecompression() throws IOException {
    final int CARDINALITY = (Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE << 1) + 11;
    Set<String> valueSet = new HashSet<>(CARDINALITY);
    for (int i = 0; i < CARDINALITY; i++) {
      valueSet.add(TestUtil.randomSimpleString(random(), 64));
    }
    List<String> values = new ArrayList<>(valueSet);
    Collections.sort(values);
    // Create one non-existent value just between block-1 and block-2.
    String nonexistentValue =
        values.get(Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE - 1)
            + TestUtil.randomSimpleString(random(), 64, 128);
    int docValues = values.size();

    try (Directory directory = newDirectory()) {
      Analyzer analyzer = new StandardAnalyzer();
      IndexWriterConfig config = new IndexWriterConfig(analyzer);
      config.setCodec(bestCompression);
      config.setUseCompoundFile(false);
      IndexWriter writer = new IndexWriter(directory, config);
      for (int i = 0; i < 280; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
        doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % docValues))));
        writer.addDocument(doc);
      }
      writer.commit();
      writer.forceMerge(1);
      DirectoryReader dReader = DirectoryReader.open(writer);
      writer.close();

      LeafReader reader = getOnlyLeafReader(dReader);
      // Check values count.
      SortedDocValues ssdvMulti = reader.getSortedDocValues("sdv");
      assertEquals(docValues, ssdvMulti.getValueCount());

      // Seek to first block.
      int ord1 = ssdvMulti.lookupTerm(new BytesRef(values.get(0)));
      assertTrue(ord1 >= 0);
      int ord2 = ssdvMulti.lookupTerm(new BytesRef(values.get(1)));
      assertTrue(ord2 >= ord1);
      // Ensure re-seek logic is correct after skip-decompression.
      int nonexistentOrd2 = ssdvMulti.lookupTerm(new BytesRef(nonexistentValue));
      assertTrue(nonexistentOrd2 < 0);
      dReader.close();
    }
  }

  public void testLargeTermsCompression() throws IOException {
    final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD << 1;
    Set<String> valuesSet = new HashSet<>();
    for (int i = 0; i < CARDINALITY; ++i) {
      final int length = TestUtil.nextInt(random(), 512, 1024);
      valuesSet.add(TestUtil.randomSimpleString(random(), length));
    }
    int valuesCount = valuesSet.size();
    List<String> values = new ArrayList<>(valuesSet);

    try (Directory directory = newDirectory()) {
      Analyzer analyzer = new StandardAnalyzer();
      IndexWriterConfig config = new IndexWriterConfig(analyzer);
      config.setCodec(bestCompression);
      config.setUseCompoundFile(false);
      IndexWriter writer = new IndexWriter(directory, config);
      for (int i = 0; i < 256; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
        doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount))));
        writer.addDocument(doc);
      }
      writer.commit();
      writer.forceMerge(1);
      DirectoryReader ireader = DirectoryReader.open(writer);
      writer.close();

      LeafReader reader = getOnlyLeafReader(ireader);
      // Check values count.
      SortedDocValues ssdvMulti = reader.getSortedDocValues("sdv");
      assertEquals(valuesCount, ssdvMulti.getValueCount());
      ireader.close();
    }
  }

  // Ensure the old segment can be merged together with the new compressed segment.
  public void testMergeWithUncompressedSegment() throws IOException {
    final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD << 1;
    Set<String> valuesSet = new HashSet<>();
    for (int i = 0; i < CARDINALITY; ++i) {
      final int length = TestUtil.nextInt(random(), 10, 30);
      // Add common suffix for better compression ratio.
      valuesSet.add(TestUtil.randomSimpleString(random(), length));
    }
    List<String> values = new ArrayList<>(valuesSet);
    int valuesCount = values.size();

    try (Directory directory = newDirectory()) {
      // 1. Write 256 documents without terms dict compression.
      Analyzer analyzer = new StandardAnalyzer();
      IndexWriterConfig config = new IndexWriterConfig(analyzer);
      config.setCodec(bestSpeed);
      config.setUseCompoundFile(false);
      IndexWriter writer = new IndexWriter(directory, config);
      for (int i = 0; i < 256; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
        doc.add(new SortedSetDocValuesField("ssdv", new BytesRef(values.get(i % valuesCount))));
        doc.add(
            new SortedSetDocValuesField("ssdv", new BytesRef(values.get((i + 1) % valuesCount))));
        doc.add(
            new SortedSetDocValuesField("ssdv", new BytesRef(values.get((i + 2) % valuesCount))));
        doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount))));
        writer.addDocument(doc);
      }
      writer.commit();
      DirectoryReader ireader = DirectoryReader.open(writer);
      assertEquals(256, ireader.numDocs());
      LeafReader reader = getOnlyLeafReader(ireader);
      SortedSetDocValues ssdv = reader.getSortedSetDocValues("ssdv");
      assertEquals(valuesCount, ssdv.getValueCount());
      SortedDocValues sdv = reader.getSortedDocValues("sdv");
      assertEquals(valuesCount, sdv.getValueCount());
      ireader.close();
      writer.close();

      // 2. Add another 100 documents, and enabling terms dict compression.
      config = new IndexWriterConfig(analyzer);
      config.setCodec(bestCompression);
      config.setUseCompoundFile(false);
      writer = new IndexWriter(directory, config);
      // Add 2 new values.
      valuesSet.add(TestUtil.randomSimpleString(random(), 10));
      valuesSet.add(TestUtil.randomSimpleString(random(), 10));
      values = new ArrayList<>(valuesSet);
      valuesCount = valuesSet.size();

      for (int i = 256; i < 356; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
        doc.add(new SortedSetDocValuesField("ssdv", new BytesRef(values.get(i % valuesCount))));
        doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount))));
        writer.addDocument(doc);
      }
      writer.commit();
      writer.forceMerge(1);
      ireader = DirectoryReader.open(writer);
      assertEquals(356, ireader.numDocs());
      reader = getOnlyLeafReader(ireader);
      ssdv = reader.getSortedSetDocValues("ssdv");
      assertEquals(valuesCount, ssdv.getValueCount());
      ireader.close();
      writer.close();
    }
  }

  private static long writeAndGetDocValueFileSize(Codec codec, List<String> values)
      throws IOException {
    int valuesCount = values.size();
    long dvdFileSize = -1;
    try (Directory directory = newDirectory()) {
      Analyzer analyzer = new StandardAnalyzer();
      IndexWriterConfig config = new IndexWriterConfig(analyzer);
      config.setCodec(codec);
      config.setUseCompoundFile(false);
      IndexWriter writer = new IndexWriter(directory, config);
      for (int i = 0; i < 256; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
        // Multi value sorted-set field.
        doc.add(
            new SortedSetDocValuesField("ssdv_multi_", new BytesRef(values.get(i % valuesCount))));
        doc.add(
            new SortedSetDocValuesField(
                "ssdv_multi_", new BytesRef(values.get((i + 1) % valuesCount))));
        doc.add(
            new SortedSetDocValuesField(
                "ssdv_multi_", new BytesRef(values.get((i + 2) % valuesCount))));
        // Single value sorted-set field.
        doc.add(
            new SortedSetDocValuesField("ssdv_single_", new BytesRef(values.get(i % valuesCount))));
        // Sorted field.
        doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount))));
        writer.addDocument(doc);
      }
      writer.commit();
      writer.forceMerge(1);
      DirectoryReader ireader = DirectoryReader.open(writer);
      writer.close();

      LeafReader reader = getOnlyLeafReader(ireader);
      // Check values count.
      SortedSetDocValues ssdvMulti = reader.getSortedSetDocValues("ssdv_multi_");
      assertEquals(valuesCount, ssdvMulti.getValueCount());
      for (int i = 0; i < valuesCount; i++) {
        BytesRef term = ssdvMulti.lookupOrd(i);
        assertTrue(term.bytes.length > 0);
      }
      for (int i = 0; i < valuesCount; i++) {
        for (int j = 0; j < 3; j++) {
          assertTrue(ssdvMulti.lookupTerm(new BytesRef(values.get((i + j) % valuesCount))) >= 0);
        }
      }

      SortedSetDocValues ssdvSingle = reader.getSortedSetDocValues("ssdv_single_");
      assertEquals(valuesCount, ssdvSingle.getValueCount());
      for (int i = 0; i < valuesCount; i++) {
        assertTrue(ssdvSingle.lookupTerm(new BytesRef(values.get(i % valuesCount))) >= 0);
      }

      SortedDocValues sdv = reader.getSortedDocValues("sdv");
      assertEquals(valuesCount, sdv.getValueCount());
      for (int i = 0; i < valuesCount; i++) {
        assertTrue(sdv.lookupTerm(new BytesRef(values.get(i % valuesCount))) >= 0);
      }

      dvdFileSize = docValueFileSize(directory);
      assertTrue(dvdFileSize > 0);
      ireader.close();
    }

    return dvdFileSize;
  }

  static long docValueFileSize(Directory d) throws IOException {
    for (String file : d.listAll()) {
      if (file.endsWith(Lucene80DocValuesFormat.DATA_EXTENSION)) {
        return d.fileLength(file);
      }
    }
    return -1;
  }
}
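Compression stays invisible to callers of the DocValues API: terms come back through the same SortedSetDocValues accessors the tests exercise above, with decompression happening inside TermsDict.next(). A minimal read-side sketch (index path and field name are illustrative):

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class DumpSortedSetTerms {
  public static void main(String[] args) throws IOException {
    try (Directory dir = FSDirectory.open(Paths.get(args[0]));
        DirectoryReader reader = DirectoryReader.open(dir)) {
      for (LeafReaderContext ctx : reader.leaves()) {
        LeafReader leaf = ctx.reader();
        SortedSetDocValues dv = leaf.getSortedSetDocValues("category"); // illustrative field
        if (dv == null) {
          continue;
        }
        // Iterate the (possibly LZ4-compressed) terms dictionary through the public API.
        TermsEnum terms = dv.termsEnum();
        for (BytesRef term = terms.next(); term != null; term = terms.next()) {
          System.out.println(term.utf8ToString());
        }
      }
    }
  }
}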