LUCENE-5159: prefix-code the sorted/sortedset value dictionaries in DiskDV

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1512543 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2013-08-10 00:37:41 +00:00
parent b2070fefbc
commit 9cb38dd42a
8 changed files with 534 additions and 17 deletions
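The change itself is classic front coding: within each block of 16 terms, only the bytes that differ from the previous term are written, and one file-pointer address is kept per block so ords can still be located. A minimal sketch of that encoding, using fixed-width ints instead of the vInts the actual format writes (class and constant names here are illustrative, not the committed code):

```java
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

// Illustrative sketch of front coding a sorted value dictionary:
// every INTERVAL-th term is stored whole; the rest store (sharedPrefix, suffix).
public class PrefixCodeSketch {
  static final int INTERVAL = 16; // mirrors ADDRESS_INTERVAL in the patch

  public static void main(String[] args) throws IOException {
    List<String> sorted = Arrays.asList("lucene", "lucid", "luck", "lucky");
    ByteArrayOutputStream blob = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(blob);
    byte[] last = new byte[0];
    for (int ord = 0; ord < sorted.size(); ord++) {
      byte[] cur = sorted.get(ord).getBytes(StandardCharsets.UTF_8);
      if (ord % INTERVAL == 0) {
        last = new byte[0];              // block leader is absolute-encoded
      }
      int shared = 0;
      while (shared < last.length && shared < cur.length && last[shared] == cur[shared]) {
        shared++;
      }
      out.writeInt(shared);              // the real format uses vInts
      out.writeInt(cur.length - shared);
      out.write(cur, shared, cur.length - shared);
      last = cur;
    }
    System.out.println("encoded " + sorted.size() + " terms in " + blob.size() + " bytes");
  }
}
```

Because the values arrive in sorted order, neighbouring terms tend to share long prefixes, which is where the space win over the previous whole-value encoding comes from.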

lucene/CHANGES.txt

@@ -168,6 +168,9 @@ Optimizations
* LUCENE-5150: Make WAH8DocIdSet able to invert its encoding in order to
compress dense sets efficiently as well. (Adrien Grand)
* LUCENE-5159: Prefix-code the sorted/sortedset value dictionaries in DiskDV.
(Robert Muir)
Documentation
* LUCENE-4894: remove facet userguide as it was outdated. Partially absorbed into

DiskDocValuesConsumer.java

@@ -27,9 +27,11 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
@@ -38,6 +40,7 @@ import org.apache.lucene.util.packed.PackedInts;
public class DiskDocValuesConsumer extends DocValuesConsumer {
static final int BLOCK_SIZE = 16384;
static final int ADDRESS_INTERVAL = 16;
/** Compressed using packed blocks of ints. */
public static final int DELTA_COMPRESSED = 0;
@@ -45,6 +48,13 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
public static final int GCD_COMPRESSED = 1;
/** Compressed by giving IDs to unique values. */
public static final int TABLE_COMPRESSED = 2;
/** Uncompressed binary, written directly (fixed length). */
public static final int BINARY_FIXED_UNCOMPRESSED = 0;
/** Uncompressed binary, written directly (variable length). */
public static final int BINARY_VARIABLE_UNCOMPRESSED = 1;
/** Compressed binary with shared prefixes */
public static final int BINARY_PREFIX_COMPRESSED = 2;
final IndexOutput data, meta;
final int maxDoc;
@@ -173,7 +183,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
}
@Override
public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
// write the byte[] data
meta.writeVInt(field.number);
meta.writeByte(DiskDocValuesFormat.BINARY);
@@ -187,6 +197,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
data.writeBytes(v.bytes, v.offset, v.length);
count++;
}
meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED);
meta.writeVInt(minLength);
meta.writeVInt(maxLength);
meta.writeVLong(count);
@@ -208,12 +219,68 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
writer.finish();
}
}
protected void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
// first check if it's a "fixed-length" terms dict
int minLength = Integer.MAX_VALUE;
int maxLength = Integer.MIN_VALUE;
for (BytesRef v : values) {
minLength = Math.min(minLength, v.length);
maxLength = Math.max(maxLength, v.length);
}
if (minLength == maxLength) {
// no index needed: direct addressing by mult
addBinaryField(field, values);
} else {
// header
meta.writeVInt(field.number);
meta.writeByte(DiskDocValuesFormat.BINARY);
meta.writeVInt(BINARY_PREFIX_COMPRESSED);
// now write the bytes: sharing prefixes within a block
final long startFP = data.getFilePointer();
// currently, we have to store the delta from expected for every nth term
// we could avoid this, but it's not much and less overall RAM than the previous approach!
RAMOutputStream addressBuffer = new RAMOutputStream();
MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCK_SIZE);
BytesRef lastTerm = new BytesRef();
long count = 0;
for (BytesRef v : values) {
if (count % ADDRESS_INTERVAL == 0) {
termAddresses.add(data.getFilePointer() - startFP);
// force the first term in a block to be abs-encoded
lastTerm.length = 0;
}
// prefix-code
int sharedPrefix = StringHelper.bytesDifference(lastTerm, v);
data.writeVInt(sharedPrefix);
data.writeVInt(v.length - sharedPrefix);
data.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix);
lastTerm.copyBytes(v);
count++;
}
final long indexStartFP = data.getFilePointer();
// write addresses of indexed terms
termAddresses.finish();
addressBuffer.writeTo(data);
addressBuffer = null;
termAddresses = null;
meta.writeVInt(minLength);
meta.writeVInt(maxLength);
meta.writeVLong(count);
meta.writeLong(startFP);
meta.writeVInt(ADDRESS_INTERVAL);
meta.writeLong(indexStartFP);
meta.writeVInt(PackedInts.VERSION_CURRENT);
meta.writeVInt(BLOCK_SIZE);
}
}
@Override
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
meta.writeVInt(field.number);
meta.writeByte(DiskDocValuesFormat.SORTED);
addBinaryField(field, values);
addTermsDict(field, values);
addNumericField(field, docToOrd, false);
}
@@ -222,7 +289,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
meta.writeVInt(field.number);
meta.writeByte(DiskDocValuesFormat.SORTED_SET);
// write the ord -> byte[] as a binary field
addBinaryField(field, values);
addTermsDict(field, values);
// write the stream of ords as a numeric field
// NOTE: we could return an iterator that delta-encodes these within a doc
addNumericField(field, ords, false);
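The addTermsDict above stores only (sharedPrefix, suffixLength, suffixBytes) per term plus one MonotonicBlockPackedWriter address per ADDRESS_INTERVAL terms; the reader rebuilds each term by scanning forward from a block leader. A hedged sketch of that decode step, again with plain ints instead of vInts and hypothetical names:

```java
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

// Hypothetical decoder for one front-coded block: the block leader has
// sharedPrefix == 0, later terms copy a prefix of the previous term.
public class PrefixDecodeSketch {
  static List<String> decodeBlock(DataInputStream in, int termsInBlock) throws IOException {
    List<String> terms = new ArrayList<>();
    byte[] prev = new byte[0];
    for (int i = 0; i < termsInBlock; i++) {
      int shared = in.readInt();                  // the real format uses vInts
      int suffix = in.readInt();
      byte[] term = new byte[shared + suffix];
      System.arraycopy(prev, 0, term, 0, shared); // reuse prefix of the previous term
      in.readFully(term, shared, suffix);         // read only the new suffix bytes
      terms.add(new String(term, StandardCharsets.UTF_8));
      prev = term;
    }
    return terms;
  }

  public static void main(String[] args) throws IOException {
    // encode "search", "searching" the same way the write side does, then decode them back
    ByteArrayOutputStream blob = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(blob);
    out.writeInt(0); out.writeInt(6); out.write("search".getBytes(StandardCharsets.UTF_8));
    out.writeInt(6); out.writeInt(3); out.write("ing".getBytes(StandardCharsets.UTF_8));
    List<String> decoded =
        decodeBlock(new DataInputStream(new ByteArrayInputStream(blob.toByteArray())), 2);
    System.out.println(decoded); // [search, searching]
  }
}
```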

DiskDocValuesFormat.java

@@ -53,7 +53,8 @@ public final class DiskDocValuesFormat extends DocValuesFormat {
public static final String META_CODEC = "DiskDocValuesMetadata";
public static final String META_EXTENSION = "dvdm";
public static final int VERSION_START = 0;
public static final int VERSION_CURRENT = VERSION_START;
public static final int VERSION_COMPRESSED_TERMS = 1;
public static final int VERSION_CURRENT = VERSION_COMPRESSED_TERMS;
public static final byte NUMERIC = 0;
public static final byte BINARY = 1;
public static final byte SORTED = 2;

DiskDocValuesProducer.java

@@ -21,7 +21,12 @@ import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.DELTA_COMPRE
import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.GCD_COMPRESSED;
import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.TABLE_COMPRESSED;
import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_FIXED_UNCOMPRESSED;
import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED;
import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_PREFIX_COMPRESSED;
import java.io.IOException;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
@@ -29,6 +34,8 @@ import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
@@ -36,7 +43,10 @@ import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.BlockPackedReader;
@@ -62,7 +72,7 @@ class DiskDocValuesProducer extends DocValuesProducer {
final int version;
try {
version = CodecUtil.checkHeader(in, metaCodec,
DiskDocValuesFormat.VERSION_START,
DiskDocValuesFormat.VERSION_CURRENT,
DiskDocValuesFormat.VERSION_CURRENT);
numerics = new HashMap<Integer,NumericEntry>();
ords = new HashMap<Integer,NumericEntry>();
@@ -84,7 +94,7 @@ class DiskDocValuesProducer extends DocValuesProducer {
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
data = state.directory.openInput(dataName, state.context);
final int version2 = CodecUtil.checkHeader(data, dataCodec,
DiskDocValuesFormat.VERSION_START,
DiskDocValuesFormat.VERSION_CURRENT,
DiskDocValuesFormat.VERSION_CURRENT);
if (version != version2) {
throw new CorruptIndexException("Format versions mismatch");
@@ -196,14 +206,27 @@ class DiskDocValuesProducer extends DocValuesProducer {
static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
BinaryEntry entry = new BinaryEntry();
entry.format = meta.readVInt();
entry.minLength = meta.readVInt();
entry.maxLength = meta.readVInt();
entry.count = meta.readVLong();
entry.offset = meta.readLong();
if (entry.minLength != entry.maxLength) {
entry.addressesOffset = meta.readLong();
entry.packedIntsVersion = meta.readVInt();
entry.blockSize = meta.readVInt();
switch(entry.format) {
case BINARY_FIXED_UNCOMPRESSED:
break;
case BINARY_PREFIX_COMPRESSED:
entry.addressInterval = meta.readVInt();
entry.addressesOffset = meta.readLong();
entry.packedIntsVersion = meta.readVInt();
entry.blockSize = meta.readVInt();
break;
case BINARY_VARIABLE_UNCOMPRESSED:
entry.addressesOffset = meta.readLong();
entry.packedIntsVersion = meta.readVInt();
entry.blockSize = meta.readVInt();
break;
default:
throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta);
}
return entry;
}
@@ -255,10 +278,15 @@ class DiskDocValuesProducer extends DocValuesProducer {
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
BinaryEntry bytes = binaries.get(field.number);
if (bytes.minLength == bytes.maxLength) {
return getFixedBinary(field, bytes);
} else {
return getVariableBinary(field, bytes);
switch(bytes.format) {
case BINARY_FIXED_UNCOMPRESSED:
return getFixedBinary(field, bytes);
case BINARY_VARIABLE_UNCOMPRESSED:
return getVariableBinary(field, bytes);
case BINARY_PREFIX_COMPRESSED:
return getCompressedBinary(field, bytes);
default:
throw new AssertionError();
}
}
@@ -321,6 +349,30 @@ class DiskDocValuesProducer extends DocValuesProducer {
};
}
private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
final IndexInput data = this.data.clone();
final long interval = bytes.addressInterval;
final MonotonicBlockPackedReader addresses;
synchronized (addressInstances) {
MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number);
if (addrInstance == null) {
data.seek(bytes.addressesOffset);
final long size;
if (bytes.count % interval == 0) {
size = bytes.count / interval;
} else {
size = 1L + bytes.count / interval;
}
addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, size, false);
addressInstances.put(field.number, addrInstance);
}
addresses = addrInstance;
}
return new CompressedBinaryDocValues(bytes, addresses, data);
}
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
final int valueCount = (int) binaries.get(field.number).count;
@@ -346,6 +398,24 @@ class DiskDocValuesProducer extends DocValuesProducer {
public int getValueCount() {
return valueCount;
}
@Override
public int lookupTerm(BytesRef key) {
if (binary instanceof CompressedBinaryDocValues) {
return (int) ((CompressedBinaryDocValues)binary).lookupTerm(key);
} else {
return super.lookupTerm(key);
}
}
@Override
public TermsEnum termsEnum() {
if (binary instanceof CompressedBinaryDocValues) {
return ((CompressedBinaryDocValues)binary).getTermsEnum();
} else {
return super.termsEnum();
}
}
};
}
@@ -399,6 +469,24 @@ class DiskDocValuesProducer extends DocValuesProducer {
public long getValueCount() {
return valueCount;
}
@Override
public long lookupTerm(BytesRef key) {
if (binary instanceof CompressedBinaryDocValues) {
return ((CompressedBinaryDocValues)binary).lookupTerm(key);
} else {
return super.lookupTerm(key);
}
}
@Override
public TermsEnum termsEnum() {
if (binary instanceof CompressedBinaryDocValues) {
return ((CompressedBinaryDocValues)binary).getTermsEnum();
} else {
return super.termsEnum();
}
}
};
}
@@ -423,10 +511,12 @@ class DiskDocValuesProducer extends DocValuesProducer {
static class BinaryEntry {
long offset;
int format;
long count;
int minLength;
int maxLength;
long addressesOffset;
long addressInterval;
int packedIntsVersion;
int blockSize;
}
@@ -449,4 +539,204 @@ class DiskDocValuesProducer extends DocValuesProducer {
abstract void get(long id, BytesRef result);
}
// in the compressed case, we add a few additional operations for
// more efficient reverse lookup and enumeration
static class CompressedBinaryDocValues extends LongBinaryDocValues {
final BinaryEntry bytes;
final long interval;
final long numValues;
final long numIndexValues;
final MonotonicBlockPackedReader addresses;
final IndexInput data;
final TermsEnum termsEnum;
public CompressedBinaryDocValues(BinaryEntry bytes, MonotonicBlockPackedReader addresses, IndexInput data) throws IOException {
this.bytes = bytes;
this.interval = bytes.addressInterval;
this.addresses = addresses;
this.data = data;
this.numValues = bytes.count;
this.numIndexValues = addresses.size();
this.termsEnum = getTermsEnum(data);
}
@Override
public void get(long id, BytesRef result) {
try {
termsEnum.seekExact(id);
BytesRef term = termsEnum.term();
result.bytes = term.bytes;
result.offset = term.offset;
result.length = term.length;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
long lookupTerm(BytesRef key) {
try {
SeekStatus status = termsEnum.seekCeil(key);
if (status == SeekStatus.END) {
return -numValues-1;
} else if (status == SeekStatus.FOUND) {
return termsEnum.ord();
} else {
return -termsEnum.ord()-1;
}
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
TermsEnum getTermsEnum() {
try {
return getTermsEnum(data.clone());
} catch (IOException e) {
throw new RuntimeException(e);
}
}
private TermsEnum getTermsEnum(final IndexInput input) throws IOException {
input.seek(bytes.offset);
return new TermsEnum() {
private long currentOrd = -1;
// TODO: maxLength is negative when all terms are merged away...
private final BytesRef termBuffer = new BytesRef(bytes.maxLength < 0 ? 0 : bytes.maxLength);
private final BytesRef term = new BytesRef(); // TODO: paranoia?
@Override
public BytesRef next() throws IOException {
if (doNext() == null) {
return null;
} else {
setTerm();
return term;
}
}
private BytesRef doNext() throws IOException {
if (++currentOrd >= numValues) {
return null;
} else {
int start = input.readVInt();
int suffix = input.readVInt();
input.readBytes(termBuffer.bytes, start, suffix);
termBuffer.length = start + suffix;
return termBuffer;
}
}
@Override
public SeekStatus seekCeil(BytesRef text) throws IOException {
// binary-search just the index values to find the block,
// then scan within the block
long low = 0;
long high = numIndexValues-1;
while (low <= high) {
long mid = (low + high) >>> 1;
doSeek(mid * interval);
int cmp = termBuffer.compareTo(text);
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
// we got lucky, found an indexed term
setTerm();
return SeekStatus.FOUND;
}
}
if (numIndexValues == 0) {
return SeekStatus.END;
}
// block before insertion point
long block = low-1;
doSeek(block < 0 ? -1 : block * interval);
while (doNext() != null) {
int cmp = termBuffer.compareTo(text);
if (cmp == 0) {
setTerm();
return SeekStatus.FOUND;
} else if (cmp > 0) {
setTerm();
return SeekStatus.NOT_FOUND;
}
}
return SeekStatus.END;
}
@Override
public void seekExact(long ord) throws IOException {
doSeek(ord);
setTerm();
}
private void doSeek(long ord) throws IOException {
long block = ord / interval;
if (ord >= currentOrd && block == currentOrd / interval) {
// seek within current block
} else {
// position before start of block
currentOrd = ord - ord % interval - 1;
input.seek(bytes.offset + addresses.get(block));
}
while (currentOrd < ord) {
doNext();
}
}
private void setTerm() {
// TODO: is there a cleaner way
term.bytes = new byte[termBuffer.length];
term.offset = 0;
term.copyBytes(termBuffer);
}
@Override
public BytesRef term() throws IOException {
return term;
}
@Override
public long ord() throws IOException {
return currentOrd;
}
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public int docFreq() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public long totalTermFreq() throws IOException {
return -1;
}
@Override
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
throw new UnsupportedOperationException();
}
};
}
}
}
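The key read-side addition is CompressedBinaryDocValues.lookupTerm/seekCeil: instead of binary-searching every ord (which would decode a full term per probe), it binary-searches only the block leaders, one per ADDRESS_INTERVAL terms, then scans at most one block sequentially. A standalone sketch of that two-phase strategy over an in-memory sorted list (the real code decodes terms from disk as it scans; names here are illustrative):

```java
import java.util.*;

// Sketch of the two-phase reverse lookup used by CompressedBinaryDocValues:
// binary-search the indexed terms (one per INTERVAL), then scan within the block.
public class TwoPhaseLookupSketch {
  static final int INTERVAL = 16;

  /** Returns the ord of key, or -(insertionPoint+1) if absent, mirroring lookupTerm. */
  static long lookup(List<String> sortedTerms, String key) {
    long numIndexValues = (sortedTerms.size() + INTERVAL - 1) / INTERVAL;
    long low = 0, high = numIndexValues - 1;
    while (low <= high) {
      long mid = (low + high) >>> 1;
      int cmp = sortedTerms.get((int) (mid * INTERVAL)).compareTo(key);
      if (cmp < 0) low = mid + 1;
      else if (cmp > 0) high = mid - 1;
      else return mid * INTERVAL;            // hit an indexed term directly
    }
    long block = Math.max(low - 1, 0);       // block before the insertion point
    for (long ord = block * INTERVAL; ord < sortedTerms.size(); ord++) {
      int cmp = sortedTerms.get((int) ord).compareTo(key);
      if (cmp == 0) return ord;
      if (cmp > 0) return -ord - 1;          // NOT_FOUND: insertion point
    }
    return -sortedTerms.size() - 1;          // END
  }

  public static void main(String[] args) {
    List<String> terms = Arrays.asList("apple", "banana", "cherry", "kiwi", "plum");
    System.out.println(lookup(terms, "cherry"));  // 2
    System.out.println(lookup(terms, "coconut")); // -4, i.e. would insert at ord 3
  }
}
```

The negative return convention (-insertionPoint-1) matches SortedDocValues.lookupTerm, which is why the SortedDocValues and SortedSetDocValues overrides above can delegate to it directly.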

MonotonicBlockPackedReader.java

@@ -78,5 +78,10 @@ public final class MonotonicBlockPackedReader {
final int idx = (int) (index & blockMask);
return minValues[block] + (long) (idx * averages[block]) + zigZagDecode(subReaders[block].get(idx));
}
/** Returns the number of values */
public long size() {
return valueCount;
}
}
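size() is added so the producer can hand CompressedBinaryDocValues the number of stored term addresses (numIndexValues) without recomputing it; on the write side that count is simply the ceiling of count / ADDRESS_INTERVAL, as the getCompressedBinary snippet above computes. A throwaway illustration of that arithmetic (hypothetical names):

```java
// Hypothetical helper: how many block-leader addresses exist for `count` terms,
// one address per `interval` (ADDRESS_INTERVAL = 16) terms. Same ceil-division
// as in getCompressedBinary() and the same value size() reports back.
class AddressCountSketch {
  static long numIndexedTerms(long count, long interval) {
    return (count % interval == 0) ? count / interval : 1L + count / interval;
  }
  public static void main(String[] args) {
    System.out.println(numIndexedTerms(100, 16)); // 7
    System.out.println(numIndexedTerms(96, 16));  // 6
  }
}
```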

CheapBastardDocValuesFormat.java

@@ -24,8 +24,10 @@ import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer;
import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.BytesRef;
/**
* DocValues format that keeps everything on disk.
@@ -53,7 +55,13 @@ public final class CheapBastardDocValuesFormat extends DocValuesFormat {
return new DiskDocValuesConsumer(state, DiskDocValuesFormat.DATA_CODEC,
DiskDocValuesFormat.DATA_EXTENSION,
DiskDocValuesFormat.META_CODEC,
DiskDocValuesFormat.META_EXTENSION);
DiskDocValuesFormat.META_EXTENSION) {
// don't ever write an index, we don't want to use RAM :)
@Override
protected void addTermsDict(FieldInfo field, Iterable<BytesRef> values) throws IOException {
addBinaryField(field, values);
}
};
}
@Override

CheapBastardDocValuesProducer.java

@@ -27,6 +27,7 @@ import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer;
import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
@@ -58,7 +59,7 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
final int version;
try {
version = CodecUtil.checkHeader(in, metaCodec,
DiskDocValuesFormat.VERSION_START,
DiskDocValuesFormat.VERSION_CURRENT,
DiskDocValuesFormat.VERSION_CURRENT);
numerics = new HashMap<Integer,NumericEntry>();
ords = new HashMap<Integer,NumericEntry>();
@@ -80,7 +81,7 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
data = state.directory.openInput(dataName, state.context);
final int version2 = CodecUtil.checkHeader(data, dataCodec,
DiskDocValuesFormat.VERSION_START,
DiskDocValuesFormat.VERSION_CURRENT,
DiskDocValuesFormat.VERSION_CURRENT);
if (version != version2) {
throw new CorruptIndexException("Versions mismatch");
@@ -193,6 +194,10 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
BinaryEntry entry = new BinaryEntry();
int format = meta.readVInt();
if (format != DiskDocValuesConsumer.BINARY_FIXED_UNCOMPRESSED && format != DiskDocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED) {
throw new CorruptIndexException("Unexpected format for binary entry: " + format + ", input=" + meta);
}
entry.minLength = meta.readVInt();
entry.maxLength = meta.readVInt();
entry.count = meta.readVLong();

BaseDocValuesFormatTestCase.java

@@ -1350,6 +1350,57 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
dir.close();
}
private void doTestSortedVsFieldCache(int minLength, int maxLength) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
Document doc = new Document();
Field idField = new StringField("id", "", Field.Store.NO);
Field indexedField = new StringField("indexed", "", Field.Store.NO);
Field dvField = new SortedDocValuesField("dv", new BytesRef());
doc.add(idField);
doc.add(indexedField);
doc.add(dvField);
// index some docs
int numDocs = atLeast(300);
for (int i = 0; i < numDocs; i++) {
idField.setStringValue(Integer.toString(i));
final int length;
if (minLength == maxLength) {
length = minLength; // fixed length
} else {
length = _TestUtil.nextInt(random(), minLength, maxLength);
}
String value = _TestUtil.randomSimpleString(random(), length);
indexedField.setStringValue(value);
dvField.setBytesValue(new BytesRef(value));
writer.addDocument(doc);
if (random().nextInt(31) == 0) {
writer.commit();
}
}
// delete some docs
int numDeletions = random().nextInt(numDocs/10);
for (int i = 0; i < numDeletions; i++) {
int id = random().nextInt(numDocs);
writer.deleteDocuments(new Term("id", Integer.toString(id)));
}
writer.close();
// compare
DirectoryReader ir = DirectoryReader.open(dir);
for (AtomicReaderContext context : ir.leaves()) {
AtomicReader r = context.reader();
SortedDocValues expected = FieldCache.DEFAULT.getTermsIndex(r, "indexed");
SortedDocValues actual = r.getSortedDocValues("dv");
assertEquals(r.maxDoc(), expected, actual);
}
ir.close();
dir.close();
}
public void testSortedFixedLengthVsStoredFields() throws Exception {
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
@@ -1358,6 +1409,21 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
}
}
public void testSortedFixedLengthVsFieldCache() throws Exception {
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
int fixedLength = _TestUtil.nextInt(random(), 1, 10);
doTestSortedVsFieldCache(fixedLength, fixedLength);
}
}
public void testSortedVariableLengthVsFieldCache() throws Exception {
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
doTestSortedVsFieldCache(1, 10);
}
}
public void testSortedVariableLengthVsStoredFields() throws Exception {
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
@@ -1905,6 +1971,10 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
}
}
private void assertEquals(int maxDoc, SortedDocValues expected, SortedDocValues actual) throws Exception {
assertEquals(maxDoc, new SingletonSortedSetDocValues(expected), new SingletonSortedSetDocValues(actual));
}
private void assertEquals(int maxDoc, SortedSetDocValues expected, SortedSetDocValues actual) throws Exception {
// can be null for the segment if no docs actually had any SortedDocValues
// in this case FC.getDocTermsOrds returns EMPTY
@@ -1932,6 +2002,74 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
actual.lookupTerm(actualBytes);
assertEquals(expectedBytes, actualBytes);
}
// compare termsenum
assertEquals(expected.getValueCount(), expected.termsEnum(), actual.termsEnum());
}
private void assertEquals(long numOrds, TermsEnum expected, TermsEnum actual) throws Exception {
BytesRef ref;
// sequential next() through all terms
while ((ref = expected.next()) != null) {
assertEquals(ref, actual.next());
assertEquals(expected.ord(), actual.ord());
assertEquals(expected.term(), actual.term());
}
assertNull(actual.next());
// sequential seekExact(ord) through all terms
for (long i = 0; i < numOrds; i++) {
expected.seekExact(i);
actual.seekExact(i);
assertEquals(expected.ord(), actual.ord());
assertEquals(expected.term(), actual.term());
}
// sequential seekExact(BytesRef) through all terms
for (long i = 0; i < numOrds; i++) {
expected.seekExact(i);
assertTrue(actual.seekExact(expected.term()));
assertEquals(expected.ord(), actual.ord());
assertEquals(expected.term(), actual.term());
}
// sequential seekCeil(BytesRef) through all terms
for (long i = 0; i < numOrds; i++) {
expected.seekExact(i);
assertEquals(SeekStatus.FOUND, actual.seekCeil(expected.term()));
assertEquals(expected.ord(), actual.ord());
assertEquals(expected.term(), actual.term());
}
// random seekExact(ord)
for (long i = 0; i < numOrds; i++) {
long randomOrd = _TestUtil.nextLong(random(), 0, numOrds-1);
expected.seekExact(randomOrd);
actual.seekExact(randomOrd);
assertEquals(expected.ord(), actual.ord());
assertEquals(expected.term(), actual.term());
}
// random seekExact(BytesRef)
for (long i = 0; i < numOrds; i++) {
long randomOrd = _TestUtil.nextLong(random(), 0, numOrds-1);
expected.seekExact(randomOrd);
actual.seekExact(expected.term());
assertEquals(expected.ord(), actual.ord());
assertEquals(expected.term(), actual.term());
}
// random seekCeil(BytesRef)
for (long i = 0; i < numOrds; i++) {
BytesRef target = new BytesRef(_TestUtil.randomUnicodeString(random()));
SeekStatus expectedStatus = expected.seekCeil(target);
assertEquals(expectedStatus, actual.seekCeil(target));
if (expectedStatus != SeekStatus.END) {
assertEquals(expected.ord(), actual.ord());
assertEquals(expected.term(), actual.term());
}
}
}
private void doTestSortedSetVsUninvertedField(int minLength, int maxLength) throws Exception {