mirror of https://github.com/apache/lucene.git

LUCENE-5159: prefix-code the sorted/sortedset value dictionaries in DiskDV

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1512543 13f79535-47bb-0310-9956-ffa450edef68

commit 9cb38dd42a
parent b2070fefbc
@@ -168,6 +168,9 @@ Optimizations
 * LUCENE-5150: Make WAH8DocIdSet able to inverse its encoding in order to
   compress dense sets efficiently as well. (Adrien Grand)
 
+* LUCENE-5159: Prefix-code the sorted/sortedset value dictionaries in DiskDV.
+  (Robert Muir)
+
 Documentation
 
 * LUCENE-4894: remove facet userguide as it was outdated. Partially absorbed into
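The gist of the change: because a sorted/sortedset value dictionary is written in sorted order, consecutive terms tend to share long prefixes, so each term can be stored as a (shared-prefix length, suffix) pair instead of its full bytes. A minimal standalone sketch of that encoding, with illustrative names only (this is not the DiskDV code itself):

import java.nio.charset.StandardCharsets;

// Minimal sketch of prefix-coding a sorted dictionary: each entry stores how
// many leading bytes it shares with the previous entry, plus its new suffix.
public class PrefixCodeSketch {
  public static void main(String[] args) {
    String[] sorted = { "index", "indexed", "indexing", "inverted" };
    byte[] last = new byte[0];
    for (String s : sorted) {
      byte[] cur = s.getBytes(StandardCharsets.UTF_8);
      int shared = 0;
      while (shared < last.length && shared < cur.length && last[shared] == cur[shared]) {
        shared++;
      }
      // On disk this would be writeVInt(shared), writeVInt(cur.length - shared),
      // writeBytes(suffix); here we just print the encoding.
      System.out.println(shared + "+" + s.substring(shared));
      last = cur;
    }
    // prints: 0+index, 5+ed, 5+ing, 2+verted
  }
}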
@@ -27,9 +27,11 @@ import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.MathUtil;
+import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.packed.BlockPackedWriter;
 import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
 import org.apache.lucene.util.packed.PackedInts;
@@ -38,6 +40,7 @@ import org.apache.lucene.util.packed.PackedInts;
 public class DiskDocValuesConsumer extends DocValuesConsumer {
 
   static final int BLOCK_SIZE = 16384;
+  static final int ADDRESS_INTERVAL = 16;
 
   /** Compressed using packed blocks of ints. */
   public static final int DELTA_COMPRESSED = 0;
@@ -45,6 +48,13 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
   public static final int GCD_COMPRESSED = 1;
   /** Compressed by giving IDs to unique values. */
   public static final int TABLE_COMPRESSED = 2;
+
+  /** Uncompressed binary, written directly (fixed length). */
+  public static final int BINARY_FIXED_UNCOMPRESSED = 0;
+  /** Uncompressed binary, written directly (variable length). */
+  public static final int BINARY_VARIABLE_UNCOMPRESSED = 1;
+  /** Compressed binary with shared prefixes */
+  public static final int BINARY_PREFIX_COMPRESSED = 2;
 
   final IndexOutput data, meta;
   final int maxDoc;
@@ -173,7 +183,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
   }
 
   @Override
-  public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
+  public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
     // write the byte[] data
     meta.writeVInt(field.number);
     meta.writeByte(DiskDocValuesFormat.BINARY);
@@ -187,6 +197,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
       data.writeBytes(v.bytes, v.offset, v.length);
       count++;
     }
+    meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED);
     meta.writeVInt(minLength);
     meta.writeVInt(maxLength);
     meta.writeVLong(count);
@@ -208,12 +219,68 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
       writer.finish();
     }
   }
 
+  protected void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
+    // first check if its a "fixed-length" terms dict
+    int minLength = Integer.MAX_VALUE;
+    int maxLength = Integer.MIN_VALUE;
+    for (BytesRef v : values) {
+      minLength = Math.min(minLength, v.length);
+      maxLength = Math.max(maxLength, v.length);
+    }
+    if (minLength == maxLength) {
+      // no index needed: direct addressing by mult
+      addBinaryField(field, values);
+    } else {
+      // header
+      meta.writeVInt(field.number);
+      meta.writeByte(DiskDocValuesFormat.BINARY);
+      meta.writeVInt(BINARY_PREFIX_COMPRESSED);
+      // now write the bytes: sharing prefixes within a block
+      final long startFP = data.getFilePointer();
+      // currently, we have to store the delta from expected for every 1/nth term
+      // we could avoid this, but its not much and less overall RAM than the previous approach!
+      RAMOutputStream addressBuffer = new RAMOutputStream();
+      MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCK_SIZE);
+      BytesRef lastTerm = new BytesRef();
+      long count = 0;
+      for (BytesRef v : values) {
+        if (count % ADDRESS_INTERVAL == 0) {
+          termAddresses.add(data.getFilePointer() - startFP);
+          // force the first term in a block to be abs-encoded
+          lastTerm.length = 0;
+        }
+
+        // prefix-code
+        int sharedPrefix = StringHelper.bytesDifference(lastTerm, v);
+        data.writeVInt(sharedPrefix);
+        data.writeVInt(v.length - sharedPrefix);
+        data.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix);
+        lastTerm.copyBytes(v);
+        count++;
+      }
+      final long indexStartFP = data.getFilePointer();
+      // write addresses of indexed terms
+      termAddresses.finish();
+      addressBuffer.writeTo(data);
+      addressBuffer = null;
+      termAddresses = null;
+      meta.writeVInt(minLength);
+      meta.writeVInt(maxLength);
+      meta.writeVLong(count);
+      meta.writeLong(startFP);
+      meta.writeVInt(ADDRESS_INTERVAL);
+      meta.writeLong(indexStartFP);
+      meta.writeVInt(PackedInts.VERSION_CURRENT);
+      meta.writeVInt(BLOCK_SIZE);
+    }
+  }
+
   @Override
   public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
     meta.writeVInt(field.number);
     meta.writeByte(DiskDocValuesFormat.SORTED);
-    addBinaryField(field, values);
+    addTermsDict(field, values);
     addNumericField(field, docToOrd, false);
   }
 
@@ -222,7 +289,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
     meta.writeVInt(field.number);
     meta.writeByte(DiskDocValuesFormat.SORTED_SET);
     // write the ord -> byte[] as a binary field
-    addBinaryField(field, values);
+    addTermsDict(field, values);
     // write the stream of ords as a numeric field
     // NOTE: we could return an iterator that delta-encodes these within a doc
     addNumericField(field, ords, false);
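addTermsDict above records an address for every ADDRESS_INTERVAL-th (16th) term, so a reader can jump near any ord and scan forward from the nearest absolute-encoded term. A small sketch of the arithmetic, mirroring the size computation the producer uses when loading that index (illustrative names, not Lucene API):

// How many index entries does the term index hold? One address per
// ADDRESS_INTERVAL terms, rounded up -- mirroring the reader's computation.
public class AddressIndexSizeSketch {
  static long indexSize(long count, long interval) {
    return (count % interval == 0) ? count / interval : 1 + count / interval;
  }
  public static void main(String[] args) {
    System.out.println(indexSize(100, 16)); // 7: terms 0,16,32,48,64,80,96 are indexed
    System.out.println(indexSize(32, 16));  // 2: terms 0 and 16
  }
}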
@@ -53,7 +53,8 @@ public final class DiskDocValuesFormat extends DocValuesFormat {
   public static final String META_CODEC = "DiskDocValuesMetadata";
   public static final String META_EXTENSION = "dvdm";
   public static final int VERSION_START = 0;
-  public static final int VERSION_CURRENT = VERSION_START;
+  public static final int VERSION_COMPRESSED_TERMS = 1;
+  public static final int VERSION_CURRENT = VERSION_COMPRESSED_TERMS;
   public static final byte NUMERIC = 0;
   public static final byte BINARY = 1;
   public static final byte SORTED = 2;
@@ -21,7 +21,12 @@ import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.DELTA_COMPRESSED;
 import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.GCD_COMPRESSED;
 import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.TABLE_COMPRESSED;
+
+import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_FIXED_UNCOMPRESSED;
+import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED;
+import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_PREFIX_COMPRESSED;
 
 import java.io.IOException;
+import java.util.Comparator;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -29,6 +34,8 @@ import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexFileNames;
@@ -36,7 +43,10 @@ import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.TermsEnum.SeekStatus;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.packed.BlockPackedReader;
@@ -62,7 +72,7 @@ class DiskDocValuesProducer extends DocValuesProducer {
     final int version;
     try {
       version = CodecUtil.checkHeader(in, metaCodec,
-                                      DiskDocValuesFormat.VERSION_START,
+                                      DiskDocValuesFormat.VERSION_CURRENT,
                                       DiskDocValuesFormat.VERSION_CURRENT);
       numerics = new HashMap<Integer,NumericEntry>();
       ords = new HashMap<Integer,NumericEntry>();
@@ -84,7 +94,7 @@ class DiskDocValuesProducer extends DocValuesProducer {
       String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
       data = state.directory.openInput(dataName, state.context);
       final int version2 = CodecUtil.checkHeader(data, dataCodec,
-                                                 DiskDocValuesFormat.VERSION_START,
+                                                 DiskDocValuesFormat.VERSION_CURRENT,
                                                  DiskDocValuesFormat.VERSION_CURRENT);
       if (version != version2) {
         throw new CorruptIndexException("Format versions mismatch");
@@ -196,14 +206,27 @@ class DiskDocValuesProducer extends DocValuesProducer {
 
   static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
     BinaryEntry entry = new BinaryEntry();
+    entry.format = meta.readVInt();
     entry.minLength = meta.readVInt();
     entry.maxLength = meta.readVInt();
     entry.count = meta.readVLong();
     entry.offset = meta.readLong();
-    if (entry.minLength != entry.maxLength) {
-      entry.addressesOffset = meta.readLong();
-      entry.packedIntsVersion = meta.readVInt();
-      entry.blockSize = meta.readVInt();
+    switch(entry.format) {
+      case BINARY_FIXED_UNCOMPRESSED:
+        break;
+      case BINARY_PREFIX_COMPRESSED:
+        entry.addressInterval = meta.readVInt();
+        entry.addressesOffset = meta.readLong();
+        entry.packedIntsVersion = meta.readVInt();
+        entry.blockSize = meta.readVInt();
+        break;
+      case BINARY_VARIABLE_UNCOMPRESSED:
+        entry.addressesOffset = meta.readLong();
+        entry.packedIntsVersion = meta.readVInt();
+        entry.blockSize = meta.readVInt();
+        break;
+      default:
+        throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta);
     }
     return entry;
   }
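For orientation, the per-field binary metadata that readBinaryEntry above decodes amounts to the following layout (a summary derived from the reads in the code, not an official format spec):

  format (vint), minLength (vint), maxLength (vint), count (vlong), offset (long)
  then, by format:
    BINARY_FIXED_UNCOMPRESSED: nothing further
    BINARY_VARIABLE_UNCOMPRESSED: addressesOffset (long), packedIntsVersion (vint), blockSize (vint)
    BINARY_PREFIX_COMPRESSED: addressInterval (vint), addressesOffset (long), packedIntsVersion (vint), blockSize (vint)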
@@ -255,10 +278,15 @@ class DiskDocValuesProducer extends DocValuesProducer {
   @Override
   public BinaryDocValues getBinary(FieldInfo field) throws IOException {
     BinaryEntry bytes = binaries.get(field.number);
-    if (bytes.minLength == bytes.maxLength) {
-      return getFixedBinary(field, bytes);
-    } else {
-      return getVariableBinary(field, bytes);
+    switch(bytes.format) {
+      case BINARY_FIXED_UNCOMPRESSED:
+        return getFixedBinary(field, bytes);
+      case BINARY_VARIABLE_UNCOMPRESSED:
+        return getVariableBinary(field, bytes);
+      case BINARY_PREFIX_COMPRESSED:
+        return getCompressedBinary(field, bytes);
+      default:
+        throw new AssertionError();
     }
   }
 
@@ -321,6 +349,30 @@ class DiskDocValuesProducer extends DocValuesProducer {
     };
   }
 
+  private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
+    final IndexInput data = this.data.clone();
+    final long interval = bytes.addressInterval;
+
+    final MonotonicBlockPackedReader addresses;
+    synchronized (addressInstances) {
+      MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number);
+      if (addrInstance == null) {
+        data.seek(bytes.addressesOffset);
+        final long size;
+        if (bytes.count % interval == 0) {
+          size = bytes.count / interval;
+        } else {
+          size = 1L + bytes.count / interval;
+        }
+        addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, size, false);
+        addressInstances.put(field.number, addrInstance);
+      }
+      addresses = addrInstance;
+    }
+
+    return new CompressedBinaryDocValues(bytes, addresses, data);
+  }
+
   @Override
   public SortedDocValues getSorted(FieldInfo field) throws IOException {
     final int valueCount = (int) binaries.get(field.number).count;
@@ -346,6 +398,24 @@ class DiskDocValuesProducer extends DocValuesProducer {
       public int getValueCount() {
         return valueCount;
       }
+
+      @Override
+      public int lookupTerm(BytesRef key) {
+        if (binary instanceof CompressedBinaryDocValues) {
+          return (int) ((CompressedBinaryDocValues)binary).lookupTerm(key);
+        } else {
+          return super.lookupTerm(key);
+        }
+      }
+
+      @Override
+      public TermsEnum termsEnum() {
+        if (binary instanceof CompressedBinaryDocValues) {
+          return ((CompressedBinaryDocValues)binary).getTermsEnum();
+        } else {
+          return super.termsEnum();
+        }
+      }
     };
   }
 
@@ -399,6 +469,24 @@ class DiskDocValuesProducer extends DocValuesProducer {
       public long getValueCount() {
         return valueCount;
       }
+
+      @Override
+      public long lookupTerm(BytesRef key) {
+        if (binary instanceof CompressedBinaryDocValues) {
+          return ((CompressedBinaryDocValues)binary).lookupTerm(key);
+        } else {
+          return super.lookupTerm(key);
+        }
+      }
+
+      @Override
+      public TermsEnum termsEnum() {
+        if (binary instanceof CompressedBinaryDocValues) {
+          return ((CompressedBinaryDocValues)binary).getTermsEnum();
+        } else {
+          return super.termsEnum();
+        }
+      }
     };
   }
 
@@ -423,10 +511,12 @@ class DiskDocValuesProducer extends DocValuesProducer {
   static class BinaryEntry {
     long offset;
 
+    int format;
     long count;
     int minLength;
     int maxLength;
     long addressesOffset;
+    long addressInterval;
     int packedIntsVersion;
     int blockSize;
   }
@@ -449,4 +539,204 @@ class DiskDocValuesProducer extends DocValuesProducer {
 
     abstract void get(long id, BytesRef Result);
   }
+
+  // in the compressed case, we add a few additional operations for
+  // more efficient reverse lookup and enumeration
+  static class CompressedBinaryDocValues extends LongBinaryDocValues {
+    final BinaryEntry bytes;
+    final long interval;
+    final long numValues;
+    final long numIndexValues;
+    final MonotonicBlockPackedReader addresses;
+    final IndexInput data;
+    final TermsEnum termsEnum;
+
+    public CompressedBinaryDocValues(BinaryEntry bytes, MonotonicBlockPackedReader addresses, IndexInput data) throws IOException {
+      this.bytes = bytes;
+      this.interval = bytes.addressInterval;
+      this.addresses = addresses;
+      this.data = data;
+      this.numValues = bytes.count;
+      this.numIndexValues = addresses.size();
+      this.termsEnum = getTermsEnum(data);
+    }
+
+    @Override
+    public void get(long id, BytesRef result) {
+      try {
+        termsEnum.seekExact(id);
+        BytesRef term = termsEnum.term();
+        result.bytes = term.bytes;
+        result.offset = term.offset;
+        result.length = term.length;
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    long lookupTerm(BytesRef key) {
+      try {
+        SeekStatus status = termsEnum.seekCeil(key);
+        if (status == SeekStatus.END) {
+          return -numValues-1;
+        } else if (status == SeekStatus.FOUND) {
+          return termsEnum.ord();
+        } else {
+          return -termsEnum.ord()-1;
+        }
+      } catch (IOException bogus) {
+        throw new RuntimeException(bogus);
+      }
+    }
+
+    TermsEnum getTermsEnum() {
+      try {
+        return getTermsEnum(data.clone());
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    private TermsEnum getTermsEnum(final IndexInput input) throws IOException {
+      input.seek(bytes.offset);
+
+      return new TermsEnum() {
+        private long currentOrd = -1;
+        // TODO: maxLength is negative when all terms are merged away...
+        private final BytesRef termBuffer = new BytesRef(bytes.maxLength < 0 ? 0 : bytes.maxLength);
+        private final BytesRef term = new BytesRef(); // TODO: paranoia?
+
+        @Override
+        public BytesRef next() throws IOException {
+          if (doNext() == null) {
+            return null;
+          } else {
+            setTerm();
+            return term;
+          }
+        }
+
+        private BytesRef doNext() throws IOException {
+          if (++currentOrd >= numValues) {
+            return null;
+          } else {
+            int start = input.readVInt();
+            int suffix = input.readVInt();
+            input.readBytes(termBuffer.bytes, start, suffix);
+            termBuffer.length = start + suffix;
+            return termBuffer;
+          }
+        }
+
+        @Override
+        public SeekStatus seekCeil(BytesRef text) throws IOException {
+          // binary-search just the index values to find the block,
+          // then scan within the block
+          long low = 0;
+          long high = numIndexValues-1;
+
+          while (low <= high) {
+            long mid = (low + high) >>> 1;
+            doSeek(mid * interval);
+            int cmp = termBuffer.compareTo(text);
+
+            if (cmp < 0) {
+              low = mid + 1;
+            } else if (cmp > 0) {
+              high = mid - 1;
+            } else {
+              // we got lucky, found an indexed term
+              setTerm();
+              return SeekStatus.FOUND;
+            }
+          }
+
+          if (numIndexValues == 0) {
+            return SeekStatus.END;
+          }
+
+          // block before insertion point
+          long block = low-1;
+          doSeek(block < 0 ? -1 : block * interval);
+
+          while (doNext() != null) {
+            int cmp = termBuffer.compareTo(text);
+            if (cmp == 0) {
+              setTerm();
+              return SeekStatus.FOUND;
+            } else if (cmp > 0) {
+              setTerm();
+              return SeekStatus.NOT_FOUND;
+            }
+          }
+
+          return SeekStatus.END;
+        }
+
+        @Override
+        public void seekExact(long ord) throws IOException {
+          doSeek(ord);
+          setTerm();
+        }
+
+        private void doSeek(long ord) throws IOException {
+          long block = ord / interval;
+
+          if (ord >= currentOrd && block == currentOrd / interval) {
+            // seek within current block
+          } else {
+            // position before start of block
+            currentOrd = ord - ord % interval - 1;
+            input.seek(bytes.offset + addresses.get(block));
+          }
+
+          while (currentOrd < ord) {
+            doNext();
+          }
+        }
+
+        private void setTerm() {
+          // TODO: is there a cleaner way
+          term.bytes = new byte[termBuffer.length];
+          term.offset = 0;
+          term.copyBytes(termBuffer);
+        }
+
+        @Override
+        public BytesRef term() throws IOException {
+          return term;
+        }
+
+        @Override
+        public long ord() throws IOException {
+          return currentOrd;
+        }
+
+        @Override
+        public Comparator<BytesRef> getComparator() {
+          return BytesRef.getUTF8SortedAsUnicodeComparator();
+        }
+
+        @Override
+        public int docFreq() throws IOException {
+          throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public long totalTermFreq() throws IOException {
+          return -1;
+        }
+
+        @Override
+        public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
+          throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
+          throw new UnsupportedOperationException();
+        }
+      };
+    }
+  }
 }
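The TermsEnum above never materializes the dictionary: seekCeil binary-searches only the indexed terms (every addressInterval-th ord, which are absolute-encoded), then prefix-decodes forward within a single block. A self-contained sketch of that two-phase seek over an in-memory sorted array (illustrative and simplified: no prefix decoding, index entry every INTERVAL-th element; not Lucene API):

import java.util.Arrays;

// Binary-search only the sparse index, then scan forward within the block.
public class SparseIndexSeekSketch {
  static final int INTERVAL = 4;

  // returns the ord of the first value >= key, or -1 if none
  static int seekCeil(String[] sorted, String key) {
    // binary-search the indexed ords: 0, INTERVAL, 2*INTERVAL, ...
    int numIndexed = (sorted.length + INTERVAL - 1) / INTERVAL;
    int low = 0, high = numIndexed - 1;
    while (low <= high) {
      int mid = (low + high) >>> 1;
      int cmp = sorted[mid * INTERVAL].compareTo(key);
      if (cmp < 0) low = mid + 1;
      else if (cmp > 0) high = mid - 1;
      else return mid * INTERVAL; // found an indexed term
    }
    // scan the block before the insertion point
    int start = Math.max(0, (low - 1) * INTERVAL);
    for (int ord = start; ord < sorted.length; ord++) {
      if (sorted[ord].compareTo(key) >= 0) return ord;
    }
    return -1; // past the end (SeekStatus.END)
  }

  public static void main(String[] args) {
    String[] terms = { "ant", "bee", "cat", "dog", "eel", "fox", "gnu", "hen", "owl" };
    System.out.println(seekCeil(terms, "cow")); // 3 ("dog")
    System.out.println(seekCeil(terms, "eel")); // 4 (exact hit on an indexed term)
    System.out.println(seekCeil(terms, "zzz")); // -1 (END)
  }
}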
@@ -78,5 +78,10 @@ public final class MonotonicBlockPackedReader {
     final int idx = (int) (index & blockMask);
     return minValues[block] + (long) (idx * averages[block]) + zigZagDecode(subReaders[block].get(idx));
   }
 
+  /** Returns the number of values */
+  public long size() {
+    return valueCount;
+  }
+
 }
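The new size() accessor sits next to the decode formula shown in context above: value = minValue + (long)(idx * average) + zigZagDecode(delta), i.e. each block stores a base, a slope, and zig-zag-encoded deviations from that line. A worked example with made-up numbers (zigZagDecode is defined inline here for self-containment):

// Worked example of the monotonic block-packed decode formula.
public class MonotonicDecodeSketch {
  static long zigZagDecode(long n) {
    return (n >>> 1) ^ -(n & 1); // 0,1,2,3,... -> 0,-1,1,-2,...
  }
  public static void main(String[] args) {
    long minValue = 100;   // smallest value in the block
    float average = 2.5f;  // expected slope within the block
    // stored packed deltas are zig-zag encoded deviations from the line
    long[] packed = { 0, 1, 2, 4 }; // decode to 0, -1, 1, 2
    for (int idx = 0; idx < packed.length; idx++) {
      long value = minValue + (long) (idx * average) + zigZagDecode(packed[idx]);
      System.out.println(value); // 100, 101, 106, 109
    }
  }
}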
@@ -24,8 +24,10 @@ import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer;
 import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat;
+import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.BytesRef;
 
 /**
  * DocValues format that keeps everything on disk.
@@ -53,7 +55,13 @@ public final class CheapBastardDocValuesFormat extends DocValuesFormat {
     return new DiskDocValuesConsumer(state, DiskDocValuesFormat.DATA_CODEC,
                                      DiskDocValuesFormat.DATA_EXTENSION,
                                      DiskDocValuesFormat.META_CODEC,
-                                     DiskDocValuesFormat.META_EXTENSION);
+                                     DiskDocValuesFormat.META_EXTENSION) {
+      // don't ever write an index, we dont want to use RAM :)
+      @Override
+      protected void addTermsDict(FieldInfo field, Iterable<BytesRef> values) throws IOException {
+        addBinaryField(field, values);
+      }
+    };
   }
 
   @Override
@@ -27,6 +27,7 @@ import java.util.Map;
 
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer;
 import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.CorruptIndexException;
@@ -58,7 +59,7 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
     final int version;
     try {
       version = CodecUtil.checkHeader(in, metaCodec,
-                                      DiskDocValuesFormat.VERSION_START,
+                                      DiskDocValuesFormat.VERSION_CURRENT,
                                       DiskDocValuesFormat.VERSION_CURRENT);
       numerics = new HashMap<Integer,NumericEntry>();
       ords = new HashMap<Integer,NumericEntry>();
@@ -80,7 +81,7 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
       String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
       data = state.directory.openInput(dataName, state.context);
       final int version2 = CodecUtil.checkHeader(data, dataCodec,
-                                                 DiskDocValuesFormat.VERSION_START,
+                                                 DiskDocValuesFormat.VERSION_CURRENT,
                                                  DiskDocValuesFormat.VERSION_CURRENT);
       if (version != version2) {
         throw new CorruptIndexException("Versions mismatch");
@@ -193,6 +194,10 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
 
   static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
     BinaryEntry entry = new BinaryEntry();
+    int format = meta.readVInt();
+    if (format != DiskDocValuesConsumer.BINARY_FIXED_UNCOMPRESSED && format != DiskDocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED) {
+      throw new CorruptIndexException("Unexpected format for binary entry: " + format + ", input=" + meta);
+    }
     entry.minLength = meta.readVInt();
     entry.maxLength = meta.readVInt();
     entry.count = meta.readVLong();
@@ -1350,6 +1350,57 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
     dir.close();
   }
 
+  private void doTestSortedVsFieldCache(int minLength, int maxLength) throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
+    Document doc = new Document();
+    Field idField = new StringField("id", "", Field.Store.NO);
+    Field indexedField = new StringField("indexed", "", Field.Store.NO);
+    Field dvField = new SortedDocValuesField("dv", new BytesRef());
+    doc.add(idField);
+    doc.add(indexedField);
+    doc.add(dvField);
+
+    // index some docs
+    int numDocs = atLeast(300);
+    for (int i = 0; i < numDocs; i++) {
+      idField.setStringValue(Integer.toString(i));
+      final int length;
+      if (minLength == maxLength) {
+        length = minLength; // fixed length
+      } else {
+        length = _TestUtil.nextInt(random(), minLength, maxLength);
+      }
+      String value = _TestUtil.randomSimpleString(random(), length);
+      indexedField.setStringValue(value);
+      dvField.setBytesValue(new BytesRef(value));
+      writer.addDocument(doc);
+      if (random().nextInt(31) == 0) {
+        writer.commit();
+      }
+    }
+
+    // delete some docs
+    int numDeletions = random().nextInt(numDocs/10);
+    for (int i = 0; i < numDeletions; i++) {
+      int id = random().nextInt(numDocs);
+      writer.deleteDocuments(new Term("id", Integer.toString(id)));
+    }
+    writer.close();
+
+    // compare
+    DirectoryReader ir = DirectoryReader.open(dir);
+    for (AtomicReaderContext context : ir.leaves()) {
+      AtomicReader r = context.reader();
+      SortedDocValues expected = FieldCache.DEFAULT.getTermsIndex(r, "indexed");
+      SortedDocValues actual = r.getSortedDocValues("dv");
+      assertEquals(r.maxDoc(), expected, actual);
+    }
+    ir.close();
+    dir.close();
+  }
+
   public void testSortedFixedLengthVsStoredFields() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
@@ -1358,6 +1409,21 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
     }
   }
 
+  public void testSortedFixedLengthVsFieldCache() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      int fixedLength = _TestUtil.nextInt(random(), 1, 10);
+      doTestSortedVsFieldCache(fixedLength, fixedLength);
+    }
+  }
+
+  public void testSortedVariableLengthVsFieldCache() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedVsFieldCache(1, 10);
+    }
+  }
+
   public void testSortedVariableLengthVsStoredFields() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
@@ -1905,6 +1971,10 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
     }
   }
 
+  private void assertEquals(int maxDoc, SortedDocValues expected, SortedDocValues actual) throws Exception {
+    assertEquals(maxDoc, new SingletonSortedSetDocValues(expected), new SingletonSortedSetDocValues(actual));
+  }
+
   private void assertEquals(int maxDoc, SortedSetDocValues expected, SortedSetDocValues actual) throws Exception {
     // can be null for the segment if no docs actually had any SortedDocValues
     // in this case FC.getDocTermsOrds returns EMPTY
@@ -1932,6 +2002,74 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
       actual.lookupTerm(actualBytes);
       assertEquals(expectedBytes, actualBytes);
     }
+
+    // compare termsenum
+    assertEquals(expected.getValueCount(), expected.termsEnum(), actual.termsEnum());
   }
+
+  private void assertEquals(long numOrds, TermsEnum expected, TermsEnum actual) throws Exception {
+    BytesRef ref;
+
+    // sequential next() through all terms
+    while ((ref = expected.next()) != null) {
+      assertEquals(ref, actual.next());
+      assertEquals(expected.ord(), actual.ord());
+      assertEquals(expected.term(), actual.term());
+    }
+    assertNull(actual.next());
+
+    // sequential seekExact(ord) through all terms
+    for (long i = 0; i < numOrds; i++) {
+      expected.seekExact(i);
+      actual.seekExact(i);
+      assertEquals(expected.ord(), actual.ord());
+      assertEquals(expected.term(), actual.term());
+    }
+
+    // sequential seekExact(BytesRef) through all terms
+    for (long i = 0; i < numOrds; i++) {
+      expected.seekExact(i);
+      assertTrue(actual.seekExact(expected.term()));
+      assertEquals(expected.ord(), actual.ord());
+      assertEquals(expected.term(), actual.term());
+    }
+
+    // sequential seekCeil(BytesRef) through all terms
+    for (long i = 0; i < numOrds; i++) {
+      expected.seekExact(i);
+      assertEquals(SeekStatus.FOUND, actual.seekCeil(expected.term()));
+      assertEquals(expected.ord(), actual.ord());
+      assertEquals(expected.term(), actual.term());
+    }
+
+    // random seekExact(ord)
+    for (long i = 0; i < numOrds; i++) {
+      long randomOrd = _TestUtil.nextLong(random(), 0, numOrds-1);
+      expected.seekExact(randomOrd);
+      actual.seekExact(randomOrd);
+      assertEquals(expected.ord(), actual.ord());
+      assertEquals(expected.term(), actual.term());
+    }
+
+    // random seekExact(BytesRef)
+    for (long i = 0; i < numOrds; i++) {
+      long randomOrd = _TestUtil.nextLong(random(), 0, numOrds-1);
+      expected.seekExact(randomOrd);
+      actual.seekExact(expected.term());
+      assertEquals(expected.ord(), actual.ord());
+      assertEquals(expected.term(), actual.term());
+    }
+
+    // random seekCeil(BytesRef)
+    for (long i = 0; i < numOrds; i++) {
+      BytesRef target = new BytesRef(_TestUtil.randomUnicodeString(random()));
+      SeekStatus expectedStatus = expected.seekCeil(target);
+      assertEquals(expectedStatus, actual.seekCeil(target));
+      if (expectedStatus != SeekStatus.END) {
+        assertEquals(expected.ord(), actual.ord());
+        assertEquals(expected.term(), actual.term());
+      }
+    }
+  }
 
   private void doTestSortedSetVsUninvertedField(int minLength, int maxLength) throws Exception {