mirror of https://github.com/apache/lucene.git

commit b69a50fb12 (parent 156e036022)

add on-disk SortedSet impl

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4765@1446903 13f79535-47bb-0310-9956-ffa450edef68
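In outline, this commit composes the new SORTED_SET doc-values type out of pieces the format already has: the ord -> byte[] dictionary is stored as a binary field, the flat stream of per-document ords as a numeric field, and a per-document index into that stream as a monotonic numeric field. The following sketch of the metadata written for one SORTED_SET field is reconstructed only from the hunks below (vInt/vLong are Lucene's variable-length encodings; the bracketed part applies when minLength != maxLength):

// Metadata layout per SORTED_SET field, as reconstructed from this diff:
//
//   vInt fieldNumber, byte SORTED_SET
//   vInt fieldNumber, byte BINARY
//        vInt minLength, vInt maxLength, vLong count, long startFP
//        [long addressesOffset, ... when lengths vary]
//   vInt fieldNumber, byte NUMERIC    // the ord stream: one entry per (doc, value) pair
//        vInt packedIntsVersion, long offset, vLong count, vInt blockSize
//   vInt fieldNumber, byte NUMERIC    // doc -> end offset into the ord stream
//        vInt packedIntsVersion, long offset, vLong maxDoc, vInt blockSize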
@@ -59,7 +59,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
 
   @Override
   public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
-    int count = 0;
+    long count = 0;
     for (@SuppressWarnings("unused") Number nv : values) {
       ++count;
     }
@@ -68,7 +68,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
     meta.writeByte(DiskDocValuesFormat.NUMERIC);
     meta.writeVInt(PackedInts.VERSION_CURRENT);
     meta.writeLong(data.getFilePointer());
-    meta.writeVInt(count);
+    meta.writeVLong(count);
     meta.writeVInt(BLOCK_SIZE);
 
     final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
@@ -86,7 +86,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
     int minLength = Integer.MAX_VALUE;
     int maxLength = Integer.MIN_VALUE;
     final long startFP = data.getFilePointer();
-    int count = 0;
+    long count = 0;
     for(BytesRef v : values) {
       minLength = Math.min(minLength, v.length);
       maxLength = Math.max(maxLength, v.length);
@@ -95,7 +95,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
     }
     meta.writeVInt(minLength);
     meta.writeVInt(maxLength);
-    meta.writeVInt(count);
+    meta.writeVLong(count);
     meta.writeLong(startFP);
 
     // if minLength == maxLength, its a fixed-length byte[], we are done (the addresses are implicit)
@@ -125,7 +125,28 @@ public class DiskDocValuesConsumer extends DocValuesConsumer {
 
   @Override
   public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrdCount, Iterable<Number> ords) throws IOException {
-    throw new UnsupportedOperationException(); // nocommit
+    meta.writeVInt(field.number);
+    meta.writeByte(DiskDocValuesFormat.SORTED_SET);
+    // write the ord -> byte[] as a binary field
+    addBinaryField(field, values);
+    // write the stream of ords as a numeric field
+    addNumericField(field, ords);
+
+    // write the doc -> ord count as a absolute index to the stream
+    meta.writeVInt(field.number);
+    meta.writeByte(DiskDocValuesFormat.NUMERIC);
+    meta.writeVInt(PackedInts.VERSION_CURRENT);
+    meta.writeLong(data.getFilePointer());
+    meta.writeVLong(maxDoc);
+    meta.writeVInt(BLOCK_SIZE);
+
+    final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
+    long addr = 0;
+    for (Number v : docToOrdCount) {
+      addr += v.longValue();
+      writer.add(addr);
+    }
+    writer.finish();
   }
 
   @Override
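The writer loop above stores the per-document ord counts as a running total, so document d's ords occupy the range [index[d-1], index[d]) in the flat ord stream, and the totals are monotonic, which is what makes MonotonicBlockPackedWriter a good fit. A standalone sketch of the same addressing scheme in plain Java (names and sample counts are illustrative, not from the patch):

// Prefix-sum addressing: per-doc counts become absolute end offsets.
public class OrdIndexSketch {
  public static void main(String[] args) {
    int[] docToOrdCount = {2, 0, 3};   // doc 0 has 2 ords, doc 1 none, doc 2 three
    long[] index = new long[docToOrdCount.length];
    long addr = 0;
    for (int d = 0; d < docToOrdCount.length; d++) {
      addr += docToOrdCount[d];        // same accumulation as the writer loop above
      index[d] = addr;                 // monotonically non-decreasing
    }
    for (int d = 0; d < index.length; d++) {
      long start = d == 0 ? 0 : index[d - 1];
      long end = index[d];             // mirrors setDocument() in the producer below
      System.out.println("doc " + d + ": ords [" + start + ", " + end + ")");
    }
  }
}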
@@ -58,4 +58,5 @@ public final class DiskDocValuesFormat extends DocValuesFormat {
   public static final byte NUMERIC = 0;
   public static final byte BINARY = 1;
   public static final byte SORTED = 2;
+  public static final byte SORTED_SET = 3;
 }
@@ -42,11 +42,13 @@ class DiskDocValuesProducer extends DocValuesProducer {
   private final Map<Integer,NumericEntry> numerics;
   private final Map<Integer,BinaryEntry> binaries;
   private final Map<Integer,NumericEntry> ords;
+  private final Map<Integer,NumericEntry> ordIndexes;
   private final IndexInput data;
 
   // memory-resident structures
   private final Map<Integer,BlockPackedReader> ordinalInstances = new HashMap<Integer,BlockPackedReader>();
   private final Map<Integer,MonotonicBlockPackedReader> addressInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
+  private final Map<Integer,MonotonicBlockPackedReader> ordIndexInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
 
   DiskDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
     String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
@@ -59,6 +61,7 @@ class DiskDocValuesProducer extends DocValuesProducer {
                      DiskDocValuesFormat.VERSION_START);
     numerics = new HashMap<Integer,NumericEntry>();
     ords = new HashMap<Integer,NumericEntry>();
+    ordIndexes = new HashMap<Integer,NumericEntry>();
     binaries = new HashMap<Integer,BinaryEntry>();
     readFields(in, state.fieldInfos);
     success = true;
@@ -105,6 +108,36 @@ class DiskDocValuesProducer extends DocValuesProducer {
       }
       NumericEntry n = readNumericEntry(meta);
       ords.put(fieldNumber, n);
+    } else if (type == DiskDocValuesFormat.SORTED_SET) {
+      // sortedset = binary + numeric + ordIndex
+      if (meta.readVInt() != fieldNumber) {
+        throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+      }
+      if (meta.readByte() != DiskDocValuesFormat.BINARY) {
+        throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+      }
+      BinaryEntry b = readBinaryEntry(meta);
+      binaries.put(fieldNumber, b);
+
+      if (meta.readVInt() != fieldNumber) {
+        throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+      }
+      if (meta.readByte() != DiskDocValuesFormat.NUMERIC) {
+        throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+      }
+      NumericEntry n1 = readNumericEntry(meta);
+      ords.put(fieldNumber, n1);
+
+      if (meta.readVInt() != fieldNumber) {
+        throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+      }
+      if (meta.readByte() != DiskDocValuesFormat.NUMERIC) {
+        throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
+      }
+      NumericEntry n2 = readNumericEntry(meta);
+      ordIndexes.put(fieldNumber, n2);
     } else {
       throw new CorruptIndexException("invalid type: " + type + ", resource=" + meta);
     }
     fieldNumber = meta.readVInt();
   }
@@ -114,7 +147,7 @@ class DiskDocValuesProducer extends DocValuesProducer {
     NumericEntry entry = new NumericEntry();
     entry.packedIntsVersion = meta.readVInt();
     entry.offset = meta.readLong();
-    entry.count = meta.readVInt();
+    entry.count = meta.readVLong();
     entry.blockSize = meta.readVInt();
     return entry;
   }
@@ -123,7 +156,7 @@ class DiskDocValuesProducer extends DocValuesProducer {
     BinaryEntry entry = new BinaryEntry();
     entry.minLength = meta.readVInt();
     entry.maxLength = meta.readVInt();
-    entry.count = meta.readVInt();
+    entry.count = meta.readVLong();
     entry.offset = meta.readLong();
     if (entry.minLength != entry.maxLength) {
       entry.addressesOffset = meta.readLong();
@@ -136,14 +169,18 @@ class DiskDocValuesProducer extends DocValuesProducer {
   @Override
   public NumericDocValues getNumeric(FieldInfo field) throws IOException {
     NumericEntry entry = numerics.get(field.number);
+    return getNumeric(entry);
+  }
+
+  LongNumericDocValues getNumeric(NumericEntry entry) throws IOException {
     final IndexInput data = this.data.clone();
     data.seek(entry.offset);
 
     final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
-    return new NumericDocValues() {
+    return new LongNumericDocValues() {
       @Override
-      public long get(int docID) {
-        return reader.get(docID);
+      public long get(long id) {
+        return reader.get(id);
       }
     };
   }
@@ -161,10 +198,10 @@ class DiskDocValuesProducer extends DocValuesProducer {
   private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) {
     final IndexInput data = this.data.clone();
 
-    return new BinaryDocValues() {
+    return new LongBinaryDocValues() {
       @Override
-      public void get(int docID, BytesRef result) {
-        long address = bytes.offset + docID * (long)bytes.maxLength;
+      public void get(long id, BytesRef result) {
+        long address = bytes.offset + id * bytes.maxLength;
         try {
           data.seek(address);
           // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource)
@@ -195,11 +232,11 @@ class DiskDocValuesProducer extends DocValuesProducer {
       addresses = addrInstance;
     }
 
-    return new BinaryDocValues() {
+    return new LongBinaryDocValues() {
       @Override
-      public void get(int docID, BytesRef result) {
-        long startAddress = bytes.offset + (docID == 0 ? 0 : + addresses.get(docID-1));
-        long endAddress = bytes.offset + addresses.get(docID);
+      public void get(long id, BytesRef result) {
+        long startAddress = bytes.offset + (id == 0 ? 0 : addresses.get(id-1));
+        long endAddress = bytes.offset + addresses.get(id);
         int length = (int) (endAddress - startAddress);
         try {
           data.seek(startAddress);
@@ -219,7 +256,7 @@ class DiskDocValuesProducer extends DocValuesProducer {
 
   @Override
   public SortedDocValues getSorted(FieldInfo field) throws IOException {
-    final int valueCount = binaries.get(field.number).count;
+    final int valueCount = (int) binaries.get(field.number).count;
     final BinaryDocValues binary = getBinary(field);
     final BlockPackedReader ordinals;
     synchronized (ordinalInstances) {
@@ -254,7 +291,54 @@ class DiskDocValuesProducer extends DocValuesProducer {
 
   @Override
   public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
-    throw new UnsupportedOperationException(); // nocommit
+    final long valueCount = binaries.get(field.number).count;
+    // we keep the byte[]s and list of ords on disk, these could be large
+    final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
+    final LongNumericDocValues ordinals = getNumeric(ords.get(field.number));
+    // but the addresses to the ord stream are in RAM
+    final MonotonicBlockPackedReader ordIndex;
+    synchronized (ordIndexInstances) {
+      MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number);
+      if (ordIndexInstance == null) {
+        NumericEntry entry = ordIndexes.get(field.number);
+        IndexInput data = this.data.clone();
+        data.seek(entry.offset);
+        ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
+      }
+      ordIndex = ordIndexInstance;
+    }
+
+    return new SortedSetDocValues() {
+      long offset;
+      long endOffset;
+
+      @Override
+      public long nextOrd() {
+        if (offset == endOffset) {
+          return NO_MORE_ORDS;
+        } else {
+          long ord = ordinals.get(offset);
+          offset++;
+          return ord;
+        }
+      }
+
+      @Override
+      public void setDocument(int docID) {
+        offset = (docID == 0 ? 0 : ordIndex.get(docID-1));
+        endOffset = ordIndex.get(docID);
+      }
+
+      @Override
+      public void lookupOrd(long ord, BytesRef result) {
+        binary.get(ord, result);
+      }
+
+      @Override
+      public long getValueCount() {
+        return valueCount;
+      }
+    };
   }
 
   @Override
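For context, this is how a caller consumes the view returned above under the 4.x-era API (setDocument/nextOrd/lookupOrd). A hedged sketch, assuming an AtomicReader named reader, an in-bounds int docID, and a field named "labels", all illustrative:

SortedSetDocValues dv = reader.getSortedSetDocValues("labels");
BytesRef scratch = new BytesRef();
dv.setDocument(docID);                     // positions [offset, endOffset) for this doc
for (long ord = dv.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = dv.nextOrd()) {
  dv.lookupOrd(ord, scratch);              // resolved through the binary ord -> byte[] part
  System.out.println(scratch.utf8ToString());
}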
@@ -266,18 +350,37 @@ class DiskDocValuesProducer extends DocValuesProducer {
     long offset;
 
     int packedIntsVersion;
-    int count;
+    long count;
     int blockSize;
   }
 
   static class BinaryEntry {
     long offset;
 
-    int count;
+    long count;
     int minLength;
     int maxLength;
     long addressesOffset;
     int packedIntsVersion;
     int blockSize;
   }
+
+  // internally we compose complex dv (sorted/sortedset) from other ones
+  static abstract class LongNumericDocValues extends NumericDocValues {
+    @Override
+    public final long get(int docID) {
+      return get((long) docID);
+    }
+
+    abstract long get(long id);
+  }
+
+  static abstract class LongBinaryDocValues extends BinaryDocValues {
+    @Override
+    public final void get(int docID, BytesRef result) {
+      get((long)docID, result);
+    }
+
+    abstract void get(long id, BytesRef Result);
+  }
 }
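The int -> long widening running through this file, and the Long* adapter classes above, exist because the flat ord stream holds one entry per (document, value) pair, so its length is bounded by maxDoc times values-per-doc rather than by maxDoc alone. A quick arithmetic check with hypothetical numbers:

long maxDoc = 500000000L;                                 // hypothetical large segment
long avgOrdsPerDoc = 10;
long ordStreamLength = maxDoc * avgOrdsPerDoc;            // 5,000,000,000 entries
System.out.println(ordStreamLength > Integer.MAX_VALUE);  // true: an int index would overflow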
@@ -25,8 +25,6 @@ import java.util.TreeSet;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.SortedSetDocValuesField;
@@ -42,6 +40,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
 import org.apache.lucene.util._TestUtil;
 
+import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
@@ -52,23 +51,10 @@ import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
  * Please try to keep src/java/overview.html up-to-date when making changes
  * to this class.
  */
+// nocommit: should only be Lucene40 and Lucene41
+@SuppressCodecs({ "Lucene40", "Lucene41", "SimpleText", "CheapBastard" })
 public class TestDemoDocValue extends LuceneTestCase {
-
-  // nocommit: only Lucene42/Asserting implemented right now
-  private Codec saved;
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
-    saved = Codec.getDefault();
-    Codec.setDefault(_TestUtil.alwaysDocValuesFormat(DocValuesFormat.forName("Asserting")));
-  }
-
-  @Override
-  public void tearDown() throws Exception {
-    Codec.setDefault(saved);
-    super.tearDown();
-  }
 
   public void testOneValue() throws IOException {
     Directory directory = newDirectory();
     RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
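The test body is truncated in this view. A hedged sketch of the round trip such a test exercises, continuing from the iwriter above and using the SortedSetDocValuesField and NO_MORE_ORDS imports added earlier (field name and values illustrative, not from the patch):

Document doc = new Document();
doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
doc.add(new SortedSetDocValuesField("field", new BytesRef("world")));
iwriter.addDocument(doc);
iwriter.close();

DirectoryReader ireader = DirectoryReader.open(directory);
SortedSetDocValues dv = ireader.leaves().get(0).reader().getSortedSetDocValues("field");
dv.setDocument(0);
assertEquals(0, dv.nextOrd());             // ords follow byte order: "hello" < "world"
assertEquals(1, dv.nextOrd());
assertEquals(NO_MORE_ORDS, dv.nextOrd());
ireader.close();
directory.close();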
@@ -110,7 +110,7 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
     NumericEntry entry = new NumericEntry();
     entry.packedIntsVersion = meta.readVInt();
     entry.offset = meta.readLong();
-    entry.count = meta.readVInt();
+    entry.count = meta.readVLong();
     entry.blockSize = meta.readVInt();
     return entry;
   }
@@ -119,7 +119,7 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
     BinaryEntry entry = new BinaryEntry();
     entry.minLength = meta.readVInt();
     entry.maxLength = meta.readVInt();
-    entry.count = meta.readVInt();
+    entry.count = meta.readVLong();
     entry.offset = meta.readLong();
     if (entry.minLength != entry.maxLength) {
       entry.addressesOffset = meta.readLong();
@@ -210,7 +210,7 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
 
   @Override
   public SortedDocValues getSorted(FieldInfo field) throws IOException {
-    final int valueCount = binaries.get(field.number).count;
+    final int valueCount = (int) binaries.get(field.number).count;
     final BinaryDocValues binary = getBinary(field);
     final NumericDocValues ordinals = getNumeric(field, ords.get(field.number));
     return new SortedDocValues() {
@@ -246,14 +246,14 @@ class CheapBastardDocValuesProducer extends DocValuesProducer {
     long offset;
 
     int packedIntsVersion;
-    int count;
+    long count;
     int blockSize;
   }
 
   static class BinaryEntry {
     long offset;
 
-    int count;
+    long count;
     int minLength;
     int maxLength;
     long addressesOffset;
@@ -154,8 +154,12 @@ public class RandomCodec extends Lucene42Codec {
     Collections.shuffle(dvFormats, random);
 
     // Avoid too many open files:
-    formats = formats.subList(0, 4);
-    dvFormats = dvFormats.subList(0, 4);
+    if (formats.size() > 4) {
+      formats = formats.subList(0, 4);
+    }
+    if (dvFormats.size() > 4) {
+      dvFormats = dvFormats.subList(0, 4);
+    }
   }
 
   public RandomCodec(Random random) {
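The guard added above matters because List.subList throws IndexOutOfBoundsException when toIndex exceeds the list size, and the shuffled format lists are not guaranteed to hold four entries. A minimal illustration (assumes java.util.ArrayList, Arrays, and List are imported; names illustrative):

List<String> formats = new ArrayList<String>(Arrays.asList("a", "b", "c"));
// formats.subList(0, 4) would throw IndexOutOfBoundsException: toIndex = 4
if (formats.size() > 4) {
  formats = formats.subList(0, 4);      // only trim when there is something to trim
}
System.out.println(formats);            // [a, b, c]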