4.0 VAR_DEREF

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1438119 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-01-24 18:31:28 +00:00
parent b8c2238a46
commit 48719dcfba
2 changed files with 98 additions and 4 deletions

View File

@ -401,7 +401,50 @@ class Lucene40DocValuesReader extends DocValuesProducer {
} }
private BinaryDocValues loadBytesVarDeref(FieldInfo field) throws IOException { private BinaryDocValues loadBytesVarDeref(FieldInfo field) throws IOException {
throw new AssertionError(); // nocommit String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, Integer.toString(field.number), "dat");
String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, Integer.toString(field.number), "idx");
IndexInput data = null;
IndexInput index = null;
boolean success = false;
try {
data = dir.openInput(dataName, state.context);
CodecUtil.checkHeader(data, Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_DAT,
Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_START,
Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);
index = dir.openInput(indexName, state.context);
CodecUtil.checkHeader(index, Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_IDX,
Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_START,
Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);
final long totalBytes = index.readLong();
// nocommit? can the current impl even handle > 2G?
final byte bytes[] = new byte[(int)totalBytes];
data.readBytes(bytes, 0, bytes.length);
final PackedInts.Reader reader = PackedInts.getReader(index);
success = true;
return new BinaryDocValues() {
@Override
public void get(int docID, BytesRef result) {
int startAddress = (int)reader.get(docID);
result.bytes = bytes;
result.offset = startAddress;
if ((bytes[startAddress] & 128) == 0) {
// length is 1 byte
result.offset++;
result.length = bytes[startAddress];
} else {
result.offset += 2;
result.length = ((bytes[startAddress] & 0x7f) << 8) | ((bytes[startAddress+1] & 0xff));
}
}
};
} finally {
if (success) {
IOUtils.close(data, index);
} else {
IOUtils.closeWhileHandlingException(data, index);
}
}
} }
@Override @Override

View File

@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene40;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.TreeSet; import java.util.TreeSet;
@ -168,8 +169,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer {
int maxDoc = state.segmentInfo.getDocCount(); int maxDoc = state.segmentInfo.getDocCount();
final boolean fixed = minLength == maxLength; final boolean fixed = minLength == maxLength;
// nocommit final boolean dedup = uniqueValues != null && uniqueValues.size() * 2 < maxDoc;
final boolean dedup = fixed && (uniqueValues != null && uniqueValues.size() * 2 < maxDoc);
if (dedup) { if (dedup) {
// we will deduplicate and deref values // we will deduplicate and deref values
@ -184,7 +184,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer {
if (fixed) { if (fixed) {
addFixedDerefBytesField(field, data, index, values, minLength); addFixedDerefBytesField(field, data, index, values, minLength);
} else { } else {
assert false; // nocommit addVarDerefBytesField(field, data, index, values);
} }
success = true; success = true;
} finally { } finally {
@ -324,6 +324,57 @@ class Lucene40DocValuesWriter extends DocValuesConsumer {
w.finish(); w.finish();
} }
private void addVarDerefBytesField(FieldInfo field, IndexOutput data, IndexOutput index, Iterable<BytesRef> values) throws IOException {
field.putAttribute(legacyKey, LegacyDocValuesType.BYTES_VAR_DEREF.name());
CodecUtil.writeHeader(data,
Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_DAT,
Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);
CodecUtil.writeHeader(index,
Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_IDX,
Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);
// deduplicate
TreeSet<BytesRef> dictionary = new TreeSet<BytesRef>();
for (BytesRef v : values) {
dictionary.add(BytesRef.deepCopyOf(v));
}
/* values */
long startPosition = data.getFilePointer();
long currentAddress = 0;
HashMap<BytesRef,Long> valueToAddress = new HashMap<BytesRef,Long>();
for (BytesRef v : dictionary) {
currentAddress = data.getFilePointer() - startPosition;
valueToAddress.put(v, currentAddress);
writeVShort(data, v.length);
data.writeBytes(v.bytes, v.offset, v.length);
}
/* ordinals */
long totalBytes = data.getFilePointer() - startPosition;
index.writeLong(totalBytes);
final int maxDoc = state.segmentInfo.getDocCount();
final PackedInts.Writer w = PackedInts.getWriter(index, maxDoc, PackedInts.bitsRequired(currentAddress), PackedInts.DEFAULT);
for (BytesRef v : values) {
w.add(valueToAddress.get(v));
}
w.finish();
}
// the little vint encoding used for var-deref
private static void writeVShort(IndexOutput o, int i) throws IOException {
assert i >= 0 && i <= Short.MAX_VALUE;
if (i < 128) {
o.writeByte((byte)i);
} else {
o.writeByte((byte) (0x80 | (i >> 8)));
o.writeByte((byte) (i & 0xff));
}
}
@Override @Override
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException { public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
// examine the values to determine best type to use // examine the values to determine best type to use