mirror of https://github.com/apache/lucene.git
4.0 VAR_DEREF
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1438119 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b8c2238a46
commit
48719dcfba
|
@ -401,7 +401,50 @@ class Lucene40DocValuesReader extends DocValuesProducer {
|
|||
}
|
||||
|
||||
private BinaryDocValues loadBytesVarDeref(FieldInfo field) throws IOException {
|
||||
throw new AssertionError(); // nocommit
|
||||
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, Integer.toString(field.number), "dat");
|
||||
String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, Integer.toString(field.number), "idx");
|
||||
IndexInput data = null;
|
||||
IndexInput index = null;
|
||||
boolean success = false;
|
||||
try {
|
||||
data = dir.openInput(dataName, state.context);
|
||||
CodecUtil.checkHeader(data, Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_DAT,
|
||||
Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_START,
|
||||
Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);
|
||||
index = dir.openInput(indexName, state.context);
|
||||
CodecUtil.checkHeader(index, Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_IDX,
|
||||
Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_START,
|
||||
Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);
|
||||
|
||||
final long totalBytes = index.readLong();
|
||||
// nocommit? can the current impl even handle > 2G?
|
||||
final byte bytes[] = new byte[(int)totalBytes];
|
||||
data.readBytes(bytes, 0, bytes.length);
|
||||
final PackedInts.Reader reader = PackedInts.getReader(index);
|
||||
success = true;
|
||||
return new BinaryDocValues() {
|
||||
@Override
|
||||
public void get(int docID, BytesRef result) {
|
||||
int startAddress = (int)reader.get(docID);
|
||||
result.bytes = bytes;
|
||||
result.offset = startAddress;
|
||||
if ((bytes[startAddress] & 128) == 0) {
|
||||
// length is 1 byte
|
||||
result.offset++;
|
||||
result.length = bytes[startAddress];
|
||||
} else {
|
||||
result.offset += 2;
|
||||
result.length = ((bytes[startAddress] & 0x7f) << 8) | ((bytes[startAddress+1] & 0xff));
|
||||
}
|
||||
}
|
||||
};
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(data, index);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(data, index);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene40;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.TreeSet;
|
||||
|
||||
|
@ -168,8 +169,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer {
|
|||
|
||||
int maxDoc = state.segmentInfo.getDocCount();
|
||||
final boolean fixed = minLength == maxLength;
|
||||
// nocommit
|
||||
final boolean dedup = fixed && (uniqueValues != null && uniqueValues.size() * 2 < maxDoc);
|
||||
final boolean dedup = uniqueValues != null && uniqueValues.size() * 2 < maxDoc;
|
||||
|
||||
if (dedup) {
|
||||
// we will deduplicate and deref values
|
||||
|
@ -184,7 +184,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer {
|
|||
if (fixed) {
|
||||
addFixedDerefBytesField(field, data, index, values, minLength);
|
||||
} else {
|
||||
assert false; // nocommit
|
||||
addVarDerefBytesField(field, data, index, values);
|
||||
}
|
||||
success = true;
|
||||
} finally {
|
||||
|
@ -323,6 +323,57 @@ class Lucene40DocValuesWriter extends DocValuesConsumer {
|
|||
}
|
||||
w.finish();
|
||||
}
|
||||
|
||||
private void addVarDerefBytesField(FieldInfo field, IndexOutput data, IndexOutput index, Iterable<BytesRef> values) throws IOException {
|
||||
field.putAttribute(legacyKey, LegacyDocValuesType.BYTES_VAR_DEREF.name());
|
||||
|
||||
CodecUtil.writeHeader(data,
|
||||
Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_DAT,
|
||||
Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);
|
||||
|
||||
CodecUtil.writeHeader(index,
|
||||
Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_IDX,
|
||||
Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);
|
||||
|
||||
// deduplicate
|
||||
TreeSet<BytesRef> dictionary = new TreeSet<BytesRef>();
|
||||
for (BytesRef v : values) {
|
||||
dictionary.add(BytesRef.deepCopyOf(v));
|
||||
}
|
||||
|
||||
/* values */
|
||||
long startPosition = data.getFilePointer();
|
||||
long currentAddress = 0;
|
||||
HashMap<BytesRef,Long> valueToAddress = new HashMap<BytesRef,Long>();
|
||||
for (BytesRef v : dictionary) {
|
||||
currentAddress = data.getFilePointer() - startPosition;
|
||||
valueToAddress.put(v, currentAddress);
|
||||
writeVShort(data, v.length);
|
||||
data.writeBytes(v.bytes, v.offset, v.length);
|
||||
}
|
||||
|
||||
/* ordinals */
|
||||
long totalBytes = data.getFilePointer() - startPosition;
|
||||
index.writeLong(totalBytes);
|
||||
final int maxDoc = state.segmentInfo.getDocCount();
|
||||
final PackedInts.Writer w = PackedInts.getWriter(index, maxDoc, PackedInts.bitsRequired(currentAddress), PackedInts.DEFAULT);
|
||||
|
||||
for (BytesRef v : values) {
|
||||
w.add(valueToAddress.get(v));
|
||||
}
|
||||
w.finish();
|
||||
}
|
||||
|
||||
// the little vint encoding used for var-deref
|
||||
private static void writeVShort(IndexOutput o, int i) throws IOException {
|
||||
assert i >= 0 && i <= Short.MAX_VALUE;
|
||||
if (i < 128) {
|
||||
o.writeByte((byte)i);
|
||||
} else {
|
||||
o.writeByte((byte) (0x80 | (i >> 8)));
|
||||
o.writeByte((byte) (i & 0xff));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
|
||||
|
|
Loading…
Reference in New Issue