4.0 sortedbytes

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1438072 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-01-24 17:01:10 +00:00
parent e9ea070f5b
commit 7e97250ac8
9 changed files with 246 additions and 118 deletions

View File

@ -87,7 +87,7 @@ public class Lucene40Codec extends Codec {
return infosFormat;
}
private final DocValuesFormat defaultDVFormat = new Lucene40LyingDocValuesFormat();
private final DocValuesFormat defaultDVFormat = new Lucene40DocValuesFormat();
@Override
public DocValuesFormat docValuesFormat() {

View File

@ -51,7 +51,6 @@ public class Lucene40DocValuesFormat extends DocValuesFormat {
static final String VAR_INTS_CODEC_NAME = "PackedInts";
static final int VAR_INTS_VERSION_START = 0;
static final int VAR_INTS_VERSION_CURRENT = VAR_INTS_VERSION_START;
static final byte VAR_INTS_PACKED = 0x00;
static final byte VAR_INTS_FIXED_64 = 0x01;
@ -75,4 +74,17 @@ public class Lucene40DocValuesFormat extends DocValuesFormat {
static final String BYTES_VAR_STRAIGHT_CODEC_NAME_DAT = "VarStraightBytesDat";
static final int BYTES_VAR_STRAIGHT_VERSION_START = 0;
static final int BYTES_VAR_STRAIGHT_VERSION_CURRENT = BYTES_VAR_STRAIGHT_VERSION_START;
// constants for BYTES_FIXED_SORTED
static final String BYTES_FIXED_SORTED_CODEC_NAME_IDX = "FixedSortedBytesIdx";
static final String BYTES_FIXED_SORTED_CODEC_NAME_DAT = "FixedSortedBytesDat";
static final int BYTES_FIXED_SORTED_VERSION_START = 0;
static final int BYTES_FIXED_SORTED_VERSION_CURRENT = BYTES_FIXED_SORTED_VERSION_START;
// constants for BYTES_VAR_SORTED
// NOTE THIS IS NOT A BUG! 4.0 actually screwed this up (VAR_SORTED and VAR_DEREF have same codec header)
static final String BYTES_VAR_SORTED_CODEC_NAME_IDX = "VarDerefBytesIdx";
static final String BYTES_VAR_SORTED_CODEC_NAME_DAT = "VarDerefBytesDat";
static final int BYTES_VAR_SORTED_VERSION_START = 0;
static final int BYTES_VAR_SORTED_VERSION_CURRENT = BYTES_VAR_SORTED_VERSION_START;
}

View File

@ -276,7 +276,7 @@ class Lucene40DocValuesReader extends DocValuesProducer {
instance = loadBytesVarStraight(field);
break;
default:
throw new AssertionError();
throw new AssertionError(); // nocommit
}
binaryInstances.put(field.number, instance);
}
@ -355,7 +355,113 @@ class Lucene40DocValuesReader extends DocValuesProducer {
@Override
public synchronized SortedDocValues getSorted(FieldInfo field) throws IOException {
throw new AssertionError();
SortedDocValues instance = sortedInstances.get(field.number);
if (instance == null) {
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, Integer.toString(field.number), "dat");
String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, Integer.toString(field.number), "idx");
IndexInput data = null;
IndexInput index = null;
boolean success = false;
try {
data = dir.openInput(dataName, state.context);
index = dir.openInput(indexName, state.context);
switch(LegacyDocValuesType.valueOf(field.getAttribute(legacyKey))) {
case BYTES_FIXED_SORTED:
instance = loadBytesFixedSorted(field, data, index);
break;
case BYTES_VAR_SORTED:
instance = loadBytesVarSorted(field, data, index);
break;
default:
throw new AssertionError();
}
success = true;
} finally {
if (success) {
IOUtils.close(data, index);
} else {
IOUtils.closeWhileHandlingException(data, index);
}
}
sortedInstances.put(field.number, instance);
}
return instance;
}
private SortedDocValues loadBytesFixedSorted(FieldInfo field, IndexInput data, IndexInput index) throws IOException {
CodecUtil.checkHeader(data, Lucene40DocValuesFormat.BYTES_FIXED_SORTED_CODEC_NAME_DAT,
Lucene40DocValuesFormat.BYTES_FIXED_SORTED_VERSION_START,
Lucene40DocValuesFormat.BYTES_FIXED_SORTED_VERSION_CURRENT);
CodecUtil.checkHeader(index, Lucene40DocValuesFormat.BYTES_FIXED_SORTED_CODEC_NAME_IDX,
Lucene40DocValuesFormat.BYTES_FIXED_SORTED_VERSION_START,
Lucene40DocValuesFormat.BYTES_FIXED_SORTED_VERSION_CURRENT);
final int fixedLength = data.readInt();
final int valueCount = index.readInt();
// nocommit? can the current impl even handle > 2G?
final byte[] bytes = new byte[fixedLength*valueCount];
data.readBytes(bytes, 0, bytes.length);
final PackedInts.Reader reader = PackedInts.getReader(index);
return new SortedDocValues() {
@Override
public int getOrd(int docID) {
return (int) reader.get(docID);
}
@Override
public void lookupOrd(int ord, BytesRef result) {
result.bytes = bytes;
result.offset = ord * fixedLength;
result.length = fixedLength;
}
@Override
public int getValueCount() {
return valueCount;
}
};
}
private SortedDocValues loadBytesVarSorted(FieldInfo field, IndexInput data, IndexInput index) throws IOException {
CodecUtil.checkHeader(data, Lucene40DocValuesFormat.BYTES_VAR_SORTED_CODEC_NAME_DAT,
Lucene40DocValuesFormat.BYTES_VAR_SORTED_VERSION_START,
Lucene40DocValuesFormat.BYTES_VAR_SORTED_VERSION_CURRENT);
CodecUtil.checkHeader(index, Lucene40DocValuesFormat.BYTES_VAR_SORTED_CODEC_NAME_IDX,
Lucene40DocValuesFormat.BYTES_VAR_SORTED_VERSION_START,
Lucene40DocValuesFormat.BYTES_VAR_SORTED_VERSION_CURRENT);
long maxAddress = index.readLong();
// nocommit? can the current impl even handle > 2G?
final byte[] bytes = new byte[(int)maxAddress];
data.readBytes(bytes, 0, bytes.length);
final PackedInts.Reader addressReader = PackedInts.getReader(index);
final PackedInts.Reader ordsReader = PackedInts.getReader(index);
final int valueCount = addressReader.size() - 1;
return new SortedDocValues() {
@Override
public int getOrd(int docID) {
return (int)ordsReader.get(docID);
}
@Override
public void lookupOrd(int ord, BytesRef result) {
long startAddress = addressReader.get(ord);
long endAddress = addressReader.get(ord+1);
result.bytes = bytes;
result.offset = (int)startAddress;
result.length = (int)(endAddress - startAddress);
}
@Override
public int getValueCount() {
return valueCount;
}
};
}
@Override

View File

@ -1,41 +0,0 @@
package org.apache.lucene.codecs.lucene40;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
// nocommit: still a lie, but allows javadocs @links to work
// nocommit: make read-only and move to impersonator
public class Lucene40LyingDocValuesFormat extends Lucene42DocValuesFormat {
@Override
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
throw new UnsupportedOperationException("this codec can only be used for reading");
}
@Override
public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
return super.fieldsProducer(state);
}
}

View File

@ -32,7 +32,7 @@ import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.codecs.lucene40.Lucene40LyingDocValuesFormat;
import org.apache.lucene.codecs.lucene40.Lucene40DocValuesFormat;
import org.apache.lucene.codecs.lucene40.Lucene40FieldInfosFormat;
import org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40NormsFormat;
@ -125,7 +125,7 @@ public class Lucene41Codec extends Codec {
}
private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41");
private final DocValuesFormat dvFormat = new Lucene40LyingDocValuesFormat();
private final DocValuesFormat dvFormat = new Lucene40DocValuesFormat();
private final NormsFormat normsFormat = new Lucene40NormsFormat();
@Override

View File

@ -219,12 +219,16 @@ class Lucene40DocValuesWriter extends DocValuesConsumer {
Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_CODEC_NAME_IDX,
Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_VERSION_CURRENT);
/* values */
final long startPos = data.getFilePointer();
for (BytesRef v : values) {
data.writeBytes(v.bytes, v.offset, v.length);
}
/* addresses */
final long maxAddress = data.getFilePointer() - startPos;
index.writeVLong(maxAddress);
@ -245,8 +249,121 @@ class Lucene40DocValuesWriter extends DocValuesConsumer {
@Override
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
assert false;
}
// examine the values to determine best type to use
int minLength = Integer.MAX_VALUE;
int maxLength = Integer.MIN_VALUE;
for (BytesRef b : values) {
minLength = Math.min(minLength, b.length);
maxLength = Math.max(maxLength, b.length);
}
boolean success = false;
IndexOutput data = null;
IndexOutput index = null;
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, Integer.toString(field.number), "dat");
String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, Integer.toString(field.number), "idx");
try {
data = dir.createOutput(dataName, state.context);
index = dir.createOutput(indexName, state.context);
if (minLength == maxLength) {
// fixed byte[]
addFixedSortedBytesField(field, data, index, values, docToOrd, minLength);
} else {
// var byte[]
addVarSortedBytesField(field, data, index, values, docToOrd);
}
success = true;
} finally {
if (success) {
IOUtils.close(data, index);
} else {
IOUtils.closeWhileHandlingException(data, index);
}
}
}
private void addFixedSortedBytesField(FieldInfo field, IndexOutput data, IndexOutput index, Iterable<BytesRef> values, Iterable<Number> docToOrd, int length) throws IOException {
field.putAttribute(legacyKey, LegacyDocValuesType.BYTES_FIXED_SORTED.name());
CodecUtil.writeHeader(data,
Lucene40DocValuesFormat.BYTES_FIXED_SORTED_CODEC_NAME_DAT,
Lucene40DocValuesFormat.BYTES_FIXED_SORTED_VERSION_CURRENT);
CodecUtil.writeHeader(index,
Lucene40DocValuesFormat.BYTES_FIXED_SORTED_CODEC_NAME_IDX,
Lucene40DocValuesFormat.BYTES_FIXED_SORTED_VERSION_CURRENT);
/* values */
data.writeInt(length);
int valueCount = 0;
for (BytesRef v : values) {
data.writeBytes(v.bytes, v.offset, v.length);
valueCount++;
}
/* ordinals */
index.writeInt(valueCount);
int maxDoc = state.segmentInfo.getDocCount();
assert valueCount > 0;
final PackedInts.Writer w = PackedInts.getWriter(index, maxDoc, PackedInts.bitsRequired(valueCount-1), PackedInts.DEFAULT);
for (Number n : docToOrd) {
w.add(n.longValue());
}
w.finish();
}
private void addVarSortedBytesField(FieldInfo field, IndexOutput data, IndexOutput index, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
field.putAttribute(legacyKey, LegacyDocValuesType.BYTES_VAR_SORTED.name());
CodecUtil.writeHeader(data,
Lucene40DocValuesFormat.BYTES_VAR_SORTED_CODEC_NAME_DAT,
Lucene40DocValuesFormat.BYTES_VAR_SORTED_VERSION_CURRENT);
CodecUtil.writeHeader(index,
Lucene40DocValuesFormat.BYTES_VAR_SORTED_CODEC_NAME_IDX,
Lucene40DocValuesFormat.BYTES_VAR_SORTED_VERSION_CURRENT);
/* values */
final long startPos = data.getFilePointer();
int valueCount = 0;
for (BytesRef v : values) {
data.writeBytes(v.bytes, v.offset, v.length);
valueCount++;
}
/* addresses */
final long maxAddress = data.getFilePointer() - startPos;
index.writeLong(maxAddress);
assert valueCount != Integer.MAX_VALUE; // unsupported by the 4.0 impl
final PackedInts.Writer w = PackedInts.getWriter(index, valueCount+1, PackedInts.bitsRequired(maxAddress), PackedInts.DEFAULT);
long currentPosition = 0;
for (BytesRef v : values) {
w.add(currentPosition);
currentPosition += v.length;
}
// write sentinel
assert currentPosition == maxAddress;
w.add(currentPosition);
w.finish();
/* ordinals */
final int maxDoc = state.segmentInfo.getDocCount();
assert valueCount > 0;
final PackedInts.Writer ords = PackedInts.getWriter(index, maxDoc, PackedInts.bitsRequired(valueCount-1), PackedInts.DEFAULT);
for (Number n : docToOrd) {
ords.add(n.longValue());
}
ords.finish();
}
@Override
public void close() throws IOException {

View File

@ -1,65 +0,0 @@
package org.apache.lucene.codecs.lucene40;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.lucene40.Lucene40FieldInfosReader.LegacyDocValuesType;
import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.BytesRef;
public class Lucene40LyingRWDocValuesFormat extends Lucene40LyingDocValuesFormat {
private final DocValuesFormat lie = new Lucene42DocValuesFormat();
// nocommit: a lie
@Override
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
final DocValuesConsumer delegate = lie.fieldsConsumer(state);
return new DocValuesConsumer() {
@Override
public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
// hack: here we would examine the numerics and simulate in the impersonator the best we can
// e.g. if they are all in byte/int range write fixed, otherwise write packed or whatever
field.putAttribute(Lucene40FieldInfosReader.LEGACY_DV_TYPE_KEY, LegacyDocValuesType.VAR_INTS.name());
delegate.addNumericField(field, values);
}
@Override
public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
field.putAttribute(Lucene40FieldInfosReader.LEGACY_DV_TYPE_KEY, LegacyDocValuesType.BYTES_VAR_STRAIGHT.name());
delegate.addBinaryField(field, values);
}
@Override
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
field.putAttribute(Lucene40FieldInfosReader.LEGACY_DV_TYPE_KEY, LegacyDocValuesType.BYTES_VAR_SORTED.name());
delegate.addSortedField(field, values, docToOrd);
}
@Override
public void close() throws IOException {
delegate.close();
}
};
}
}

View File

@ -33,8 +33,7 @@ public final class Lucene40RWCodec extends Lucene40Codec {
}
};
//private final DocValuesFormat docValues = new Lucene40RWDocValuesFormat();
private final DocValuesFormat docValues = new Lucene40LyingRWDocValuesFormat();
private final DocValuesFormat docValues = new Lucene40RWDocValuesFormat();
private final NormsFormat norms = new Lucene40RWNormsFormat();
@Override

View File

@ -9,7 +9,7 @@ import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40FieldInfosFormat;
import org.apache.lucene.codecs.lucene40.Lucene40FieldInfosWriter;
import org.apache.lucene.codecs.lucene40.Lucene40LyingRWDocValuesFormat;
import org.apache.lucene.codecs.lucene40.Lucene40RWDocValuesFormat;
import org.apache.lucene.codecs.lucene40.Lucene40RWNormsFormat;
/*
@ -41,7 +41,7 @@ public class Lucene41RWCodec extends Lucene41Codec {
}
};
private final DocValuesFormat docValues = new Lucene40LyingRWDocValuesFormat();
private final DocValuesFormat docValues = new Lucene40RWDocValuesFormat();
private final NormsFormat norms = new Lucene40RWNormsFormat();
@Override