LUCENE-3217: Improved Fixed Int DocValues variants merging

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1140223 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2011-06-27 16:06:25 +00:00
parent 1eaa40e7b4
commit 1f8995a540
6 changed files with 235 additions and 88 deletions

View File

@ -126,7 +126,7 @@ public class DefaultDocValuesProducer extends PerDocValues {
case FIXED_INTS_64:
case FIXED_INTS_8:
case VAR_INTS:
return Ints.getValues(dir, id);
return Ints.getValues(dir, id, docCount);
case FLOAT_32:
return Floats.getValues(dir, id, docCount);
case FLOAT_64:

View File

@ -66,6 +66,10 @@ abstract class IndexDocValuesArray extends Source {
maxDocID = -1;
size = 0;
}
protected abstract void writeDirect(IndexOutput out, long value) throws IOException;
protected abstract void writeDefaults(IndexOutput out, int num) throws IOException;
protected abstract void setInternal(int docId, long value);
@ -98,7 +102,7 @@ abstract class IndexDocValuesArray extends Source {
};
}
abstract ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input)
abstract ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input, int maxDoc)
throws IOException;
@Override
@ -114,9 +118,8 @@ abstract class IndexDocValuesArray extends Source {
values = new byte[0];
}
ByteValues(IndexInput input) throws IOException {
ByteValues(IndexInput input, int numDocs) throws IOException {
super(new AtomicLong(), 1, ValueType.FIXED_INTS_8);
final int numDocs = input.readInt();
values = new byte[numDocs];
adjustSize(numDocs);
input.readBytes(values, 0, values.length, false);
@ -148,18 +151,14 @@ abstract class IndexDocValuesArray extends Source {
@Override
void write(IndexOutput output, int numDocs) throws IOException {
assert maxDocID + 1 <= numDocs;
output.writeInt(numDocs);
output.writeBytes(values, 0, maxDocID + 1);
final byte zero = 0;
for (int i = maxDocID + 1; i < numDocs; i++) {
output.writeByte(zero);
}
writeDefaults(output, numDocs - (maxDocID+1));
}
@Override
ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input)
ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input, int maxDoc)
throws IOException {
return new FixedIntsEnumImpl(attrSource, input, type()) {
return new FixedIntsEnumImpl(attrSource, input, type(), maxDoc) {
@Override
protected void fillNext(LongsRef ref, IndexInput dataIn)
throws IOException {
@ -173,6 +172,19 @@ abstract class IndexDocValuesArray extends Source {
super.clear();
values = new byte[0];
}
@Override
protected void writeDefaults(IndexOutput out, int num) throws IOException {
final byte zero = 0;
for (int i = 0; i < num; i++) {
out.writeByte(zero);
}
}
@Override
protected void writeDirect(IndexOutput out, long value) throws IOException {
out.writeByte((byte) (0xFFL & value));
}
};
final static class ShortValues extends IndexDocValuesArray {
@ -184,10 +196,9 @@ abstract class IndexDocValuesArray extends Source {
values = new short[0];
}
ShortValues(IndexInput input) throws IOException {
ShortValues(IndexInput input, int numDocs) throws IOException {
super(new AtomicLong(), RamUsageEstimator.NUM_BYTES_SHORT,
ValueType.FIXED_INTS_16);
final int numDocs = input.readInt();
values = new short[numDocs];
adjustSize(numDocs);
for (int i = 0; i < values.length; i++) {
@ -209,7 +220,7 @@ abstract class IndexDocValuesArray extends Source {
@Override
protected void setInternal(int docId, long value) {
values[docId] = (short) (0xFFFF & value);
values[docId] = (short) (0xFFFFL & value);
}
@Override
@ -221,20 +232,16 @@ abstract class IndexDocValuesArray extends Source {
@Override
void write(IndexOutput output, int numDocs) throws IOException {
assert maxDocID + 1 <= numDocs;
output.writeInt(numDocs);
for (int i = 0; i < maxDocID + 1; i++) {
output.writeShort(values[i]);
}
final short zero = 0;
for (int i = maxDocID + 1; i < numDocs; i++) {
output.writeShort(zero);
}
writeDefaults(output, numDocs - (maxDocID+1));
}
@Override
ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input)
ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input, int maxDoc)
throws IOException {
return new FixedIntsEnumImpl(attrSource, input, type()) {
return new FixedIntsEnumImpl(attrSource, input, type(), maxDoc) {
@Override
protected void fillNext(LongsRef ref, IndexInput dataIn)
throws IOException {
@ -249,6 +256,19 @@ abstract class IndexDocValuesArray extends Source {
values = new short[0];
}
@Override
protected void writeDefaults(IndexOutput out, int num) throws IOException {
final short zero = 0;
for (int i = 0; i < num; i++) {
out.writeShort(zero);
}
}
@Override
protected void writeDirect(IndexOutput out, long value) throws IOException {
out.writeShort((short) (0xFFFFL & value));
}
};
final static class IntValues extends IndexDocValuesArray {
@ -259,10 +279,9 @@ abstract class IndexDocValuesArray extends Source {
values = new int[0];
}
IntValues(IndexInput input) throws IOException {
IntValues(IndexInput input, int numDocs) throws IOException {
super(new AtomicLong(), RamUsageEstimator.NUM_BYTES_INT,
ValueType.FIXED_INTS_32);
final int numDocs = input.readInt();
values = new int[numDocs];
adjustSize(numDocs);
for (int i = 0; i < values.length; i++) {
@ -296,19 +315,16 @@ abstract class IndexDocValuesArray extends Source {
@Override
void write(IndexOutput output, int numDocs) throws IOException {
assert maxDocID + 1 <= numDocs;
output.writeInt(numDocs);
for (int i = 0; i < maxDocID + 1; i++) {
output.writeInt(values[i]);
}
for (int i = maxDocID + 1; i < numDocs; i++) {
output.writeInt(0);
}
writeDefaults(output, numDocs - (maxDocID+1));
}
@Override
ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input)
ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input, int maxDoc)
throws IOException {
return new FixedIntsEnumImpl(attrSource, input, type()) {
return new FixedIntsEnumImpl(attrSource, input, type(), maxDoc) {
@Override
protected void fillNext(LongsRef ref, IndexInput dataIn)
throws IOException {
@ -322,6 +338,19 @@ abstract class IndexDocValuesArray extends Source {
super.clear();
values = new int[0];
}
@Override
protected void writeDefaults(IndexOutput out, int num) throws IOException {
for (int i = 0; i < num; i++) {
out.writeInt(0);
}
}
@Override
protected void writeDirect(IndexOutput out, long value) throws IOException {
out.writeInt((int) (0xFFFFFFFFL & value));
}
};
final static class LongValues extends IndexDocValuesArray {
@ -333,10 +362,9 @@ abstract class IndexDocValuesArray extends Source {
values = new long[0];
}
LongValues(IndexInput input) throws IOException {
LongValues(IndexInput input, int numDocs) throws IOException {
super(new AtomicLong(), RamUsageEstimator.NUM_BYTES_LONG,
ValueType.FIXED_INTS_64);
final int numDocs = input.readInt();
values = new long[numDocs];
adjustSize(numDocs);
for (int i = 0; i < values.length; i++) {
@ -370,20 +398,17 @@ abstract class IndexDocValuesArray extends Source {
@Override
void write(IndexOutput output, int numDocs) throws IOException {
assert maxDocID + 1 <= numDocs;
output.writeInt(numDocs);
for (int i = 0; i < maxDocID + 1; i++) {
output.writeLong(values[i]);
}
for (int i = maxDocID + 1; i < numDocs; i++) {
output.writeLong(0l);
}
writeDefaults(output, numDocs - (maxDocID+1));
}
@Override
ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input)
ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input, int maxDoc)
throws IOException {
return new FixedIntsEnumImpl(attrSource, input, type()) {
return new FixedIntsEnumImpl(attrSource, input, type(), maxDoc) {
@Override
protected void fillNext(LongsRef ref, IndexInput dataIn)
throws IOException {
@ -397,6 +422,17 @@ abstract class IndexDocValuesArray extends Source {
super.clear();
values = new long[0];
}
@Override
protected void writeDefaults(IndexOutput out, int num) throws IOException {
for (int i = 0; i < num; i++) {
out.writeLong(0l);
}
}
@Override
protected void writeDirect(IndexOutput out, long value) throws IOException {
out.writeLong(value);
}
};
private abstract static class FixedIntsEnumImpl extends ValuesEnum {
@ -406,7 +442,7 @@ abstract class IndexDocValuesArray extends Source {
private int pos = -1;
private FixedIntsEnumImpl(AttributeSource source, IndexInput dataIn,
ValueType type) throws IOException {
ValueType type, int maxDoc) throws IOException {
super(source, type);
switch (type) {
case FIXED_INTS_16:
@ -427,7 +463,7 @@ abstract class IndexDocValuesArray extends Source {
}
intsRef.offset = 0;
this.dataIn = dataIn;
maxDoc = dataIn.readInt();
this.maxDoc = maxDoc;
}

View File

@ -37,7 +37,7 @@ public class Ints {
return new IntsWriter(dir, id, bytesUsed, type);
}
public static IndexDocValues getValues(Directory dir, String id) throws IOException {
return new IntsReader(dir, id);
public static IndexDocValues getValues(Directory dir, String id, int numDocs) throws IOException {
return new IntsReader(dir, id, numDocs);
}
}

View File

@ -56,7 +56,6 @@ class IntsImpl {
static class IntsWriter extends Writer {
// TODO: optimize merging here!!
private LongsRef intsRef;
private final IndexDocValuesArray array;
private long minValue;
@ -66,6 +65,8 @@ class IntsImpl {
private int lastDocId = -1;
private final Directory dir;
private final byte typeOrd;
private IndexOutput datOut;
private boolean merging;
protected IntsWriter(Directory dir, String id, AtomicLong bytesUsed,
@ -115,39 +116,116 @@ class IntsImpl {
lastDocId = docID;
array.set(docID, v);
}
private final void initDataOut(byte typeOrd) throws IOException {
if (datOut == null) {
boolean success = false;
try {
datOut = dir.createOutput(IndexFileNames.segmentFileName(id, "",
DATA_EXTENSION));
CodecUtil.writeHeader(datOut, CODEC_NAME, VERSION_CURRENT);
datOut.writeByte(typeOrd);
success = true;
} finally {
if (!success) {
IOUtils.closeSafely(true, datOut);
}
}
}
}
@Override
public void finish(int docCount) throws IOException {
IndexOutput datOut = null;
boolean success = false;
try {
datOut = dir.createOutput(IndexFileNames.segmentFileName(id, "",
DATA_EXTENSION));
CodecUtil.writeHeader(datOut, CODEC_NAME, VERSION_CURRENT);
if (!started) {
minValue = maxValue = 0;
if (datOut == null) {
// if we only add or merge Packed ints datOut is not initialized
assert !merging || typeOrd == PACKED;
finishAdd(docCount);
} else {
assert datOut != null && merging && typeOrd != PACKED;
// on merge, simply fill up missing values
fillDefault(datOut, docCount - (lastDocId + 1));
}
byte headerType = typeOrd;
if (typeOrd == PACKED) {
final long delta = maxValue - minValue;
// if we exceed the range of positive longs we must switch to fixed ints
if (delta <= ( maxValue >= 0 && minValue <= 0 ? Long.MAX_VALUE : Long.MAX_VALUE -1) && delta >= 0) {
writePackedInts(datOut, docCount);
return;
}
headerType = FIXED_64;
}
datOut.writeByte(headerType);
array.write(datOut, docCount);
success = true;
} finally {
IOUtils.closeSafely(!success, datOut);
array.clear();
}
}
private final void finishAdd(int docCount) throws IOException {
if (!started) {
minValue = maxValue = 0;
}
byte headerType = typeOrd;
if (typeOrd == PACKED) {
final long delta = maxValue - minValue;
// if we exceed the range of positive longs we must switch to fixed
// ints
if (delta <= (maxValue >= 0 && minValue <= 0 ? Long.MAX_VALUE
: Long.MAX_VALUE - 1) && delta >= 0) {
writePackedInts(docCount);
return; // done
} else {
headerType = FIXED_64;
}
}
initDataOut(headerType);
array.write(datOut, docCount);
assert datOut != null;
}
// TODO how can we improve VAR_INT mergeing here without violating compression?
@Override
protected void merge(MergeState state) throws IOException {
merging = true;
if (typeOrd != PACKED) {
initDataOut(typeOrd); // init datOut since we merge directly
if (state.bits == null && state.reader instanceof IntsReader) {
// no deleted docs - try bulk copy
final IntsReader reader = (IntsReader) state.reader;
if (reader.type == typeOrd) {
final int docBase = state.docBase;
if (docBase - lastDocId > 1) {
// fill with default values
lastDocId += fillDefault(datOut, docBase - lastDocId - 1);
}
lastDocId += reader.transferTo(datOut);
return;
}
}
}
super.merge(state);
}
@Override
protected void mergeDoc(int docID) throws IOException {
assert docID > lastDocId : "docID: " + docID
+ " must be greater than the last added doc id: " + lastDocId;
assert merging;
final long value = intsRef.get();
if (typeOrd != PACKED) {
// if now packed we do straight merging and write values directly
assert datOut != null;
if (docID - lastDocId > 1) {
// fill with default values
array.writeDefaults(datOut, docID - lastDocId - 1);
}
array.writeDirect(datOut, value);
lastDocId = docID;
} else {
add(docID, value);
}
}
protected final int fillDefault(IndexOutput datOut, int numValues) throws IOException {
array.writeDefaults(datOut, numValues);
return numValues;
}
private void writePackedInts(IndexOutput datOut, int docCount) throws IOException {
datOut.writeByte(PACKED);
private void writePackedInts(int docCount) throws IOException {
initDataOut(PACKED);
datOut.writeLong(minValue);
assert array.type() == ValueType.FIXED_INTS_64;
final long[] docToValue = (long[])array.getArray();
@ -170,11 +248,6 @@ class IntsImpl {
w.finish();
}
@Override
protected void mergeDoc(int docID) throws IOException {
add(docID, intsRef.get());
}
@Override
protected void setNextEnum(ValuesEnum valuesEnum) {
intsRef = valuesEnum.getInt();
@ -198,10 +271,12 @@ class IntsImpl {
static class IntsReader extends IndexDocValues {
private final IndexInput datIn;
private final byte type;
private final int numDocs;
protected IntsReader(Directory dir, String id) throws IOException {
protected IntsReader(Directory dir, String id, int numDocs) throws IOException {
datIn = dir.openInput(IndexFileNames.segmentFileName(id, "",
Writer.DATA_EXTENSION));
this.numDocs = numDocs;
boolean success = false;
try {
CodecUtil.checkHeader(datIn, CODEC_NAME, VERSION_START, VERSION_START);
@ -214,6 +289,21 @@ class IntsImpl {
}
}
public int transferTo(IndexOutput datOut) throws IOException {
IndexInput indexInput = (IndexInput) datIn.clone();
boolean success = false;
try {
indexInput.seek(CodecUtil.headerLength(CODEC_NAME));
// skip type
indexInput.readByte();
datOut.copyBytes(indexInput, bytesPerValue(type) * numDocs);
success = true;
} finally {
IOUtils.closeSafely(!success, indexInput);
}
return numDocs;
}
/**
* Loads the actual values. You may call this more than once, eg if you
* already previously loaded but then discarded the Source.
@ -226,7 +316,7 @@ class IntsImpl {
try {
input = (IndexInput) datIn.clone();
input.seek(CodecUtil.headerLength(CODEC_NAME) + 1);
source = loadFixedSource(type, input);
source = loadFixedSource(type, input, numDocs);
success = true;
return source;
} finally {
@ -248,7 +338,7 @@ class IntsImpl {
boolean success = false;
try {
input.seek(CodecUtil.headerLength(CODEC_NAME) + 1);
final ValuesEnum inst = directEnum(type, source, input);
final ValuesEnum inst = directEnum(type, source, input, numDocs);
success = true;
return inst;
} finally {
@ -264,16 +354,16 @@ class IntsImpl {
}
}
private static ValuesEnum directEnum(byte ord, AttributeSource attrSource, IndexInput input) throws IOException {
private static ValuesEnum directEnum(byte ord, AttributeSource attrSource, IndexInput input, int numDocs) throws IOException {
switch (ord) {
case FIXED_16:
return new ShortValues((AtomicLong)null).getDirectEnum(attrSource, input);
return new ShortValues((AtomicLong)null).getDirectEnum(attrSource, input, numDocs);
case FIXED_32:
return new IntValues((AtomicLong)null).getDirectEnum(attrSource, input);
return new IntValues((AtomicLong)null).getDirectEnum(attrSource, input, numDocs);
case FIXED_64:
return new LongValues((AtomicLong)null).getDirectEnum(attrSource, input);
return new LongValues((AtomicLong)null).getDirectEnum(attrSource, input, numDocs);
case FIXED_8:
return new ByteValues((AtomicLong)null).getDirectEnum(attrSource, input);
return new ByteValues((AtomicLong)null).getDirectEnum(attrSource, input, numDocs);
case PACKED:
return new PackedIntsEnumImpl(attrSource, input);
default:
@ -281,16 +371,16 @@ class IntsImpl {
}
}
private static IndexDocValues.Source loadFixedSource(byte ord, IndexInput input) throws IOException {
private static IndexDocValues.Source loadFixedSource(byte ord, IndexInput input, int numDoc) throws IOException {
switch (ord) {
case FIXED_16:
return new ShortValues(input);
return new ShortValues(input, numDoc);
case FIXED_32:
return new IntValues(input);
return new IntValues(input, numDoc);
case FIXED_64:
return new LongValues(input);
return new LongValues(input, numDoc);
case FIXED_8:
return new ByteValues(input);
return new ByteValues(input, numDoc);
case PACKED:
return new PackedIntsSource(input);
default:
@ -298,6 +388,27 @@ class IntsImpl {
}
}
private static int bytesPerValue(byte typeOrd) {
final int numBytes;
switch (typeOrd) {
case FIXED_16:
numBytes = 2;
break;
case FIXED_32:
numBytes = 4;
break;
case FIXED_64:
numBytes = 8;
break;
case FIXED_8:
numBytes = 1;
break;
default:
throw new IllegalStateException("illegal type ord " + typeOrd);
}
return numBytes;
}
static class PackedIntsSource extends Source {
private final long minValue;
private final long defaultValue;

View File

@ -187,7 +187,7 @@ public class TestDocValues extends LuceneTestCase {
w.add(1, minMax[i][1]);
w.finish(2);
assertEquals(0, trackBytes.get());
IndexDocValues r = Ints.getValues(dir, "test");
IndexDocValues r = Ints.getValues(dir, "test", 2);
Source source = getSource(r);
assertEquals(i + " with min: " + minMax[i][0] + " max: " + minMax[i][1],
expectedTypes[i], source.type());
@ -228,7 +228,7 @@ public class TestDocValues extends LuceneTestCase {
w.add(i, (long) sourceArray[i]);
}
w.finish(sourceArray.length);
IndexDocValues r = Ints.getValues(dir, "test");
IndexDocValues r = Ints.getValues(dir, "test", sourceArray.length);
Source source = r.getSource();
assertTrue(source.hasArray());
byte[] loaded = ((byte[])source.getArray());
@ -249,7 +249,7 @@ public class TestDocValues extends LuceneTestCase {
w.add(i, (long) sourceArray[i]);
}
w.finish(sourceArray.length);
IndexDocValues r = Ints.getValues(dir, "test");
IndexDocValues r = Ints.getValues(dir, "test", sourceArray.length);
Source source = r.getSource();
assertTrue(source.hasArray());
short[] loaded = ((short[])source.getArray());
@ -270,7 +270,7 @@ public class TestDocValues extends LuceneTestCase {
w.add(i, sourceArray[i]);
}
w.finish(sourceArray.length);
IndexDocValues r = Ints.getValues(dir, "test");
IndexDocValues r = Ints.getValues(dir, "test", sourceArray.length);
Source source = r.getSource();
assertTrue(source.hasArray());
long[] loaded = ((long[])source.getArray());
@ -291,7 +291,7 @@ public class TestDocValues extends LuceneTestCase {
w.add(i, (long) sourceArray[i]);
}
w.finish(sourceArray.length);
IndexDocValues r = Ints.getValues(dir, "test");
IndexDocValues r = Ints.getValues(dir, "test", sourceArray.length);
Source source = r.getSource();
assertTrue(source.hasArray());
int[] loaded = ((int[])source.getArray());
@ -362,7 +362,7 @@ public class TestDocValues extends LuceneTestCase {
w.finish(NUM_VALUES + additionalDocs);
assertEquals(0, trackBytes.get());
IndexDocValues r = Ints.getValues(dir, "test");
IndexDocValues r = Ints.getValues(dir, "test", NUM_VALUES + additionalDocs);
for (int iter = 0; iter < 2; iter++) {
Source s = getSource(r);
assertEquals(type, s.type());

View File

@ -271,7 +271,7 @@ public class TestDocValuesIndexing extends LuceneTestCase {
}
assertEquals("advance failed at index: " + i + " of " + r.numDocs()
+ " docs", i, intsEnum.advance(i));
assertEquals(val + "" + mod + " " + i, expected%mod, ints.getInt(i));
assertEquals(val + " mod: " + mod + " index: " + i, expected%mod, ints.getInt(i));
assertEquals(expected%mod, enumRef.get());
}