From 22f37cf1f10153906a90cebdc7f9091fabbd5d76 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 27 Jun 2011 08:21:22 +0000 Subject: [PATCH] LUCENE-3231: Add fixed size DocValues int variants & expose Arrays where possible git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1140047 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/document/IndexDocValuesField.java | 93 +++- .../org/apache/lucene/index/CheckIndex.java | 6 +- .../org/apache/lucene/index/FieldInfos.java | 30 +- .../codecs/DefaultDocValuesConsumer.java | 7 +- .../codecs/DefaultDocValuesProducer.java | 8 +- .../apache/lucene/index/values/Floats.java | 28 +- .../lucene/index/values/IndexDocValues.java | 19 + .../index/values/IndexDocValuesArray.java | 470 ++++++++++++++++++ .../org/apache/lucene/index/values/Ints.java | 9 +- .../apache/lucene/index/values/IntsImpl.java | 361 ++++++-------- .../apache/lucene/index/values/ValueType.java | 70 ++- .../lucene/index/values/ValuesEnum.java | 7 +- .../apache/lucene/index/values/Writer.java | 9 +- .../function/NumericIndexDocValueSource.java | 2 +- .../lucene/index/RandomIndexWriter.java | 14 +- .../lucene/index/values/TestDocValues.java | 297 ++++++++--- .../index/values/TestDocValuesIndexing.java | 83 ++-- .../org/apache/lucene/search/TestSort.java | 2 +- 18 files changed, 1178 insertions(+), 337 deletions(-) create mode 100644 lucene/src/java/org/apache/lucene/index/values/IndexDocValuesArray.java diff --git a/lucene/src/java/org/apache/lucene/document/IndexDocValuesField.java b/lucene/src/java/org/apache/lucene/document/IndexDocValuesField.java index 997cde24501..e7984af610b 100644 --- a/lucene/src/java/org/apache/lucene/document/IndexDocValuesField.java +++ b/lucene/src/java/org/apache/lucene/document/IndexDocValuesField.java @@ -98,12 +98,99 @@ public class IndexDocValuesField extends AbstractField implements PerDocFieldVal /** * Sets the given long value and sets the field's {@link ValueType} to - * {@link ValueType#INTS} unless already 
set. If you want to change the + * {@link ValueType#VAR_INTS} unless already set. If you want to change the * default type use {@link #setType(ValueType)}. */ public void setInt(long value) { + setInt(value, false); + } + + /** + * Sets the given long value as a 64 bit signed integer. + * + * @param value + * the value to set + * @param fixed + * if true {@link ValueType#FIXED_INTS_64} is used + * otherwise {@link ValueType#VAR_INTS} + */ + public void setInt(long value, boolean fixed) { if (type == null) { - type = ValueType.INTS; + type = fixed ? ValueType.FIXED_INTS_64 : ValueType.VAR_INTS; + } + longValue = value; + } + + /** + * Sets the given int value and sets the field's {@link ValueType} to + * {@link ValueType#VAR_INTS} unless already set. If you want to change the + * default type use {@link #setType(ValueType)}. + */ + public void setInt(int value) { + setInt(value, false); + } + + /** + * Sets the given int value as a 32 bit signed integer. + * + * @param value + * the value to set + * @param fixed + * if true {@link ValueType#FIXED_INTS_32} is used + * otherwise {@link ValueType#VAR_INTS} + */ + public void setInt(int value, boolean fixed) { + if (type == null) { + type = fixed ? ValueType.FIXED_INTS_32 : ValueType.VAR_INTS; + } + longValue = value; + } + + /** + * Sets the given short value and sets the field's {@link ValueType} to + * {@link ValueType#VAR_INTS} unless already set. If you want to change the + * default type use {@link #setType(ValueType)}. + */ + public void setInt(short value) { + setInt(value, false); + } + + /** + * Sets the given short value as a 16 bit signed integer. + * + * @param value + * the value to set + * @param fixed + * if true {@link ValueType#FIXED_INTS_16} is used + * otherwise {@link ValueType#VAR_INTS} + */ + public void setInt(short value, boolean fixed) { + if (type == null) { + type = fixed ? 
ValueType.FIXED_INTS_16 : ValueType.VAR_INTS; + } + longValue = value; + } + + /** + * Sets the given byte value and sets the field's {@link ValueType} to + * {@link ValueType#VAR_INTS} unless already set. If you want to change the + * default type use {@link #setType(ValueType)}. + */ + public void setInt(byte value) { + setInt(value, false); + } + /** + * Sets the given byte value as a 8 bit signed integer. + * + * @param value + * the value to set + * @param fixed + * if true {@link ValueType#FIXED_INTS_8} is used + * otherwise {@link ValueType#VAR_INTS} + */ + public void setInt(byte value, boolean fixed) { + if (type == null) { + type = fixed ? ValueType.FIXED_INTS_8 : ValueType.VAR_INTS; } longValue = value; } @@ -268,7 +355,7 @@ public class IndexDocValuesField extends AbstractField implements PerDocFieldVal field.stringValue()); valField.setBytes(ref, type); break; - case INTS: + case VAR_INTS: valField.setInt(Long.parseLong(field.stringValue())); break; case FLOAT_32: diff --git a/lucene/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/src/java/org/apache/lucene/index/CheckIndex.java index fd210992501..ff92206b3bf 100644 --- a/lucene/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/src/java/org/apache/lucene/index/CheckIndex.java @@ -1025,7 +1025,11 @@ public class CheckIndex { case FLOAT_64: values.getFloat(); break; - case INTS: + case VAR_INTS: + case FIXED_INTS_16: + case FIXED_INTS_32: + case FIXED_INTS_64: + case FIXED_INTS_8: values.getInt(); break; default: diff --git a/lucene/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/src/java/org/apache/lucene/index/FieldInfos.java index f3bf5a12681..5e1ddea0fee 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/src/java/org/apache/lucene/index/FieldInfos.java @@ -609,7 +609,7 @@ public final class FieldInfos implements Iterable { b = 0; } else { switch(fi.docValues) { - case INTS: + case VAR_INTS: b = 1; break; case FLOAT_32: @@ -636,6 +636,19 
@@ public final class FieldInfos implements Iterable { case BYTES_VAR_SORTED: b = 9; break; + case FIXED_INTS_16: + b = 10; + break; + case FIXED_INTS_32: + b = 11; + break; + case FIXED_INTS_64: + b = 12; + break; + case FIXED_INTS_8: + b = 13; + break; + default: throw new IllegalStateException("unhandled indexValues type " + fi.docValues); } @@ -686,7 +699,7 @@ public final class FieldInfos implements Iterable { docValuesType = null; break; case 1: - docValuesType = ValueType.INTS; + docValuesType = ValueType.VAR_INTS; break; case 2: docValuesType = ValueType.FLOAT_32; @@ -712,6 +725,19 @@ public final class FieldInfos implements Iterable { case 9: docValuesType = ValueType.BYTES_VAR_SORTED; break; + case 10: + docValuesType = ValueType.FIXED_INTS_16; + break; + case 11: + docValuesType = ValueType.FIXED_INTS_32; + break; + case 12: + docValuesType = ValueType.FIXED_INTS_64; + break; + case 13: + docValuesType = ValueType.FIXED_INTS_8; + break; + default: throw new IllegalStateException("unhandled indexValues type " + b); } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesConsumer.java index 9edadc467df..5569c583570 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesConsumer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesConsumer.java @@ -80,12 +80,17 @@ public class DefaultDocValuesConsumer extends PerDocConsumer { case BYTES_FIXED_STRAIGHT: case FLOAT_32: case FLOAT_64: - case INTS: + case VAR_INTS: + case FIXED_INTS_16: + case FIXED_INTS_32: + case FIXED_INTS_64: + case FIXED_INTS_8: files.add(IndexFileNames.segmentFileName(filename, "", Writer.DATA_EXTENSION)); assert dir.fileExists(IndexFileNames.segmentFileName(filename, "", Writer.DATA_EXTENSION)); break; + default: assert false; } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesProducer.java 
b/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesProducer.java index c00f54fbbd2..19fe8bd395f 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesProducer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesProducer.java @@ -121,8 +121,12 @@ public class DefaultDocValuesProducer extends PerDocValues { protected IndexDocValues loadDocValues(int docCount, Directory dir, String id, ValueType type) throws IOException { switch (type) { - case INTS: - return Ints.getValues(dir, id, false); + case FIXED_INTS_16: + case FIXED_INTS_32: + case FIXED_INTS_64: + case FIXED_INTS_8: + case VAR_INTS: + return Ints.getValues(dir, id); case FLOAT_32: return Floats.getValues(dir, id, docCount); case FLOAT_64: diff --git a/lucene/src/java/org/apache/lucene/index/values/Floats.java b/lucene/src/java/org/apache/lucene/index/values/Floats.java index 7fe03b29161..ae5287b9fb3 100644 --- a/lucene/src/java/org/apache/lucene/index/values/Floats.java +++ b/lucene/src/java/org/apache/lucene/index/values/Floats.java @@ -156,7 +156,7 @@ public class Floats { } // Writes 4 bytes (float) per value - static class Float4Writer extends FloatsWriter { + static final class Float4Writer extends FloatsWriter { private int[] values; protected Float4Writer(Directory dir, String id, AtomicLong bytesUsed) throws IOException { @@ -219,7 +219,7 @@ public class Floats { } // Writes 8 bytes (double) per value - static class Float8Writer extends FloatsWriter { + static final class Float8Writer extends FloatsWriter { private long[] values; protected Float8Writer(Directory dir, String id, AtomicLong bytesUsed) throws IOException { @@ -341,7 +341,7 @@ public class Floats { } } - private class Source4 extends Source { + private final class Source4 extends Source { private final float[] values; Source4(final float[] values ) throws IOException { @@ -367,13 +367,23 @@ public class Floats { }; } + @Override + public Object getArray() { + return this.values; + 
} + + @Override + public boolean hasArray() { + return true; + } + @Override public ValueType type() { return ValueType.FLOAT_32; } } - private class Source8 extends Source { + private final class Source8 extends Source { private final double[] values; Source8(final double[] values) throws IOException { @@ -403,6 +413,16 @@ public class Floats { public ValueType type() { return ValueType.FLOAT_64; } + + @Override + public Object getArray() { + return this.values; + } + + @Override + public boolean hasArray() { + return true; + } } @Override diff --git a/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java b/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java index 32885679f24..e9bde3fb3c1 100644 --- a/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java +++ b/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java @@ -254,6 +254,25 @@ public abstract class IndexDocValues implements Closeable { */ public abstract ValuesEnum getEnum(AttributeSource attrSource) throws IOException; + + /** + * Returns true iff this {@link Source} exposes an array via + * {@link #getArray()} otherwise false. + * + * @return true iff this {@link Source} exposes an array via + * {@link #getArray()} otherwise false. + */ + public boolean hasArray() { + return false; + } + + /** + * Returns the internal array representation iff this {@link Source} uses an + * array as its inner representation, otherwise null. 
+ */ + public Object getArray() { + return null; + } } /** diff --git a/lucene/src/java/org/apache/lucene/index/values/IndexDocValuesArray.java b/lucene/src/java/org/apache/lucene/index/values/IndexDocValuesArray.java new file mode 100644 index 00000000000..af77371493c --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/values/IndexDocValuesArray.java @@ -0,0 +1,470 @@ +package org.apache.lucene.index.values; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.values.IndexDocValues.Source; +import org.apache.lucene.index.values.IndexDocValues.SourceEnum; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.LongsRef; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +/** + * @lucene.experimental + */ +abstract class IndexDocValuesArray extends Source { + + private final AtomicLong bytesUsed; + private final int bytesPerValue; + private int size = 0; + private final ValueType type; + protected int maxDocID = -1; + + IndexDocValuesArray(AtomicLong bytesUsed, int bytesPerValue, ValueType type) { + this.bytesUsed = bytesUsed; + this.bytesPerValue = bytesPerValue; + this.type = type; + } + + void set(int docId, long value) { + if (docId >= size) { + adjustSize(grow(docId + 1)); + } + if (docId > maxDocID) { + maxDocID = docId; + } + setInternal(docId, value); + } + + protected final void adjustSize(int newSize) { + bytesUsed.addAndGet(bytesPerValue * (newSize - size)); + size = newSize; + } + + void clear() { + adjustSize(0); + maxDocID = -1; + size = 0; + } + + protected abstract void setInternal(int docId, long value); + + protected abstract int grow(int numDocs); + + abstract void write(IndexOutput output, int numDocs) throws IOException; + + @Override + public final int getValueCount() { + return maxDocID + 1; + } + + @Override + public final ValueType type() { + return type; + } + + @Override + public ValuesEnum getEnum(AttributeSource attrSource) throws IOException { + return new SourceEnum(attrSource, type(), this, maxDocID + 1) { + + @Override + public int advance(int target) throws IOException { + if (target >= numDocs) { + return pos = NO_MORE_DOCS; + } + intsRef.ints[intsRef.offset] = IndexDocValuesArray.this.getInt(target); + return pos = target; + } + }; + } + + abstract ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input) + throws IOException; + + @Override + public final boolean hasArray() { + return true; + } + + final static class ByteValues extends IndexDocValuesArray { + private byte[] values; + + ByteValues(AtomicLong bytesUsed) { + super(bytesUsed, 1, ValueType.FIXED_INTS_8); + values = new byte[0]; + } + + ByteValues(IndexInput input) throws IOException { + super(new AtomicLong(), 1, 
ValueType.FIXED_INTS_8); + final int numDocs = input.readInt(); + values = new byte[numDocs]; + adjustSize(numDocs); + input.readBytes(values, 0, values.length, false); + maxDocID = numDocs - 1; + } + + @Override + public byte[] getArray() { + return values; + } + + @Override + public long getInt(int docID) { + assert docID >= 0 && docID < values.length; + return values[docID]; + } + + @Override + protected void setInternal(int docId, long value) { + values[docId] = (byte) (0xFFL & value); + } + + @Override + protected int grow(int numDocs) { + values = ArrayUtil.grow(values, numDocs); + return values.length; + } + + @Override + void write(IndexOutput output, int numDocs) throws IOException { + assert maxDocID + 1 <= numDocs; + output.writeInt(numDocs); + output.writeBytes(values, 0, maxDocID + 1); + final byte zero = 0; + for (int i = maxDocID + 1; i < numDocs; i++) { + output.writeByte(zero); + } + } + + @Override + ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input) + throws IOException { + return new FixedIntsEnumImpl(attrSource, input, type()) { + @Override + protected void fillNext(LongsRef ref, IndexInput dataIn) + throws IOException { + ref.ints[ref.offset] = dataIn.readByte(); + } + }; + } + + @Override + void clear() { + super.clear(); + values = new byte[0]; + } + }; + + final static class ShortValues extends IndexDocValuesArray { + private short[] values; + + ShortValues(AtomicLong bytesUsed) { + super(bytesUsed, RamUsageEstimator.NUM_BYTES_SHORT, + ValueType.FIXED_INTS_16); + values = new short[0]; + } + + ShortValues(IndexInput input) throws IOException { + super(new AtomicLong(), RamUsageEstimator.NUM_BYTES_SHORT, + ValueType.FIXED_INTS_16); + final int numDocs = input.readInt(); + values = new short[numDocs]; + adjustSize(numDocs); + for (int i = 0; i < values.length; i++) { + values[i] = input.readShort(); + } + maxDocID = numDocs - 1; + } + + @Override + public short[] getArray() { + return values; + } + + @Override + public 
long getInt(int docID) { + assert docID >= 0 && docID < values.length; + return values[docID]; + } + + @Override + protected void setInternal(int docId, long value) { + values[docId] = (short) (0xFFFF & value); + } + + @Override + protected int grow(int numDocs) { + values = ArrayUtil.grow(values, numDocs); + return values.length; + } + + @Override + void write(IndexOutput output, int numDocs) throws IOException { + assert maxDocID + 1 <= numDocs; + output.writeInt(numDocs); + for (int i = 0; i < maxDocID + 1; i++) { + output.writeShort(values[i]); + } + final short zero = 0; + for (int i = maxDocID + 1; i < numDocs; i++) { + output.writeShort(zero); + } + } + + @Override + ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input) + throws IOException { + return new FixedIntsEnumImpl(attrSource, input, type()) { + @Override + protected void fillNext(LongsRef ref, IndexInput dataIn) + throws IOException { + ref.ints[ref.offset] = dataIn.readShort(); + } + }; + } + + @Override + void clear() { + super.clear(); + values = new short[0]; + } + + }; + + final static class IntValues extends IndexDocValuesArray { + private int[] values; + + IntValues(AtomicLong bytesUsed) { + super(bytesUsed, RamUsageEstimator.NUM_BYTES_INT, ValueType.FIXED_INTS_32); + values = new int[0]; + } + + IntValues(IndexInput input) throws IOException { + super(new AtomicLong(), RamUsageEstimator.NUM_BYTES_INT, + ValueType.FIXED_INTS_32); + final int numDocs = input.readInt(); + values = new int[numDocs]; + adjustSize(numDocs); + for (int i = 0; i < values.length; i++) { + values[i] = input.readInt(); + } + maxDocID = numDocs - 1; + } + + @Override + public int[] getArray() { + return values; + } + + @Override + public long getInt(int docID) { + assert docID >= 0 && docID < values.length; + return 0xFFFFFFFF & values[docID]; + } + + @Override + protected void setInternal(int docId, long value) { + values[docId] = (int) (0xFFFFFFFF & value); + } + + @Override + protected int grow(int 
numDocs) { + values = ArrayUtil.grow(values, numDocs); + return values.length; + } + + @Override + void write(IndexOutput output, int numDocs) throws IOException { + assert maxDocID + 1 <= numDocs; + output.writeInt(numDocs); + for (int i = 0; i < maxDocID + 1; i++) { + output.writeInt(values[i]); + } + for (int i = maxDocID + 1; i < numDocs; i++) { + output.writeInt(0); + } + } + + @Override + ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input) + throws IOException { + return new FixedIntsEnumImpl(attrSource, input, type()) { + @Override + protected void fillNext(LongsRef ref, IndexInput dataIn) + throws IOException { + ref.ints[ref.offset] = dataIn.readInt(); + } + }; + } + + @Override + void clear() { + super.clear(); + values = new int[0]; + } + }; + + final static class LongValues extends IndexDocValuesArray { + private long[] values; + + LongValues(AtomicLong bytesUsed) { + super(bytesUsed, RamUsageEstimator.NUM_BYTES_LONG, + ValueType.FIXED_INTS_64); + values = new long[0]; + } + + LongValues(IndexInput input) throws IOException { + super(new AtomicLong(), RamUsageEstimator.NUM_BYTES_LONG, + ValueType.FIXED_INTS_64); + final int numDocs = input.readInt(); + values = new long[numDocs]; + adjustSize(numDocs); + for (int i = 0; i < values.length; i++) { + values[i] = input.readLong(); + } + maxDocID = numDocs - 1; + } + + @Override + public long[] getArray() { + return values; + } + + @Override + public long getInt(int docID) { + assert docID >= 0 && docID < values.length; + return values[docID]; + } + + @Override + protected void setInternal(int docId, long value) { + values[docId] = value; + } + + @Override + protected int grow(int numDocs) { + values = ArrayUtil.grow(values, numDocs); + return values.length; + } + + @Override + void write(IndexOutput output, int numDocs) throws IOException { + assert maxDocID + 1 <= numDocs; + output.writeInt(numDocs); + for (int i = 0; i < maxDocID + 1; i++) { + output.writeLong(values[i]); + } + + for 
(int i = maxDocID + 1; i < numDocs; i++) { + output.writeLong(0l); + } + } + + @Override + ValuesEnum getDirectEnum(AttributeSource attrSource, IndexInput input) + throws IOException { + return new FixedIntsEnumImpl(attrSource, input, type()) { + @Override + protected void fillNext(LongsRef ref, IndexInput dataIn) + throws IOException { + ref.ints[ref.offset] = dataIn.readLong(); + } + }; + } + + @Override + void clear() { + super.clear(); + values = new long[0]; + } + }; + + private abstract static class FixedIntsEnumImpl extends ValuesEnum { + private final IndexInput dataIn; + private final int maxDoc; + private final int sizeInByte; + private int pos = -1; + + private FixedIntsEnumImpl(AttributeSource source, IndexInput dataIn, + ValueType type) throws IOException { + super(source, type); + switch (type) { + case FIXED_INTS_16: + sizeInByte = 2; + break; + case FIXED_INTS_32: + sizeInByte = 4; + break; + case FIXED_INTS_64: + sizeInByte = 8; + break; + case FIXED_INTS_8: + sizeInByte = 1; + break; + default: + throw new IllegalStateException("type " + type + + " is not a fixed int type"); + } + intsRef.offset = 0; + this.dataIn = dataIn; + maxDoc = dataIn.readInt(); + + } + + @Override + public void close() throws IOException { + dataIn.close(); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) { + return pos = NO_MORE_DOCS; + } + assert target > pos; + if (target > pos + 1) { + dataIn + .seek(dataIn.getFilePointer() + ((target - pos - 1) * sizeInByte)); + } + fillNext(intsRef, dataIn); + return pos = target; + } + + protected abstract void fillNext(LongsRef ref, IndexInput input) + throws IOException; + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + if (pos >= maxDoc) { + return pos = NO_MORE_DOCS; + } + return advance(pos + 1); + } + } + +} diff --git a/lucene/src/java/org/apache/lucene/index/values/Ints.java 
b/lucene/src/java/org/apache/lucene/index/values/Ints.java index d3cf1039538..c5f18c6466e 100644 --- a/lucene/src/java/org/apache/lucene/index/values/Ints.java +++ b/lucene/src/java/org/apache/lucene/index/values/Ints.java @@ -33,14 +33,11 @@ public class Ints { private Ints() { } - public static Writer getWriter(Directory dir, String id, - boolean useFixedArray, AtomicLong bytesUsed) throws IOException { - // TODO - implement fixed?! - return new IntsWriter(dir, id, bytesUsed); + public static Writer getWriter(Directory dir, String id, AtomicLong bytesUsed, ValueType type) throws IOException { + return new IntsWriter(dir, id, bytesUsed, type); } - public static IndexDocValues getValues(Directory dir, String id, - boolean useFixedArray) throws IOException { + public static IndexDocValues getValues(Directory dir, String id) throws IOException { return new IntsReader(dir, id); } } diff --git a/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java b/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java index 4921f4b777d..a525df0e35a 100644 --- a/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java +++ b/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java @@ -21,19 +21,24 @@ import java.util.Collection; import java.util.concurrent.atomic.AtomicLong; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.values.IndexDocValuesArray; +import org.apache.lucene.index.values.IndexDocValues.Source; +import org.apache.lucene.index.values.IndexDocValues.SourceEnum; +import org.apache.lucene.index.values.IndexDocValuesArray.ByteValues; +import org.apache.lucene.index.values.IndexDocValuesArray.IntValues; +import org.apache.lucene.index.values.IndexDocValuesArray.LongValues; +import org.apache.lucene.index.values.IndexDocValuesArray.ShortValues; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.util.ArrayUtil; import 
org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LongsRef; -import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.packed.PackedInts; /** - * Stores ints packed with fixed-bit precision. + * Stores ints packed and fixed with fixed-bit precision. * * @lucene.experimental * */ @@ -41,45 +46,59 @@ class IntsImpl { private static final String CODEC_NAME = "Ints"; private static final byte PACKED = 0x00; - private static final byte FIXED = 0x01; - + private static final byte FIXED_64 = 0x01; + private static final byte FIXED_32 = 0x02; + private static final byte FIXED_16 = 0x03; + private static final byte FIXED_8 = 0x04; + static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; static class IntsWriter extends Writer { - // TODO: can we bulkcopy this on a merge? + // TODO: optimize merging here!! private LongsRef intsRef; - private long[] docToValue; + private final IndexDocValuesArray array; private long minValue; private long maxValue; private boolean started; private final String id; private int lastDocId = -1; - private IndexOutput datOut; + private final Directory dir; + private final byte typeOrd; + - protected IntsWriter(Directory dir, String id, AtomicLong bytesUsed) - throws IOException { + protected IntsWriter(Directory dir, String id, AtomicLong bytesUsed, + ValueType valueType) throws IOException { super(bytesUsed); - datOut = dir.createOutput(IndexFileNames.segmentFileName(id, "", - DATA_EXTENSION)); - boolean success = false; - try { - CodecUtil.writeHeader(datOut, CODEC_NAME, VERSION_CURRENT); - this.id = id; - docToValue = new long[1]; - bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_LONG); // TODO the - // bitset - // needs memory - // too - success = true; - } finally { - if (!success) { - datOut.close(); - } + this.dir = dir; + this.id = id; + switch (valueType) { + case FIXED_INTS_16: + array= 
new ShortValues(bytesUsed); + typeOrd = FIXED_16; + break; + case FIXED_INTS_32: + array = new IntValues(bytesUsed); + typeOrd = FIXED_32; + break; + case FIXED_INTS_64: + array = new LongValues(bytesUsed); + typeOrd = FIXED_64; + break; + case FIXED_INTS_8: + array = new ByteValues(bytesUsed); + typeOrd = FIXED_8; + break; + case VAR_INTS: + array = new LongValues(bytesUsed); + typeOrd = PACKED; + break; + default: + throw new IllegalStateException("unknown type " + valueType); } } - + @Override public void add(int docID, long v) throws IOException { assert lastDocId < docID; @@ -94,65 +113,60 @@ class IntsImpl { } } lastDocId = docID; - - if (docID >= docToValue.length) { - final long len = docToValue.length; - docToValue = ArrayUtil.grow(docToValue, 1 + docID); - bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_LONG - * ((docToValue.length) - len)); - } - docToValue[docID] = v; + array.set(docID, v); } @Override public void finish(int docCount) throws IOException { + IndexOutput datOut = null; + boolean success = false; try { + datOut = dir.createOutput(IndexFileNames.segmentFileName(id, "", + DATA_EXTENSION)); + CodecUtil.writeHeader(datOut, CODEC_NAME, VERSION_CURRENT); if (!started) { minValue = maxValue = 0; } - // if we exceed the range of positive longs we must switch to fixed ints - if ((maxValue - minValue) < (((long)1) << 63) && (maxValue - minValue) >= 0) { - writePackedInts(docCount); - } else { - writeFixedInts(docCount); + byte headerType = typeOrd; + if (typeOrd == PACKED) { + final long delta = maxValue - minValue; + // if we exceed the range of positive longs we must switch to fixed ints + if (delta <= ( maxValue >= 0 && minValue <= 0 ? 
Long.MAX_VALUE : Long.MAX_VALUE -1) && delta >= 0) { + writePackedInts(datOut, docCount); + return; + } + headerType = FIXED_64; } - + datOut.writeByte(headerType); + array.write(datOut, docCount); + success = true; } finally { - datOut.close(); - bytesUsed - .addAndGet(-(RamUsageEstimator.NUM_BYTES_LONG * docToValue.length)); - docToValue = null; + IOUtils.closeSafely(!success, datOut); + array.clear(); } } - private void writeFixedInts(int docCount) throws IOException { - datOut.writeByte(FIXED); - datOut.writeInt(docCount); - for (int i = 0; i < docToValue.length; i++) { - datOut.writeLong(docToValue[i]); // write full array - we use 0 as default - } - for (int i = docToValue.length; i < docCount; i++) { - datOut.writeLong(0); // fill with defaults values - } - } - - private void writePackedInts(int docCount) throws IOException { + private void writePackedInts(IndexOutput datOut, int docCount) throws IOException { datOut.writeByte(PACKED); datOut.writeLong(minValue); + assert array.type() == ValueType.FIXED_INTS_64; + final long[] docToValue = (long[])array.getArray(); // write a default value to recognize docs without a value for that // field - final long defaultValue = maxValue>= 0 && minValue <=0 ? 0-minValue : ++maxValue-minValue; + final long defaultValue = maxValue >= 0 && minValue <= 0 ? 0 - minValue + : ++maxValue - minValue; datOut.writeLong(defaultValue); PackedInts.Writer w = PackedInts.getWriter(datOut, docCount, - PackedInts.bitsRequired(maxValue-minValue)); - final int limit = docToValue.length > docCount ? docCount : docToValue.length; + PackedInts.bitsRequired(maxValue - minValue)); + final int limit = docToValue.length > docCount ? docCount + : docToValue.length; for (int i = 0; i < limit; i++) { w.add(docToValue[i] == 0 ? 
defaultValue : docToValue[i] - minValue); } for (int i = limit; i < docCount; i++) { w.add(defaultValue); } - + w.finish(); } @@ -183,7 +197,7 @@ class IntsImpl { */ static class IntsReader extends IndexDocValues { private final IndexInput datIn; - private final boolean packed; + private final byte type; protected IntsReader(Directory dir, String id) throws IOException { datIn = dir.openInput(IndexFileNames.segmentFileName(id, "", @@ -191,7 +205,7 @@ class IntsImpl { boolean success = false; try { CodecUtil.checkHeader(datIn, CODEC_NAME, VERSION_START, VERSION_START); - packed = PACKED == datIn.readByte(); + type = datIn.readByte(); success = true; } finally { if (!success) { @@ -206,100 +220,21 @@ class IntsImpl { */ @Override public Source load() throws IOException { - final IndexInput input = (IndexInput) datIn.clone(); boolean success = false; + final Source source; + IndexInput input = null; try { - final Source source = packed ? new PackedIntsSource(input) - : new FixedIntsSource(input); + input = (IndexInput) datIn.clone(); + input.seek(CodecUtil.headerLength(CODEC_NAME) + 1); + source = loadFixedSource(type, input); success = true; return source; } finally { if (!success) { - IOUtils.closeSafely(true, datIn); + IOUtils.closeSafely(true, input, datIn); } } } - - private static class FixedIntsSource extends Source { - private final long[] values; - public FixedIntsSource(IndexInput dataIn) throws IOException { - dataIn.seek(CodecUtil.headerLength(CODEC_NAME) + 1); - final int numDocs = dataIn.readInt(); - values = new long[numDocs]; - for (int i = 0; i < values.length; i++) { - values[i] = dataIn.readLong(); - } - } - - @Override - public long getInt(int docID) { - assert docID >= 0 && docID < values.length; - return values[docID]; - } - - @Override - public ValueType type() { - return ValueType.INTS; - } - - @Override - public ValuesEnum getEnum(AttributeSource attrSource) - throws IOException { - return new SourceEnum(attrSource, type(), this, 
values.length) { - - @Override - public int advance(int target) throws IOException { - if (target >= numDocs) - return pos = NO_MORE_DOCS; - intsRef.ints[intsRef.offset] = values[target]; - return pos = target; - } - }; - } - - } - - private static class PackedIntsSource extends Source { - private final long minValue; - private final long defaultValue; - private final PackedInts.Reader values; - - public PackedIntsSource(IndexInput dataIn) throws IOException { - dataIn.seek(CodecUtil.headerLength(CODEC_NAME) + 1); - minValue = dataIn.readLong(); - defaultValue = dataIn.readLong(); - values = PackedInts.getReader(dataIn); - } - - @Override - public long getInt(int docID) { - // TODO -- can we somehow avoid 2X method calls - // on each get? must push minValue down, and make - // PackedInts implement Ints.Source - assert docID >= 0; - final long value = values.get(docID); - return value == defaultValue ? 0 : minValue + value; - } - - @Override - public ValuesEnum getEnum(AttributeSource attrSource) - throws IOException { - return new SourceEnum(attrSource, type(), this, values.size()) { - @Override - public int advance(int target) throws IOException { - if (target >= numDocs) - return pos = NO_MORE_DOCS; - intsRef.ints[intsRef.offset] = source.getInt(target); - return pos = target; - } - }; - } - - @Override - public ValueType type() { - return ValueType.INTS; - } - } @Override public void close() throws IOException { @@ -312,8 +247,8 @@ class IntsImpl { final IndexInput input = (IndexInput) datIn.clone(); boolean success = false; try { - ValuesEnum inst = packed ? 
new PackedIntsEnumImpl(source, input) - : new FixedIntsEnumImpl(source, input); + input.seek(CodecUtil.headerLength(CODEC_NAME) + 1); + final ValuesEnum inst = directEnum(type, source, input); success = true; return inst; } finally { @@ -325,11 +260,86 @@ class IntsImpl { @Override public ValueType type() { - return ValueType.INTS; + return ValueType.VAR_INTS; + } + } + + private static ValuesEnum directEnum(byte ord, AttributeSource attrSource, IndexInput input) throws IOException { + switch (ord) { + case FIXED_16: + return new ShortValues((AtomicLong)null).getDirectEnum(attrSource, input); + case FIXED_32: + return new IntValues((AtomicLong)null).getDirectEnum(attrSource, input); + case FIXED_64: + return new LongValues((AtomicLong)null).getDirectEnum(attrSource, input); + case FIXED_8: + return new ByteValues((AtomicLong)null).getDirectEnum(attrSource, input); + case PACKED: + return new PackedIntsEnumImpl(attrSource, input); + default: + throw new IllegalStateException("unknown type ordinal " + ord); + } + } + + private static IndexDocValues.Source loadFixedSource(byte ord, IndexInput input) throws IOException { + switch (ord) { + case FIXED_16: + return new ShortValues(input); + case FIXED_32: + return new IntValues(input); + case FIXED_64: + return new LongValues(input); + case FIXED_8: + return new ByteValues(input); + case PACKED: + return new PackedIntsSource(input); + default: + throw new IllegalStateException("unknown type ordinal " + ord); + } + } + + static class PackedIntsSource extends Source { + private final long minValue; + private final long defaultValue; + private final PackedInts.Reader values; + + public PackedIntsSource(IndexInput dataIn) throws IOException { + + minValue = dataIn.readLong(); + defaultValue = dataIn.readLong(); + values = PackedInts.getReader(dataIn); } + @Override + public long getInt(int docID) { + // TODO -- can we somehow avoid 2X method calls + // on each get? 
must push minValue down, and make + // PackedInts implement Ints.Source + assert docID >= 0; + final long value = values.get(docID); + return value == defaultValue ? 0 : minValue + value; + } + + @Override + public ValuesEnum getEnum(AttributeSource attrSource) throws IOException { + return new SourceEnum(attrSource, type(), this, values.size()) { + @Override + public int advance(int target) throws IOException { + if (target >= numDocs) + return pos = NO_MORE_DOCS; + intsRef.ints[intsRef.offset] = source.getInt(target); + return pos = target; + } + }; + } + + @Override + public ValueType type() { + return ValueType.VAR_INTS; + } } + private static final class PackedIntsEnumImpl extends ValuesEnum { private final PackedInts.ReaderIterator ints; private long minValue; @@ -340,10 +350,9 @@ class IntsImpl { private PackedIntsEnumImpl(AttributeSource source, IndexInput dataIn) throws IOException { - super(source, ValueType.INTS); + super(source, ValueType.VAR_INTS); intsRef.offset = 0; this.dataIn = dataIn; - dataIn.seek(CodecUtil.headerLength(CODEC_NAME) + 1); minValue = dataIn.readLong(); defaultValue = dataIn.readLong(); this.ints = PackedInts.getReaderIterator(dataIn); @@ -379,51 +388,7 @@ class IntsImpl { return advance(pos + 1); } } - - private static final class FixedIntsEnumImpl extends ValuesEnum { - private final IndexInput dataIn; - private final int maxDoc; - private int pos = -1; - private FixedIntsEnumImpl(AttributeSource source, IndexInput dataIn) - throws IOException { - super(source, ValueType.INTS); - intsRef.offset = 0; - this.dataIn = dataIn; - dataIn.seek(CodecUtil.headerLength(CODEC_NAME) + 1); - maxDoc = dataIn.readInt(); - } - - @Override - public void close() throws IOException { - dataIn.close(); - } - - @Override - public int advance(int target) throws IOException { - if (target >= maxDoc) { - return pos = NO_MORE_DOCS; - } - assert target > pos; - if (target > pos+1) { - dataIn.seek(dataIn.getFilePointer() + ((target - pos - 1) * 8)); - } - 
intsRef.ints[intsRef.offset] = dataIn.readLong(); - return pos = target; - } - - @Override - public int docID() { - return pos; - } - - @Override - public int nextDoc() throws IOException { - if (pos >= maxDoc) { - return pos = NO_MORE_DOCS; - } - return advance(pos + 1); - } - } + } \ No newline at end of file diff --git a/lucene/src/java/org/apache/lucene/index/values/ValueType.java b/lucene/src/java/org/apache/lucene/index/values/ValueType.java index 4680c4eac1b..974d4961205 100644 --- a/lucene/src/java/org/apache/lucene/index/values/ValueType.java +++ b/lucene/src/java/org/apache/lucene/index/values/ValueType.java @@ -18,8 +18,8 @@ package org.apache.lucene.index.values; */ import org.apache.lucene.index.codecs.Codec; -import org.apache.lucene.index.codecs.PerDocConsumer; import org.apache.lucene.index.values.IndexDocValues.SortedSource; +import org.apache.lucene.index.values.IndexDocValues.Source; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.packed.PackedInts; @@ -32,16 +32,14 @@ import org.apache.lucene.util.packed.PackedInts; * @lucene.experimental */ public enum ValueType { - /* - * TODO: Add INT_32 INT_64 INT_16 & INT_8?! - */ + /** - * A 64 bit integer value. By default this type uses + * A variable bit signed integer value. By default this type uses * {@link PackedInts} to compress the values, as an offset * from the minimum value, as long as the value range * fits into 2<sup>63</sup>-1. Otherwise, * the default implementation falls back to fixed size 64bit - * integers. + * integers ({@link #FIXED_INTS_64}). *

* NOTE: this type uses 0 as the default value without any * distinction between provided 0 values during indexing. All @@ -50,13 +48,65 @@ public enum ValueType { * value assigned. Custom default values must be assigned explicitly. *

*/ - INTS, + VAR_INTS, + + /** + * An 8 bit signed integer value. {@link Source} instances of + * this type return a byte array from {@link Source#getArray()} + *

+ * NOTE: this type uses 0 as the default value without any + * distinction between provided 0 values during indexing. All + * documents without an explicit value will use 0 instead. In turn, + * {@link ValuesEnum} instances will not skip documents without an explicit + * value assigned. Custom default values must be assigned explicitly. + *

+ */ + FIXED_INTS_8, + + /** + * A 16 bit signed integer value. {@link Source} instances of + * this type return a short array from {@link Source#getArray()} + *

+ * NOTE: this type uses 0 as the default value without any + * distinction between provided 0 values during indexing. All + * documents without an explicit value will use 0 instead. In turn, + * {@link ValuesEnum} instances will not skip documents without an explicit + * value assigned. Custom default values must be assigned explicitly. + *

+ */ + FIXED_INTS_16, + + /** + * A 32 bit signed integer value. {@link Source} instances of + * this type return an int array from {@link Source#getArray()} + *

+ * NOTE: this type uses 0 as the default value without any + * distinction between provided 0 values during indexing. All + * documents without an explicit value will use 0 instead. In turn, + * {@link ValuesEnum} instances will not skip documents without an explicit + * value assigned. Custom default values must be assigned explicitly. + *

+ */ + FIXED_INTS_32, + /** + * A 64 bit signed integer value. {@link Source} instances of + * this type return a long array from {@link Source#getArray()} + *

+ * NOTE: this type uses 0 as the default value without any + * distinction between provided 0 values during indexing. All + * documents without an explicit value will use 0 instead. In turn, + * {@link ValuesEnum} instances will not skip documents without an explicit + * value assigned. Custom default values must be assigned explicitly. + *

+ */ + FIXED_INTS_64, /** * A 32 bit floating point value. By default there is no compression * applied. To fit custom float values into less than 32bit either a custom * implementation is needed or values must be encoded into a - * {@link #BYTES_FIXED_STRAIGHT} type. + * {@link #BYTES_FIXED_STRAIGHT} type. {@link Source} instances of + * this type return a float array from {@link Source#getArray()} *

* NOTE: this type uses 0.0f as the default value without any * distinction between provided 0.0f values during indexing. All @@ -67,10 +117,12 @@ public enum ValueType { */ FLOAT_32, /** + * * A 64 bit floating point value. By default there is no compression * applied. To fit custom float values into less than 64bit either a custom * implementation is needed or values must be encoded into a - * {@link #BYTES_FIXED_STRAIGHT} type. + * {@link #BYTES_FIXED_STRAIGHT} type. {@link Source} instances of + * this type return a double array from {@link Source#getArray()} *

* NOTE: this type uses 0.0d as the default value without any * distinction between provided 0.0d values during indexing. All diff --git a/lucene/src/java/org/apache/lucene/index/values/ValuesEnum.java b/lucene/src/java/org/apache/lucene/index/values/ValuesEnum.java index 03512073cde..f7815483503 100644 --- a/lucene/src/java/org/apache/lucene/index/values/ValuesEnum.java +++ b/lucene/src/java/org/apache/lucene/index/values/ValuesEnum.java @@ -71,13 +71,18 @@ public abstract class ValuesEnum extends DocIdSetIterator { case BYTES_VAR_STRAIGHT: bytesRef = new BytesRef(); break; - case INTS: + case FIXED_INTS_16: + case FIXED_INTS_32: + case FIXED_INTS_64: + case FIXED_INTS_8: + case VAR_INTS: intsRef = new LongsRef(1); break; case FLOAT_32: case FLOAT_64: floatsRef = new FloatsRef(1); break; + } } diff --git a/lucene/src/java/org/apache/lucene/index/values/Writer.java b/lucene/src/java/org/apache/lucene/index/values/Writer.java index bdb7d65dc7f..5b12082f688 100644 --- a/lucene/src/java/org/apache/lucene/index/values/Writer.java +++ b/lucene/src/java/org/apache/lucene/index/values/Writer.java @@ -197,8 +197,12 @@ public abstract class Writer extends DocValuesConsumer { comp = BytesRef.getUTF8SortedAsUnicodeComparator(); } switch (type) { - case INTS: - return Ints.getWriter(directory, id, true, bytesUsed); + case FIXED_INTS_16: + case FIXED_INTS_32: + case FIXED_INTS_64: + case FIXED_INTS_8: + case VAR_INTS: + return Ints.getWriter(directory, id, bytesUsed, type); case FLOAT_32: return Floats.getWriter(directory, id, 4, bytesUsed); case FLOAT_64: @@ -221,6 +225,7 @@ public abstract class Writer extends DocValuesConsumer { case BYTES_VAR_SORTED: return Bytes.getWriter(directory, id, Bytes.Mode.SORTED, comp, false, bytesUsed); + default: throw new IllegalArgumentException("Unknown Values: " + type); } diff --git a/lucene/src/java/org/apache/lucene/search/function/NumericIndexDocValueSource.java 
b/lucene/src/java/org/apache/lucene/search/function/NumericIndexDocValueSource.java index 8b85a6ae587..3e03e692334 100644 --- a/lucene/src/java/org/apache/lucene/search/function/NumericIndexDocValueSource.java +++ b/lucene/src/java/org/apache/lucene/search/function/NumericIndexDocValueSource.java @@ -59,7 +59,7 @@ public class NumericIndexDocValueSource extends ValueSource { } }; - case INTS: + case VAR_INTS: return new DocValues() { @Override public String toString(int doc) { diff --git a/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java b/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java index acf3585e0aa..8338a9a537e 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java +++ b/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java @@ -200,9 +200,21 @@ public class RandomIndexWriter implements Closeable { case FLOAT_64: docValuesField.setFloat(random.nextDouble()); break; - case INTS: + case VAR_INTS: + docValuesField.setInt(random.nextLong()); + break; + case FIXED_INTS_16: + docValuesField.setInt(random.nextInt(Short.MAX_VALUE)); + break; + case FIXED_INTS_32: docValuesField.setInt(random.nextInt()); break; + case FIXED_INTS_64: + docValuesField.setInt(random.nextLong()); + break; + case FIXED_INTS_8: + docValuesField.setInt(random.nextInt(128)); + break; default: throw new IllegalArgumentException("no such type: " + type); } diff --git a/lucene/src/test/org/apache/lucene/index/values/TestDocValues.java b/lucene/src/test/org/apache/lucene/index/values/TestDocValues.java index c1365d824ac..3abe6bfb39c 100644 --- a/lucene/src/test/org/apache/lucene/index/values/TestDocValues.java +++ b/lucene/src/test/org/apache/lucene/index/values/TestDocValues.java @@ -170,76 +170,241 @@ public class TestDocValues extends LuceneTestCase { dir.close(); } - public void testInts() throws IOException { - long[] maxMin = new long[] { - Long.MIN_VALUE, Long.MAX_VALUE, - 1, Long.MAX_VALUE, - 
0, Long.MAX_VALUE, - -1, Long.MAX_VALUE, - Long.MIN_VALUE, -1, - random.nextInt(), random.nextInt() }; - for (int j = 0; j < maxMin.length; j+=2) { - long maxV = 1; - final int NUM_VALUES = 777 + random.nextInt(777); - final long[] values = new long[NUM_VALUES]; - for (int rx = 1; rx < 63; rx++, maxV *= 2) { - Directory dir = newDirectory(); - final AtomicLong trackBytes = new AtomicLong(0); - Writer w = Ints.getWriter(dir, "test", false, trackBytes); - values[0] = maxMin[j]; - w.add(0, values[0]); - values[1] = maxMin[j+1]; - w.add(1, values[1]); - for (int i = 2; i < NUM_VALUES; i++) { - final long v = random.nextLong() % (1 + maxV); - values[i] = v; - w.add(i, v); - } - final int additionalDocs = 1 + random.nextInt(9); - w.finish(NUM_VALUES + additionalDocs); - assertEquals(0, trackBytes.get()); + public void testVariableIntsLimits() throws IOException { + long[][] minMax = new long[][] { { Long.MIN_VALUE, Long.MAX_VALUE }, + { Long.MIN_VALUE + 1, 1 }, { -1, Long.MAX_VALUE }, + { Long.MIN_VALUE, -1 }, { 1, Long.MAX_VALUE }, + { -1, Long.MAX_VALUE - 1 }, { Long.MIN_VALUE + 2, 1 }, }; + ValueType[] expectedTypes = new ValueType[] { ValueType.FIXED_INTS_64, + ValueType.FIXED_INTS_64, ValueType.FIXED_INTS_64, + ValueType.FIXED_INTS_64, ValueType.VAR_INTS, ValueType.VAR_INTS, + ValueType.VAR_INTS, }; + for (int i = 0; i < minMax.length; i++) { + Directory dir = newDirectory(); + final AtomicLong trackBytes = new AtomicLong(0); + Writer w = Ints.getWriter(dir, "test", trackBytes, ValueType.VAR_INTS); + w.add(0, minMax[i][0]); + w.add(1, minMax[i][1]); + w.finish(2); + assertEquals(0, trackBytes.get()); + IndexDocValues r = Ints.getValues(dir, "test"); + Source source = getSource(r); + assertEquals(i + " with min: " + minMax[i][0] + " max: " + minMax[i][1], + expectedTypes[i], source.type()); + assertEquals(minMax[i][0], source.getInt(0)); + assertEquals(minMax[i][1], source.getInt(1)); + ValuesEnum iEnum = getEnum(r); + assertEquals(i + " with min: " + minMax[i][0] + 
" max: " + minMax[i][1], + expectedTypes[i], iEnum.type()); + assertEquals(0, iEnum.nextDoc()); + assertEquals(minMax[i][0], iEnum.intsRef.get()); + assertEquals(1, iEnum.nextDoc()); + assertEquals(minMax[i][1], iEnum.intsRef.get()); + assertEquals(ValuesEnum.NO_MORE_DOCS, iEnum.nextDoc()); - IndexDocValues r = Ints.getValues(dir, "test", false); - for (int iter = 0; iter < 2; iter++) { - Source s = getSource(r); - for (int i = 0; i < NUM_VALUES; i++) { - final long v = s.getInt(i); - assertEquals("index " + i, values[i], v); - } - } + r.close(); + dir.close(); + } + } + + public void testVInts() throws IOException { + testInts(ValueType.VAR_INTS, 63); + } + + public void testFixedInts() throws IOException { + testInts(ValueType.FIXED_INTS_64, 63); + testInts(ValueType.FIXED_INTS_32, 31); + testInts(ValueType.FIXED_INTS_16, 15); + testInts(ValueType.FIXED_INTS_8, 7); - for (int iter = 0; iter < 2; iter++) { - ValuesEnum iEnum = getEnum(r); - LongsRef ints = iEnum.getInt(); - for (int i = 0; i < NUM_VALUES + additionalDocs; i++) { - assertEquals(i, iEnum.nextDoc()); - if (i < NUM_VALUES) { - assertEquals(values[i], ints.get()); - } else { - assertEquals(0, ints.get()); - } - } - assertEquals(ValuesEnum.NO_MORE_DOCS, iEnum.nextDoc()); - iEnum.close(); - } + } + + public void testGetInt8Array() throws IOException { + byte[] sourceArray = new byte[] {1,2,3}; + Directory dir = newDirectory(); + final AtomicLong trackBytes = new AtomicLong(0); + Writer w = Ints.getWriter(dir, "test", trackBytes, ValueType.FIXED_INTS_8); + for (int i = 0; i < sourceArray.length; i++) { + w.add(i, (long) sourceArray[i]); + } + w.finish(sourceArray.length); + IndexDocValues r = Ints.getValues(dir, "test"); + Source source = r.getSource(); + assertTrue(source.hasArray()); + byte[] loaded = ((byte[])source.getArray()); + assertEquals(loaded.length, sourceArray.length); + for (int i = 0; i < loaded.length; i++) { + assertEquals("value didn't match at index " + i, sourceArray[i], loaded[i]); + 
} + r.close(); + dir.close(); + } + + public void testGetInt16Array() throws IOException { + short[] sourceArray = new short[] {1,2,3}; + Directory dir = newDirectory(); + final AtomicLong trackBytes = new AtomicLong(0); + Writer w = Ints.getWriter(dir, "test", trackBytes, ValueType.FIXED_INTS_16); + for (int i = 0; i < sourceArray.length; i++) { + w.add(i, (long) sourceArray[i]); + } + w.finish(sourceArray.length); + IndexDocValues r = Ints.getValues(dir, "test"); + Source source = r.getSource(); + assertTrue(source.hasArray()); + short[] loaded = ((short[])source.getArray()); + assertEquals(loaded.length, sourceArray.length); + for (int i = 0; i < loaded.length; i++) { + assertEquals("value didn't match at index " + i, sourceArray[i], loaded[i]); + } + r.close(); + dir.close(); + } + + public void testGetInt64Array() throws IOException { + long[] sourceArray = new long[] {1,2,3}; + Directory dir = newDirectory(); + final AtomicLong trackBytes = new AtomicLong(0); + Writer w = Ints.getWriter(dir, "test", trackBytes, ValueType.FIXED_INTS_64); + for (int i = 0; i < sourceArray.length; i++) { + w.add(i, sourceArray[i]); + } + w.finish(sourceArray.length); + IndexDocValues r = Ints.getValues(dir, "test"); + Source source = r.getSource(); + assertTrue(source.hasArray()); + long[] loaded = ((long[])source.getArray()); + assertEquals(loaded.length, sourceArray.length); + for (int i = 0; i < loaded.length; i++) { + assertEquals("value didn't match at index " + i, sourceArray[i], loaded[i]); + } + r.close(); + dir.close(); + } + + public void testGetInt32Array() throws IOException { + int[] sourceArray = new int[] {1,2,3}; + Directory dir = newDirectory(); + final AtomicLong trackBytes = new AtomicLong(0); + Writer w = Ints.getWriter(dir, "test", trackBytes, ValueType.FIXED_INTS_32); + for (int i = 0; i < sourceArray.length; i++) { + w.add(i, (long) sourceArray[i]); + } + w.finish(sourceArray.length); + IndexDocValues r = Ints.getValues(dir, "test"); + Source source = 
r.getSource(); + assertTrue(source.hasArray()); + int[] loaded = ((int[])source.getArray()); + assertEquals(loaded.length, sourceArray.length); + for (int i = 0; i < loaded.length; i++) { + assertEquals("value didn't match at index " + i, sourceArray[i], loaded[i]); + } + r.close(); + dir.close(); + } + + public void testGetFloat32Array() throws IOException { + float[] sourceArray = new float[] {1,2,3}; + Directory dir = newDirectory(); + final AtomicLong trackBytes = new AtomicLong(0); + Writer w = Floats.getWriter(dir, "test", 4, trackBytes); + for (int i = 0; i < sourceArray.length; i++) { + w.add(i, sourceArray[i]); + } + w.finish(sourceArray.length); + IndexDocValues r = Floats.getValues(dir, "test", 3); + Source source = r.getSource(); + assertTrue(source.hasArray()); + float[] loaded = ((float[])source.getArray()); + assertEquals(loaded.length, sourceArray.length); + for (int i = 0; i < loaded.length; i++) { + assertEquals("value didn't match at index " + i, sourceArray[i], loaded[i], 0.0f); + } + r.close(); + dir.close(); + } + + public void testGetFloat64Array() throws IOException { + double[] sourceArray = new double[] {1,2,3}; + Directory dir = newDirectory(); + final AtomicLong trackBytes = new AtomicLong(0); + Writer w = Floats.getWriter(dir, "test", 8, trackBytes); + for (int i = 0; i < sourceArray.length; i++) { + w.add(i, sourceArray[i]); + } + w.finish(sourceArray.length); + IndexDocValues r = Floats.getValues(dir, "test", 3); + Source source = r.getSource(); + assertTrue(source.hasArray()); + double[] loaded = ((double[])source.getArray()); + assertEquals(loaded.length, sourceArray.length); + for (int i = 0; i < loaded.length; i++) { + assertEquals("value didn't match at index " + i, sourceArray[i], loaded[i], 0.0d); + } + r.close(); + dir.close(); + } - for (int iter = 0; iter < 2; iter++) { - ValuesEnum iEnum = getEnum(r); - LongsRef ints = iEnum.getInt(); - for (int i = 0; i < NUM_VALUES + additionalDocs; i += 1 + random.nextInt(25)) { - 
assertEquals(i, iEnum.advance(i)); - if (i < NUM_VALUES) { - assertEquals(values[i], ints.get()); - } else { - assertEquals(0, ints.get()); - } - } - assertEquals(ValuesEnum.NO_MORE_DOCS, iEnum.advance(NUM_VALUES + additionalDocs)); - iEnum.close(); - } - r.close(); - dir.close(); + private void testInts(ValueType type, int maxBit) throws IOException { + long maxV = 1; + final int NUM_VALUES = 333 + random.nextInt(333); + final long[] values = new long[NUM_VALUES]; + for (int rx = 1; rx < maxBit; rx++, maxV *= 2) { + Directory dir = newDirectory(); + final AtomicLong trackBytes = new AtomicLong(0); + Writer w = Ints.getWriter(dir, "test", trackBytes, type); + for (int i = 0; i < NUM_VALUES; i++) { + final long v = random.nextLong() % (1 + maxV); + values[i] = v; + w.add(i, v); } + final int additionalDocs = 1 + random.nextInt(9); + w.finish(NUM_VALUES + additionalDocs); + assertEquals(0, trackBytes.get()); + + IndexDocValues r = Ints.getValues(dir, "test"); + for (int iter = 0; iter < 2; iter++) { + Source s = getSource(r); + assertEquals(type, s.type()); + for (int i = 0; i < NUM_VALUES; i++) { + final long v = s.getInt(i); + assertEquals("index " + i, values[i], v); + } + } + + for (int iter = 0; iter < 2; iter++) { + ValuesEnum iEnum = getEnum(r); + assertEquals(type, iEnum.type()); + LongsRef ints = iEnum.getInt(); + for (int i = 0; i < NUM_VALUES + additionalDocs; i++) { + assertEquals(i, iEnum.nextDoc()); + if (i < NUM_VALUES) { + assertEquals(values[i], ints.get()); + } else { + assertEquals(0, ints.get()); + } + } + assertEquals(ValuesEnum.NO_MORE_DOCS, iEnum.nextDoc()); + iEnum.close(); + } + + for (int iter = 0; iter < 2; iter++) { + ValuesEnum iEnum = getEnum(r); + assertEquals(type, iEnum.type()); + LongsRef ints = iEnum.getInt(); + for (int i = 0; i < NUM_VALUES + additionalDocs; i += 1 + random.nextInt(25)) { + assertEquals(i, iEnum.advance(i)); + if (i < NUM_VALUES) { + assertEquals(values[i], ints.get()); + } else { + assertEquals(0, ints.get()); + 
} + } + assertEquals(ValuesEnum.NO_MORE_DOCS, iEnum.advance(NUM_VALUES + additionalDocs)); + iEnum.close(); + } + r.close(); + dir.close(); } } diff --git a/lucene/src/test/org/apache/lucene/index/values/TestDocValuesIndexing.java b/lucene/src/test/org/apache/lucene/index/values/TestDocValuesIndexing.java index 325336d1e64..54a04be0e98 100644 --- a/lucene/src/test/org/apache/lucene/index/values/TestDocValuesIndexing.java +++ b/lucene/src/test/org/apache/lucene/index/values/TestDocValuesIndexing.java @@ -113,44 +113,20 @@ public class TestDocValuesIndexing extends LuceneTestCase { dir.close(); } - /** - * Tests complete indexing of {@link ValueType} including deletions, merging and - * sparse value fields on Compound-File - */ - public void testIndexBytesNoDeletesCFS() throws IOException { - runTestIndexBytes(writerConfig(true), false); - } - - public void testIndexBytesDeletesCFS() throws IOException { - runTestIndexBytes(writerConfig(true), true); - } - - public void testIndexNumericsNoDeletesCFS() throws IOException { - runTestNumerics(writerConfig(true), false); - } - - public void testIndexNumericsDeletesCFS() throws IOException { - runTestNumerics(writerConfig(true), true); - } - - /** - * Tests complete indexing of {@link ValueType} including deletions, merging and - * sparse value fields on None-Compound-File - */ public void testIndexBytesNoDeletes() throws IOException { - runTestIndexBytes(writerConfig(false), false); + runTestIndexBytes(writerConfig(random.nextBoolean()), false); } public void testIndexBytesDeletes() throws IOException { - runTestIndexBytes(writerConfig(false), true); + runTestIndexBytes(writerConfig(random.nextBoolean()), true); } public void testIndexNumericsNoDeletes() throws IOException { - runTestNumerics(writerConfig(false), false); + runTestNumerics(writerConfig(random.nextBoolean()), false); } public void testIndexNumericsDeletes() throws IOException { - runTestNumerics(writerConfig(false), true); + 
runTestNumerics(writerConfig(random.nextBoolean()), true); } public void testAddIndexes() throws IOException { @@ -204,7 +180,11 @@ public class TestDocValuesIndexing extends LuceneTestCase { case BYTES_VAR_STRAIGHT: case FLOAT_32: case FLOAT_64: - case INTS: + case VAR_INTS: + case FIXED_INTS_16: + case FIXED_INTS_32: + case FIXED_INTS_64: + case FIXED_INTS_8: assertEquals(msg, valuesPerIndex-1, vE_2_merged.advance(valuesPerIndex-1)); } @@ -246,7 +226,7 @@ public class TestDocValuesIndexing extends LuceneTestCase { throws IOException { Directory d = newDirectory(); IndexWriter w = new IndexWriter(d, cfg); - final int numValues = 179 + random.nextInt(151); + final int numValues = 50 + atLeast(10); final List numVariantList = new ArrayList(NUMERICS); // run in random order to test if fill works correctly during merges @@ -258,8 +238,16 @@ public class TestDocValuesIndexing extends LuceneTestCase { IndexReader r = IndexReader.open(w, true); final int numRemainingValues = (int) (numValues - deleted.cardinality()); final int base = r.numDocs() - numRemainingValues; + // for FIXED_INTS_8 we use value mod 128 - to enable testing in + // one go we simply use numValues as the mod for all other INT types + int mod = numValues; switch (val) { - case INTS: { + case FIXED_INTS_8: + mod = 128; + case FIXED_INTS_16: + case FIXED_INTS_32: + case FIXED_INTS_64: + case VAR_INTS: { IndexDocValues intsReader = getDocValues(r, val.name()); assertNotNull(intsReader); @@ -283,8 +271,8 @@ public class TestDocValuesIndexing extends LuceneTestCase { } assertEquals("advance failed at index: " + i + " of " + r.numDocs() + " docs", i, intsEnum.advance(i)); - assertEquals(expected, ints.getInt(i)); - assertEquals(expected, enumRef.get()); + assertEquals(val + "" + mod + " " + i, expected%mod, ints.getInt(i)); + assertEquals(expected%mod, enumRef.get()); } } @@ -338,11 +326,11 @@ public class TestDocValuesIndexing extends LuceneTestCase { final List byteVariantList = new ArrayList(BYTES); // 
run in random order to test if fill works correctly during merges Collections.shuffle(byteVariantList, random); - final int numValues = 179 + random.nextInt(151); + final int numValues = 50 + atLeast(10); for (ValueType byteIndexValue : byteVariantList) { List closeables = new ArrayList(); - int bytesSize = 1 + random.nextInt(128); + int bytesSize = 1 + atLeast(10); OpenBitSet deleted = indexValues(w, numValues, byteIndexValue, byteVariantList, withDeletions, bytesSize); final IndexReader r = IndexReader.open(w, withDeletions); @@ -485,8 +473,12 @@ public class TestDocValuesIndexing extends LuceneTestCase { ValueType.BYTES_FIXED_SORTED, ValueType.BYTES_FIXED_STRAIGHT, ValueType.BYTES_VAR_DEREF, ValueType.BYTES_VAR_SORTED, ValueType.BYTES_VAR_STRAIGHT); - private static EnumSet NUMERICS = EnumSet.of(ValueType.INTS, - ValueType.FLOAT_32, ValueType.FLOAT_64); + private static EnumSet NUMERICS = EnumSet.of(ValueType.VAR_INTS, + ValueType.FIXED_INTS_16, ValueType.FIXED_INTS_32, + ValueType.FIXED_INTS_64, + ValueType.FIXED_INTS_8, + ValueType.FLOAT_32, + ValueType.FLOAT_64); private static Index[] IDX_VALUES = new Index[] { Index.ANALYZED, Index.ANALYZED_NO_NORMS, Index.NOT_ANALYZED, Index.NOT_ANALYZED_NO_NORMS, @@ -517,8 +509,20 @@ public class TestDocValuesIndexing extends LuceneTestCase { for (int i = 0; i < numValues; i++) { if (isNumeric) { switch (value) { - case INTS: - valField.setInt(i); + case VAR_INTS: + valField.setInt((long)i); + break; + case FIXED_INTS_16: + valField.setInt((short)i, random.nextInt(10) != 0); + break; + case FIXED_INTS_32: + valField.setInt(i, random.nextInt(10) != 0); + break; + case FIXED_INTS_64: + valField.setInt((long)i, random.nextInt(10) != 0); + break; + case FIXED_INTS_8: + valField.setInt((byte)(0xFF & (i % 128)), random.nextInt(10) != 0); break; case FLOAT_32: valField.setFloat(2.0f * i); @@ -526,6 +530,7 @@ public class TestDocValuesIndexing extends LuceneTestCase { case FLOAT_64: valField.setFloat(2.0d * i); break; + default: 
fail("unexpected value " + value); } diff --git a/lucene/src/test/org/apache/lucene/search/TestSort.java b/lucene/src/test/org/apache/lucene/search/TestSort.java index 30a044861a1..9e355d2fbb6 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSort.java +++ b/lucene/src/test/org/apache/lucene/search/TestSort.java @@ -124,7 +124,7 @@ public class TestSort extends LuceneTestCase { doc.add (new Field ("contents", data[i][1], Field.Store.NO, Field.Index.ANALYZED)); if (data[i][2] != null) { Field f = supportsDocValues ? - IndexDocValuesField.set(new Field ("int", data[i][2], Field.Store.NO, Field.Index.NOT_ANALYZED), ValueType.INTS) + IndexDocValuesField.set(new Field ("int", data[i][2], Field.Store.NO, Field.Index.NOT_ANALYZED), ValueType.VAR_INTS) : new Field ("int", data[i][2], Field.Store.NO, Field.Index.NOT_ANALYZED); doc.add(f); }