LUCENE-5084: EliasFanoDocIdSet.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1501576 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2013-07-09 21:49:04 +00:00
parent 4617b8da89
commit f3ca2faaf9
6 changed files with 1233 additions and 0 deletions

View File

@ -298,6 +298,9 @@ New Features
* LUCENE-5013: Added ScandinavianFoldingFilterFactory and
ScandinavianNormalizationFilterFactory (Karl Wettin via janhoy)
* LUCENE-5084: Added new Elias-Fano encoder, decoder and DocIdSet
implementations. (Paul Elschot via Adrien Grand)
API Changes
* LUCENE-5077: Make it easier to use compressed norms. Lucene42NormsFormat takes

View File

@ -0,0 +1,387 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.packed;
/** A decoder for an {@link EliasFanoEncoder}.
* @lucene.internal
*/
public class EliasFanoDecoder {
private static final int LOG2_LONG_SIZE = Long.numberOfTrailingZeros(Long.SIZE);
private final EliasFanoEncoder efEncoder;
final long numEncoded;
private long efIndex = -1; // the decoding index.
private long setBitForIndex = -1; // the index of the high bit at the decoding index.
public final static long NO_MORE_VALUES = -1L;
/** Construct a decoder for a given {@link EliasFanoEncoder}.
* The decoding index is set to just before the first encoded value.
*/
public EliasFanoDecoder(EliasFanoEncoder efEncoder) {
this.efEncoder = efEncoder;
this.numEncoded = efEncoder.numEncoded; // numEncoded is not final in EliasFanoEncoder
}
/** @return The Elias-Fano encoder that is decoded. */
public EliasFanoEncoder getEliasFanoEncoder() {
return efEncoder;
}
/** @return The index of the last decoded value.
* The first value encoded by {@link EliasFanoEncoder#encodeNext} has index 0.
* Only valid directly after
* {@link #nextValue}, {@link #advanceToValue},
* {@link #previousValue}, or {@link #backToValue}
* returned another value than {@link #NO_MORE_VALUES}.
*/
public long index() {
if (efIndex < 0) {
throw new IllegalStateException("index before sequence");
}
if (efIndex >= numEncoded) {
throw new IllegalStateException("index after sequence");
}
return efIndex;
}
/** @return The high value for the current decoding index. */
private long currentHighValue() {
return setBitForIndex - efIndex; // sequence of unary gaps
}
/** @return The low value for the current decoding index. */
private long currentLowValue() {
assert efIndex >= 0;
assert efIndex < numEncoded;
if (efEncoder.numLowBits == 0) {
return 0;
}
long bitPos = efIndex * efEncoder.numLowBits;
int lowIndex = (int) (bitPos >>> LOG2_LONG_SIZE);
int bitPosAtIndex = (int) (bitPos & (Long.SIZE-1));
long lowValue = efEncoder.lowerLongs[lowIndex] >>> bitPosAtIndex;
if ((bitPosAtIndex + efEncoder.numLowBits) > Long.SIZE) {
lowValue |= (efEncoder.lowerLongs[lowIndex + 1] << (Long.SIZE - bitPosAtIndex));
}
lowValue &= efEncoder.lowerBitsMask;
return lowValue;
}
/** @return The given highValue shifted left by the number of low bits from by the EliasFanoSequence,
* logically OR-ed with the given lowValue.
*/
private long combineHighLowValues(long highValue, long lowValue) {
return (highValue << efEncoder.numLowBits) | lowValue;
}
private long curHighLong;
/* The implementation of forward decoding and backward decoding is done by the following method pairs.
*
* toBeforeSequence - toAfterSequence
* getCurrentRightShift - getCurrentLeftShift
* toAfterCurrentHighBit - toBeforeCurrentHighBit
* toNextHighLong - toPreviousHighLong
* nextHighValue - previousHighValue
* nextValue - previousValue
* advanceToValue - backToValue
*
*/
/* Forward decoding section */
/** Set the decoding index to just before the first encoded value.
*/
public void toBeforeSequence() {
efIndex = -1;
setBitForIndex = -1;
}
/** @return the number of bits in a long after (setBitForIndex modulo Long.SIZE) */
private int getCurrentRightShift() {
int s = (int) (setBitForIndex & (Long.SIZE-1));
return s;
}
/** Increment efIndex and setBitForIndex and
* shift curHighLong so that it does not contain the high bits before setBitForIndex.
* @return true iff efIndex still smaller than numEncoded.
*/
private boolean toAfterCurrentHighBit() {
efIndex += 1;
if (efIndex >= numEncoded) {
return false;
}
setBitForIndex += 1;
int highIndex = (int)(setBitForIndex >>> LOG2_LONG_SIZE);
curHighLong = efEncoder.upperLongs[highIndex] >>> getCurrentRightShift();
return true;
}
/** The current high long has been determined to not contain the set bit that is needed.
* Increment setBitForIndex to the next high long and set curHighLong accordingly.
*/
private void toNextHighLong() {
setBitForIndex += Long.SIZE - (setBitForIndex & (Long.SIZE-1));
//assert getCurrentRightShift() == 0;
int highIndex = (int)(setBitForIndex >>> LOG2_LONG_SIZE);
curHighLong = efEncoder.upperLongs[highIndex];
}
/** setBitForIndex and efIndex have just been incremented, scan to the next high set bit
* by incrementing setBitForIndex, and by setting curHighLong accordingly.
*/
private void toNextHighValue() {
while (curHighLong == 0L) {
toNextHighLong(); // inlining and unrolling would simplify somewhat
}
setBitForIndex += Long.numberOfTrailingZeros(curHighLong);
}
/** setBitForIndex and efIndex have just been incremented, scan to the next high set bit
* by incrementing setBitForIndex, and by setting curHighLong accordingly.
* @return the next encoded high value.
*/
private long nextHighValue() {
toNextHighValue();
return currentHighValue();
}
/** If another value is available after the current decoding index, return this value and
* and increase the decoding index by 1. Otherwise return {@link #NO_MORE_VALUES}.
*/
public long nextValue() {
if (! toAfterCurrentHighBit()) {
return NO_MORE_VALUES;
}
long highValue = nextHighValue();
return combineHighLowValues(highValue, currentLowValue());
}
/** Advance the decoding index to a given index.
* and return <code>true</code> iff it is available.
*/
public boolean advanceToIndex(long index) {
assert index > efIndex;
if (index >= numEncoded) {
efIndex = numEncoded;
return false;
}
if (! toAfterCurrentHighBit()) {
assert false;
}
int curSetBits = Long.bitCount(curHighLong);
while ((efIndex + curSetBits) < index) { // curHighLong has not enough set bits to reach index
efIndex += curSetBits;
toNextHighLong();
curSetBits = Long.bitCount(curHighLong);
}
// curHighLong has enough set bits to reach index
while (efIndex < index) {
/* CHECKME: Instead of the linear search here, use (forward) broadword selection from
* "Broadword Implementation of Rank/Select Queries", Sebastiano Vigna, January 30, 2012.
*/
if (! toAfterCurrentHighBit()) {
assert false;
}
toNextHighValue();
}
return true;
}
/** setBitForIndex and efIndex have just been incremented, scan forward to the high set bit
* of at least a given high value
* by incrementing setBitForIndex, and by setting curHighLong accordingly.
* @return the smallest encoded high value that is at least the given one.
*/
private long advanceToHighValue(long highTarget) {
int curSetBits = Long.bitCount(curHighLong); // is shifted by getCurrentRightShift()
int curClearBits = Long.SIZE - curSetBits - getCurrentRightShift();
while ((currentHighValue() + curClearBits) < highTarget) {
// curHighLong has not enough clear bits to reach highTarget
efIndex += curSetBits;
if (efIndex >= numEncoded) {
return NO_MORE_VALUES;
}
toNextHighLong();
// assert getCurrentRightShift() == 0;
curSetBits = Long.bitCount(curHighLong);
curClearBits = Long.SIZE - curSetBits;
}
// curHighLong has enough clear bits to reach highTarget, but may not have enough set bits.
long highValue = nextHighValue();
while (highValue < highTarget) {
/* CHECKME: Instead of the linear search here, use (forward) broadword selection from
* "Broadword Implementation of Rank/Select Queries", Sebastiano Vigna, January 30, 2012.
*/
if (! toAfterCurrentHighBit()) {
return NO_MORE_VALUES;
}
highValue = nextHighValue();
}
return highValue;
}
/** Given a target value, advance the decoding index to the first bigger or equal value
* and return it if it is available. Otherwise return {@link #NO_MORE_VALUES}.
*/
public long advanceToValue(long target) {
if (! toAfterCurrentHighBit()) {
return NO_MORE_VALUES;
}
long highTarget = target >>> efEncoder.numLowBits;
long highValue = advanceToHighValue(highTarget);
if (highValue == NO_MORE_VALUES) {
return NO_MORE_VALUES;
}
// Linear search with low values:
long currentValue = combineHighLowValues(highValue, currentLowValue());
while (currentValue < target) {
currentValue = nextValue();
if (currentValue == NO_MORE_VALUES) {
return NO_MORE_VALUES;
}
}
return currentValue;
}
/* Backward decoding section */
/** Set the decoding index to just after the last encoded value.
*/
public void toAfterSequence() {
efIndex = numEncoded; // just after last index
setBitForIndex = (efEncoder.lastEncoded >>> efEncoder.numLowBits) + numEncoded;
}
/** @return the number of bits in a long before (setBitForIndex modulo Long.SIZE) */
private int getCurrentLeftShift() {
int s = Long.SIZE - 1 - (int) (setBitForIndex & (Long.SIZE-1));
return s;
}
/** Decrement efindex and setBitForIndex and
* shift curHighLong so that it does not contain the high bits after setBitForIndex.
* @return true iff efindex still >= 0
*/
private boolean toBeforeCurrentHighBit() {
efIndex -= 1;
if (efIndex < 0) {
return false;
}
setBitForIndex -= 1;
int highIndex = (int)(setBitForIndex >>> LOG2_LONG_SIZE);
curHighLong = efEncoder.upperLongs[highIndex] << getCurrentLeftShift();
return true;
}
/** The current high long has been determined to not contain the set bit that is needed.
* Decrement setBitForIndex to the previous high long and set curHighLong accordingly.
*/
private void toPreviousHighLong() {
setBitForIndex -= (setBitForIndex & (Long.SIZE-1)) + 1;
//assert getCurrentLeftShift() == 0;
int highIndex = (int)(setBitForIndex >>> LOG2_LONG_SIZE);
curHighLong = efEncoder.upperLongs[highIndex];
}
/** setBitForIndex and efIndex have just been decremented, scan to the previous high set bit
* by decrementing setBitForIndex and by setting curHighLong accordingly.
* @return the previous encoded high value.
*/
private long previousHighValue() {
while (curHighLong == 0L) {
toPreviousHighLong(); // inlining and unrolling would simplify somewhat
}
setBitForIndex -= Long.numberOfLeadingZeros(curHighLong);
return currentHighValue();
}
/** If another value is available before the current decoding index, return this value and
* and decrease the decoding index by 1. Otherwise return {@link #NO_MORE_VALUES}.
*/
public long previousValue() {
if (! toBeforeCurrentHighBit()) {
return NO_MORE_VALUES;
}
long highValue = previousHighValue();
return combineHighLowValues(highValue, currentLowValue());
}
/** setBitForIndex and efIndex have just been decremented, scan backward to the high set bit
* of at most a given high value
* by decrementing setBitForIndex and by setting curHighLong accordingly.
* @return the largest encoded high value that is at most the given one.
*/
private long backToHighValue(long highTarget) {
int curSetBits = Long.bitCount(curHighLong); // is shifted by getCurrentLeftShift()
int curClearBits = Long.SIZE - curSetBits - getCurrentLeftShift();
while ((currentHighValue() - curClearBits) > highTarget) {
// curHighLong has not enough clear bits to reach highTarget
efIndex -= curSetBits;
if (efIndex < 0) {
return NO_MORE_VALUES;
}
toPreviousHighLong();
//assert getCurrentLeftShift() == 0;
curSetBits = Long.bitCount(curHighLong);
curClearBits = Long.SIZE - curSetBits;
}
// curHighLong has enough clear bits to reach highTarget, but may not have enough set bits.
long highValue = previousHighValue();
while (highValue > highTarget) {
/* CHECKME: See at advanceToHighValue. */
if (! toBeforeCurrentHighBit()) {
return NO_MORE_VALUES;
}
highValue = previousHighValue();
}
return highValue;
}
/** Given a target value, go back to the first smaller or equal value
* and return it if it is available. Otherwise return {@link #NO_MORE_VALUES}.
*/
public long backToValue(long target) {
if (! toBeforeCurrentHighBit()) {
return NO_MORE_VALUES;
}
long highTarget = target >>> efEncoder.numLowBits;
long highValue = backToHighValue(highTarget);
if (highValue == NO_MORE_VALUES) {
return NO_MORE_VALUES;
}
// Linear search with low values:
long currentValue = combineHighLowValues(highValue, currentLowValue());
while (currentValue > target) {
currentValue = previousValue();
if (currentValue == NO_MORE_VALUES) {
return NO_MORE_VALUES;
}
}
return currentValue;
}
}

View File

@ -0,0 +1,111 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.packed;
import java.io.IOException;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
/** A DocIdSet in Elias-Fano encoding.
* @lucene.internal
*/
public class EliasFanoDocIdSet extends DocIdSet {
final EliasFanoEncoder efEncoder;
/*
* Construct an EliasFanoDocIdSet.
* @param numValues The number of values that can be encoded.
* @param upperBound At least the highest value that will be encoded.
*/
public EliasFanoDocIdSet(long numValues, long upperBound) {
efEncoder = new EliasFanoEncoder(numValues, upperBound);
}
public void encodeFromDisi(DocIdSetIterator disi) throws IOException {
while (efEncoder.numEncoded < efEncoder.numValues) {
int x = disi.nextDoc();
if (x == DocIdSetIterator.NO_MORE_DOCS) {
throw new IllegalArgumentException("disi: " + disi.toString()
+ "\nhas " + efEncoder.numEncoded
+ " docs, but at least " + efEncoder.numValues + " are required.");
}
efEncoder.encodeNext(x);
}
}
/**
* Provides a {@link DocIdSetIterator} to access encoded document ids.
*/
@Override
public DocIdSetIterator iterator() {
if (efEncoder.lastEncoded >= DocIdSetIterator.NO_MORE_DOCS) {
throw new UnsupportedOperationException(
"Highest encoded value too high for DocIdSetIterator.NO_MORE_DOCS: " + efEncoder.lastEncoded);
}
return new DocIdSetIterator() {
private int curDocId = -1;
private final EliasFanoDecoder efDecoder = efEncoder.getDecoder();
@Override
public int docID() {
return curDocId;
}
private int setCurDocID(long nextValue) {
curDocId = (nextValue == EliasFanoDecoder.NO_MORE_VALUES)
? NO_MORE_DOCS
: (int) nextValue;
return curDocId;
}
@Override
public int nextDoc() {
return setCurDocID(efDecoder.nextValue());
}
@Override
public int advance(int target) {
return setCurDocID(efDecoder.advanceToValue(target));
}
@Override
public long cost() {
return efDecoder.numEncoded;
}
};
}
/** This DocIdSet implementation is cacheable. @return <code>true</code> */
@Override
public boolean isCacheable() {
return true;
}
@Override
public boolean equals(Object other) {
return ((other instanceof EliasFanoDocIdSet))
&& efEncoder.equals(((EliasFanoDocIdSet) other).efEncoder);
}
@Override
public int hashCode() {
return efEncoder.hashCode() ^ getClass().hashCode();
}
}

View File

@ -0,0 +1,295 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.packed;
import java.util.Arrays;
import org.apache.lucene.util.FixedBitSet; // for javadocs
/** Encode a non decreasing sequence of non negative whole numbers in the Elias-Fano encoding
* that was introduced in the 1970's by Peter Elias and Robert Fano.
* <p>
* The Elias-Fano encoding is a high bits / low bits representation of
* a monotonically increasing sequence of <code>numValues > 0</code> natural numbers <code>x[i]</code>
* <p>
* <code>0 <= x[0] <= x[1] <= ... <= x[numValues-2] <= x[numValues-1] <= upperBound</code>
* <p>
* where <code>upperBound > 0</code> is an upper bound on the last value.
* <br>
* The Elias-Fano encoding uses less than half a bit per encoded number more
* than the smallest representation
* that can encode any monotone sequence with the same bounds.
* <p>
* The lower <code>L</code> bits of each <code>x[i]</code> are stored explicitly and contiguously
* in the lower-bits array, with <code>L</code> chosen as (<code>log()</code> base 2):
* <p>
* <code>L = max(0, floor(log(upperBound/numValues)))</code>
* <p>
* The upper bits are stored in the upper-bits array as a sequence of unary-coded gaps (<code>x[-1] = 0</code>):
* <p>
* <code>(x[i]/2**L) - (x[i-1]/2**L)</code>
* <p>
* The unary code encodes a natural number <code>n</code> by <code>n</code> 0 bits followed by a 1 bit:
* <code>0...01</code>. <br>
* In the upper bits the total the number of 1 bits is <code>numValues</code>
* and the total number of 0 bits is:<p>
* <code>floor(x[numValues-1]/2**L) <= upperBound/(2**max(0, floor(log(upperBound/numValues)))) <= 2*numValues</code>
* <p>
* The Elias-Fano encoding uses at most
* <p>
* <code>2 + ceil(log(upperBound/numValues))</code>
* <p>
* bits per encoded number. With <code>upperBound</code> in these bounds (<code>p</code> is an integer):
* <p>
* <code>2**p < x[numValues-1] <= upperBound <= 2**(p+1)</code>
* <p>
* the number of bits per encoded number is minimized.
* <p>
* In this implementation the values in the sequence can be given as <code>long</code>,
* <code>numValues = 0</code> and <code>upperBound = 0</code> are allowed,
* and each of the upper and lower bit arrays should fit in a <code>long[]</code>.
* <p>
* This implementation is based on this article:
* <br>
* Sebastiano Vigna, "Quasi Succinct Indices", June 19, 2012, sections 3 and 4.
* Retrieved from http://arxiv.org/pdf/1206.4300 .
*
* <p>The articles originally describing the Elias-Fano representation are:
* <br>Peter Elias, "Efficient storage and retrieval by content and address of static files",
* J. Assoc. Comput. Mach., 21(2):246260, 1974.
* <br>Robert M. Fano, "On the number of bits required to implement an associative memory",
* Memorandum 61, Computer Structures Group, Project MAC, MIT, Cambridge, Mass., 1971.
*
* @lucene.internal
*/
public class EliasFanoEncoder {
final long numValues;
private final long upperBound;
final int numLowBits;
final long lowerBitsMask;
final long[] upperLongs;
final long[] lowerLongs;
private static final int LOG2_LONG_SIZE = Long.numberOfTrailingZeros(Long.SIZE);
long numEncoded = 0L;
long lastEncoded = 0L;
/**
* Construct an Elias-Fano encoder.
* After construction, call {@link #encodeNext} <code>numValues</code> times to encode
* a non decreasing sequence of non negative numbers.
* @param numValues The number of values that is to be encoded.
* @param upperBound At least the highest value that will be encoded.
* For space efficiency this should not exceed the power of two that equals
* or is the first higher than the actual maximum.
* <br>When <code>numValues >= (upperBound/3)</code>
* a {@link FixedBitSet} will take less space.
* @throws IllegalArgumentException when:
* <ul>
* <li><code>numValues</code> is negative, or
* <li><code>numValues</code> is non negative and <code>upperBound</code> is negative, or
* <li>the low bits do not fit in a <code>long[]</code>:
* <code>(L * numValues / 64) > Integer.MAX_VALUE</code>, or
* <li>the high bits do not fit in a <code>long[]</code>:
* <code>(2 * numValues / 64) > Integer.MAX_VALUE</code>.
* </ul>
*/
public EliasFanoEncoder(long numValues, long upperBound) {
if (numValues < 0L) {
throw new IllegalArgumentException("numValues should not be negative: " + numValues);
}
this.numValues = numValues;
if ((numValues > 0L) && (upperBound < 0L)) {
throw new IllegalArgumentException("upperBound should not be negative: " + upperBound + " when numValues > 0");
}
this.upperBound = upperBound;
int nLowBits = 0;
if (this.numValues > 0) { // nLowBits = max(0; floor(2log(upperBound/numValues)))
long lowBitsFac = this.upperBound / this.numValues;
if (lowBitsFac > 0) {
nLowBits = 63 - Long.numberOfLeadingZeros(lowBitsFac); // see Long.numberOfLeadingZeros javadocs
}
}
this.numLowBits = nLowBits;
this.lowerBitsMask = Long.MAX_VALUE >>> (Long.SIZE - 1 - this.numLowBits);
long numLongsForLowBits = numLongsForBits(numValues * numLowBits);
if (numLongsForLowBits > Integer.MAX_VALUE) {
throw new IllegalArgumentException("numLongsForLowBits too large to index a long array: " + numLongsForLowBits);
}
this.lowerLongs = new long[(int) numLongsForLowBits];
long numHighBitsClear = ((this.upperBound > 0) ? this.upperBound : 0) >>> this.numLowBits;
assert numHighBitsClear <= (2 * this.numValues);
long numHighBitsSet = this.numValues;
long numLongsForHighBits = numLongsForBits(numHighBitsClear + numHighBitsSet);
if (numLongsForHighBits > Integer.MAX_VALUE) {
throw new IllegalArgumentException("numLongsForHighBits too large to index a long array: " + numLongsForHighBits);
}
this.upperLongs = new long[(int) numLongsForHighBits];
}
private static long numLongsForBits(long numBits) {
assert numBits >= 0 : numBits;
return (numBits + (Long.SIZE-1)) >>> LOG2_LONG_SIZE;
}
/** Call at most <code>numValues</code> times to encode a non decreasing sequence of non negative numbers.
* @param x The next number to be encoded.
* @throws IllegalArgumentException when:
* <ul>
* <li>called more than <code>numValues</code> times, or
* <li><code>x</code> is smaller than an earlier encoded value, or
* <li><code>x</code> is larger than <code>upperBound</code>.
* </ul>
*/
public void encodeNext(long x) {
if (numEncoded >= numValues) {
throw new IllegalStateException("encodeNext called more than " + numValues + " times.");
}
if (lastEncoded > x) {
throw new IllegalArgumentException(x + " smaller than previous " + lastEncoded);
}
if (x > upperBound) {
throw new IllegalArgumentException(x + " larger than upperBound " + upperBound);
}
encodeUpperBits(x >>> numLowBits);
encodeLowerBits(x & lowerBitsMask);
numEncoded++;
lastEncoded = x;
}
private void encodeUpperBits(long highValue) {
long nextHighBitNum = numEncoded + highValue; // sequence of unary gaps
upperLongs[(int)(nextHighBitNum >>> LOG2_LONG_SIZE)] |= (1L << (nextHighBitNum & (Long.SIZE-1)));
}
private void encodeLowerBits(long lowValue) {
packValue(lowValue, lowerLongs, numLowBits, numEncoded);
}
private static void packValue(long value, long[] longArray, int numBits, long packIndex) {
if (numBits != 0) {
long bitPos = numBits * packIndex;
int index = (int) (bitPos >>> LOG2_LONG_SIZE);
int bitPosAtIndex = (int) (bitPos & (Long.SIZE-1));
longArray[index] |= (value << bitPosAtIndex);
if ((bitPosAtIndex + numBits) > Long.SIZE) {
longArray[index+1] = (value >>> (Long.SIZE - bitPosAtIndex));
}
}
}
/** Provide an indication that is better to use an {@link EliasFanoEncoder} than a {@link FixedBitSet}
* to encode document identifiers.
* This indication is not precise and may change in the future.
* <br>An EliasFanoEncoder is favoured when the size of the encoding by the EliasFanoEncoder
* is at most 5/6 of the size of the FixedBitSet.
* <br>This condition is the same as comparing estimates of the number of bits accessed by a pair of FixedBitSets and
* by a pair of non indexed EliasFanoDocIdSets when determining the intersections of the pairs.
* @param numValues The number of document identifiers that is to be encoded. Should be non negative.
* @param upperBound The maximum possible value for a document identifier. Should be at least numValues.
*/
public static boolean sufficientlySmallerThanBitSet(long numValues, long upperBound) {
/* When (upperBound / 6) == numValues,
* the number of bits per entry for the EliasFanoEncoder is 2 + ceil(2log(upperBound/numValues)) == 5.
*/
/* For intersecting two bit sets upperBound bits are accessed, roughly half of one, half of the other.
* For intersecting two EliasFano sequences without index on the upper bits,
* all (2 * 3 * numValues) upper bits are accessed.
*/
return (upperBound / 6) > numValues;
}
/**
* @return An {@link EliasFanoDecoder} to access the encoded values.
* Perform all calls to {@link #encodeNext} before calling {@link #getDecoder}.
*/
public EliasFanoDecoder getDecoder() {
// decode as far as currently encoded as determined by numEncoded.
return new EliasFanoDecoder(this);
}
/** Expert. The low bits. */
public long[] getLowerBits() {
return lowerLongs;
}
/** Expert. The high bits. */
public long[] getUpperBits() {
return upperLongs;
}
@Override
public String toString() {
StringBuilder s = new StringBuilder("EliasFanoSequence");
s.append(" numValues " + numValues);
s.append(" numEncoded " + numEncoded);
s.append(" upperBound " + upperBound);
s.append(" lastEncoded " + lastEncoded);
s.append(" numLowBits " + numLowBits);
s.append("\nupperLongs[" + upperLongs.length + "]");
for (int i = 0; i < upperLongs.length; i++) {
s.append(" " + longHex(upperLongs[i]));
}
s.append("\nlowerLongs[" + lowerLongs.length + "]");
for (int i = 0; i < lowerLongs.length; i++) {
s.append(" " + longHex(lowerLongs[i]));
}
return s.toString();
}
@Override
public boolean equals(Object other) {
if (! (other instanceof EliasFanoEncoder)) {
return false;
}
EliasFanoEncoder oefs = (EliasFanoEncoder) other;
// no equality needed for upperBound
return (this.numValues == oefs.numValues)
&& (this.numEncoded == oefs.numEncoded)
&& (this.numLowBits == oefs.numLowBits)
&& Arrays.equals(this.upperLongs, oefs.upperLongs)
&& Arrays.equals(this.lowerLongs, oefs.lowerLongs);
}
@Override
public int hashCode() {
int h = ((int) (numValues + numEncoded))
^ numLowBits
^ Arrays.hashCode(upperLongs)
^ Arrays.hashCode(lowerLongs);
return h;
}
public static String longHex(long x) {
String hx = Long.toHexString(x);
StringBuilder sb = new StringBuilder("0x");
int l = 16 - hx.length();
while (l > 0) {
sb.append('0');
l--;
}
sb.append(hx);
return sb.toString();
}
}

View File

@ -0,0 +1,156 @@
package org.apache.lucene.util.packed;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.LuceneTestCase;
public class TestEliasFanoDocIdSet extends LuceneTestCase {
private static DocIdSetIterator makeDisi(final int[] docIds) {
class IntArrayDisi extends DocIdSetIterator {
int i = 0;
int docId = -1;
@Override
public int docID() {
return docId;
}
@Override
public int nextDoc() {
if (i >= docIds.length) {
docId = NO_MORE_DOCS;
return docId;
}
if (docIds[i] < docId) { // Elias-Fano sequence should be non decreasing.
// The non decreasing condition for Elias-Fano is weaker than normal increasing for DocIdSetIterator
throw new AssertionError("docIds[] out of order");
}
docId = docIds[i++]; // increase i to just after current
return docId;
}
@Override
public int advance(int target) {
// ( ((i == 0) and (docId == -1)) or
// ((i > 0) and (docIds.length > 0) and (i <= docIds.length) and (docId == docIds[i-1])) )
// The behavior of this method is undefined when called with target current, or after the iterator has exhausted.
// Both cases may result in unpredicted behavior, and may throw an assertion error or an IOOBE here.
// So when nextDoc() or advance() were called earlier, the target should be bigger than current docId:
assert (docId == -1) || (docId < target);
// Do a binary search for the index j for which:
// ((j >= i)
// and ((j < docIds.length) implies (docIds[j] >= target))
// and ((j >= 1) implies (docIds[j-1] < target)) )
int j = docIds.length;
while (i < j) {
// ((0 <= i) and (i < j) and (j <= docIds.length)) so (docIds.length > 0)
int m = i + (j - i) / 2; // (i <= m) and (m < j); avoid overflow for (i + j)
if (docIds[m] < target) {
i = m + 1; // (docIds[i-1] < target) and (i <= j)
} else {
j = m; // (docIds[j] >= target) and (i <= j)
}
} // (i == j)
docId = (i >= docIds.length)
? NO_MORE_DOCS // exhausted
: docIds[i++]; // increase i to just after current
return docId;
}
@Override
public long cost() {
return docIds.length;
}
};
return new IntArrayDisi();
}
public void tstEqualDisisNext(DocIdSetIterator disi0, DocIdSetIterator disi1) throws IOException {
assertEquals(disi0.docID(), disi1.docID());
int d0 = disi0.nextDoc();
int d1 = disi1.nextDoc();
int i = 0;
while ((d0 != DocIdSetIterator.NO_MORE_DOCS) && (d1 != DocIdSetIterator.NO_MORE_DOCS)) {
assertEquals("index " + i, d0, d1);
i++;
d0 = disi0.nextDoc();
d1 = disi1.nextDoc();
}
assertEquals("at end", d0, d1);
}
public void tstEqualDisisAdvanceAsNext(DocIdSetIterator disi0, DocIdSetIterator disi1) throws IOException {
assertEquals(disi0.docID(), disi1.docID());
int d0 = disi0.advance(0);
int d1 = disi1.advance(0);
int i = 0;
while ((d0 != DocIdSetIterator.NO_MORE_DOCS) && (d1 != DocIdSetIterator.NO_MORE_DOCS)) {
assertEquals("index " + i, d0, d1);
i++;
d0 = disi0.advance(d1+1);
d1 = disi1.advance(d1+1);
}
assertEquals("at end disi0 " + disi0 + ", disi1 " + disi1, d0, d1);
}
public void tstEF(int[] docIds) {
int maxDoc = -1;
for (int docId: docIds) {
assert docId >= maxDoc; // non decreasing
maxDoc = docId;
}
try {
EliasFanoDocIdSet efd = new EliasFanoDocIdSet(docIds.length, maxDoc);
efd.encodeFromDisi(makeDisi(docIds));
tstEqualDisisNext( makeDisi(docIds), efd.iterator());
tstEqualDisisAdvanceAsNext(makeDisi(docIds), efd.iterator());
} catch (IOException ioe) {
throw new Error(ioe);
}
}
public void testEmpty() { tstEF(new int[] {}); }
public void testOneElementZero() { tstEF(new int[] {0}); }
public void testTwoElements() { tstEF(new int[] {0,1}); }
public void testOneElementOneBit() {
for (int i = 0; i < (Integer.SIZE-1); i++) {
tstEF(new int[] {1 << i});
}
}
public void testIncreasingSequences() {
final int TEST_NUMDOCS = 129;
int[] docIds = new int[TEST_NUMDOCS];
for (int f = 1; f <= 1025; f++) {
for (int i = 0; i < TEST_NUMDOCS; i++) {
docIds[i] = i*f;
}
tstEF(docIds);
}
}
}

View File

@ -0,0 +1,281 @@
package org.apache.lucene.util.packed;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.LuceneTestCase;
public class TestEliasFanoSequence extends LuceneTestCase {
private static EliasFanoEncoder makeEncoder(long[] values) {
long upperBound = -1L;
for (long value: values) {
assertTrue(value >= upperBound); // test data ok
upperBound = value;
}
EliasFanoEncoder efEncoder = new EliasFanoEncoder(values.length, upperBound);
for (long value: values) {
efEncoder.encodeNext(value);
}
return efEncoder;
}
private static void tstDecodeAllNext(long[] values, EliasFanoDecoder efd) {
efd.toBeforeSequence();
long nextValue = efd.nextValue();
for (long expValue: values) {
assertFalse("nextValue at end too early", EliasFanoDecoder.NO_MORE_VALUES == nextValue);
assertEquals(expValue, nextValue);
nextValue = efd.nextValue();
}
assertEquals(EliasFanoDecoder.NO_MORE_VALUES, nextValue);
}
private static void tstDecodeAllPrev(long[] values, EliasFanoDecoder efd) {
efd.toAfterSequence();
for (int i = values.length - 1; i >= 0; i--) {
long previousValue = efd.previousValue();
assertFalse("previousValue at end too early", EliasFanoDecoder.NO_MORE_VALUES == previousValue);
assertEquals(values[i], previousValue);
}
assertEquals(EliasFanoDecoder.NO_MORE_VALUES, efd.previousValue());
}
private static void tstDecodeAllAdvanceToExpected(long[] values, EliasFanoDecoder efd) {
efd.toBeforeSequence();
long previousValue = -1L;
long index = 0;
for (long expValue: values) {
if (expValue > previousValue) {
long advanceValue = efd.advanceToValue(expValue);
assertFalse("advanceValue at end too early", EliasFanoDecoder.NO_MORE_VALUES == advanceValue);
assertEquals(expValue, advanceValue);
assertEquals(index, efd.index());
previousValue = expValue;
}
index++;
}
long advanceValue = efd.advanceToValue(previousValue+1);
assertEquals(EliasFanoDecoder.NO_MORE_VALUES, advanceValue);
}
private static void tstDecodeAdvanceToMultiples(long[] values, EliasFanoDecoder efd, final long m) {
// test advancing to multiples of m
assert m > 0;
long previousValue = -1L;
long index = 0;
long mm = m;
efd.toBeforeSequence();
for (long expValue: values) {
// mm > previousValue
if (expValue >= mm) {
long advanceValue = efd.advanceToValue(mm);
assertFalse("advanceValue at end too early", EliasFanoDecoder.NO_MORE_VALUES == advanceValue);
assertEquals(expValue, advanceValue);
assertEquals(index, efd.index());
previousValue = expValue;
do {
mm += m;
} while (mm <= previousValue);
}
index++;
}
long advanceValue = efd.advanceToValue(mm);
assertEquals(EliasFanoDecoder.NO_MORE_VALUES, advanceValue);
}
private static void tstDecodeBackToMultiples(long[] values, EliasFanoDecoder efd, final long m) {
// test backing to multiples of m
assert m > 0;
efd.toAfterSequence();
int index = values.length - 1;
if (index < 0) {
long advanceValue = efd.backToValue(0);
assertEquals(EliasFanoDecoder.NO_MORE_VALUES, advanceValue);
return; // empty values, nothing to go back to/from
}
long expValue = values[index];
long previousValue = expValue + 1;
long mm = (expValue / m) * m;
while (index >= 0) {
expValue = values[index];
assert mm < previousValue;
if (expValue <= mm) {
long backValue = efd.backToValue(mm);
assertFalse("backToValue at end too early", EliasFanoDecoder.NO_MORE_VALUES == backValue);
assertEquals(expValue, backValue);
assertEquals(index, efd.index());
previousValue = expValue;
do {
mm -= m;
} while (mm >= previousValue);
}
index--;
}
long backValue = efd.backToValue(mm);
assertEquals(EliasFanoDecoder.NO_MORE_VALUES, backValue);
}
private static void tstEqual(String mes, long[] exp, long[] act) {
assertEquals(mes + ".length", exp.length, act.length);
for (int i = 0; i < exp.length; i++) {
if (exp[i] != act[i]) {
fail(mes + "[" + i + "] " + exp[i] + " != " + act[i]);
}
}
}
private static void tstDecodeAll(EliasFanoEncoder efEncoder, long[] values) {
tstDecodeAllNext(values, efEncoder.getDecoder());
tstDecodeAllPrev(values, efEncoder.getDecoder());
tstDecodeAllAdvanceToExpected(values, efEncoder.getDecoder());
}
private static void tstEFS(long[] values, long[] expHighLongs, long[] expLowLongs) {
EliasFanoEncoder efEncoder = makeEncoder(values);
tstEqual("upperBits", expHighLongs, efEncoder.getUpperBits());
tstEqual("lowerBits", expLowLongs, efEncoder.getLowerBits());
tstDecodeAll(efEncoder, values);
}
private static void tstEFS2(long[] values) {
EliasFanoEncoder efEncoder = makeEncoder(values);
tstDecodeAll(efEncoder, values);
}
private static void tstEFSadvanceToAndBackToMultiples(long[] values, long maxValue, long minAdvanceMultiple) {
EliasFanoEncoder efEncoder = makeEncoder(values);
for (long m = minAdvanceMultiple; m <= maxValue; m += 1) {
tstDecodeAdvanceToMultiples(values, efEncoder.getDecoder(), m);
tstDecodeBackToMultiples(values, efEncoder.getDecoder(), m);
}
}
public void testEmpty() {
long[] values = new long[0];
long[] expHighBits = new long[0];
long[] expLowBits = new long[0];
tstEFS(values, expHighBits, expLowBits);
}
public void testOneValue1() {
long[] values = new long[] {0};
long[] expHighBits = new long[] {0x1L};
long[] expLowBits = new long[] {};
tstEFS(values, expHighBits, expLowBits);
}
public void testTwoValues1() {
long[] values = new long[] {0,0};
long[] expHighBits = new long[] {0x3L};
long[] expLowBits = new long[] {};
tstEFS(values, expHighBits, expLowBits);
}
public void testOneValue2() {
long[] values = new long[] {63};
long[] expHighBits = new long[] {2};
long[] expLowBits = new long[] {31};
tstEFS(values, expHighBits, expLowBits);
}
public void testOneMaxValue() {
long[] values = new long[] {Long.MAX_VALUE};
long[] expHighBits = new long[] {2};
long[] expLowBits = new long[] {Long.MAX_VALUE/2};
tstEFS(values, expHighBits, expLowBits);
}
public void testTwoMinMaxValues() {
long[] values = new long[] {0, Long.MAX_VALUE};
long[] expHighBits = new long[] {0x11};
long[] expLowBits = new long[] {0xE000000000000000L, 0x03FFFFFFFFFFFFFFL};
tstEFS(values, expHighBits, expLowBits);
}
public void testTwoMaxValues() {
long[] values = new long[] {Long.MAX_VALUE, Long.MAX_VALUE};
long[] expHighBits = new long[] {0x18};
long[] expLowBits = new long[] {-1L, 0x03FFFFFFFFFFFFFFL};
tstEFS(values, expHighBits, expLowBits);
}
public void testExample1() { // Figure 1 from Vigna 2012 paper
long[] values = new long[] {5,8,8,15,32};
long[] expLowBits = new long[] {Long.parseLong("0011000001", 2)}; // reverse block and bit order
long[] expHighBits = new long[] {Long.parseLong("1000001011010", 2)}; // reverse block and bit order
tstEFS(values, expHighBits, expLowBits);
}
public void testHashCodeEquals() {
long[] values = new long[] {5,8,8,15,32};
EliasFanoEncoder efEncoder1 = makeEncoder(values);
EliasFanoEncoder efEncoder2 = makeEncoder(values);
assertEquals(efEncoder1, efEncoder2);
assertEquals(efEncoder1.hashCode(), efEncoder2.hashCode());
EliasFanoEncoder efEncoder3 = makeEncoder(new long[] {1,2,3});
assertFalse(efEncoder1.equals(efEncoder3));
assertFalse(efEncoder3.equals(efEncoder1));
assertFalse(efEncoder1.hashCode() == efEncoder3.hashCode()); // implementation ok for these.
}
public void testMonotoneSequences() {
for (int s = 2; s < 1222; s++) {
long[] values = new long[s];
for (int i = 0; i < s; i++) {
values[i] = (i/2);
}
tstEFS2(values);
}
}
public void testStrictMonotoneSequences() {
for (int s = 2; s < 1222; s++) {
long[] values = new long[s];
for (int i = 0; i < s; i++) {
values[i] = i * ((long) i - 1) / 2; // Add a gap of (s-1) to previous
// s = (s*(s+1) - (s-1)*s)/2
}
tstEFS2(values);
}
}
public void testHighBitLongZero() {
final int s = 65;
long[] values = new long[s];
for (int i = 0; i < s-1; i++) {
values[i] = 0;
}
values[s-1] = 128;
long[] expHighBits = new long[] {-1,0,0,1};
long[] expLowBits = new long[0];
tstEFS(values, expHighBits, expLowBits);
}
public void testAdvanceToAndBackToMultiples() {
for (int s = 2; s < 130; s++) {
long[] values = new long[s];
for (int i = 0; i < s; i++) {
values[i] = i * ((long) i + 1) / 2; // Add a gap of s to previous
// s = (s*(s+1) - (s-1)*s)/2
}
tstEFSadvanceToAndBackToMultiples(values, values[s-1], 10);
}
}
}