LUCENE-7485: Better storage of sparse values in Lucene70NormsFormat.

This commit is contained in:
Adrien Grand 2016-10-11 09:20:34 +02:00
parent 63ef459022
commit 2fbbcac580
7 changed files with 504 additions and 212 deletions

View File

@ -0,0 +1,269 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene70;
import java.io.DataInput;
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RoaringDocIdSet;
/**
* Disk-based implementation of a {@link DocIdSetIterator} which can return
* the index of the current document, i.e. the ordinal of the current document
* among the list of documents that this iterator can return. This is useful
* to implement sparse doc values by only having to encode values for documents
* that actually have a value.
* <p>Implementation-wise, this {@link DocIdSetIterator} is inspired of
* {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536}
* documents independently and picks between 3 encodings depending on the
* density of the range:<ul>
* <li>{@code ALL} if the range contains 65536 documents exactly,
* <li>{@code DENSE} if the range contains 4096 documents or more; in that
* case documents are stored in a bit set,
* <li>{@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are
* stored in a {@link DataInput#readShort() short}.
* </ul>
* <p>Only ranges that contain at least one value are encoded.
* <p>This implementation uses 6 bytes per document in the worst-case, which happens
* in the case that all ranges contain exactly one document.
* @lucene.internal
*/
final class IndexedDISI extends DocIdSetIterator {
static final int MAX_ARRAY_LENGTH = (1 << 12) - 1;
private static void flush(int block, FixedBitSet buffer, int cardinality, IndexOutput out) throws IOException {
assert block >= 0 && block < 65536;
out.writeShort((short) block);
assert cardinality > 0 && cardinality <= 65536;
out.writeShort((short) (cardinality - 1));
if (cardinality > MAX_ARRAY_LENGTH) {
if (cardinality != 65536) { // all docs are set
for (long word : buffer.getBits()) {
out.writeLong(word);
}
}
} else {
BitSetIterator it = new BitSetIterator(buffer, cardinality);
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
out.writeShort((short) doc);
}
}
}
static void writeBitSet(DocIdSetIterator it, IndexOutput out) throws IOException {
int i = 0;
final FixedBitSet buffer = new FixedBitSet(1<<16);
int prevBlock = -1;
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
final int block = doc >>> 16;
if (prevBlock != -1 && block != prevBlock) {
flush(prevBlock, buffer, i, out);
buffer.clear(0, buffer.length());
prevBlock = block;
i = 0;
}
buffer.set(doc & 0xFFFF);
i++;
prevBlock = block;
}
if (i > 0) {
flush(prevBlock, buffer, i, out);
buffer.clear(0, buffer.length());
}
// NO_MORE_DOCS is stored explicitly
buffer.set(DocIdSetIterator.NO_MORE_DOCS & 0xFFFF);
flush(DocIdSetIterator.NO_MORE_DOCS >>> 16, buffer, 1, out);
}
/** The slice that stores the {@link DocIdSetIterator}. */
private final IndexInput slice;
private final long cost;
IndexedDISI(IndexInput in, long offset, long length, long cost) throws IOException {
this.slice = in.slice("docs", offset, length);
this.cost = cost;
}
private int block = -1;
private long blockEnd;
private int nextBlockIndex = -1;
Method method;
private int doc = -1;
private int index = -1;
// DENSE variables
private long word;
private int wordIndex = -1;
// number of one bits encountered so far, including those of `word`
private int numberOfOnes;
// ALL variables
private int gap;
@Override
public int docID() {
return doc;
}
@Override
public int advance(int target) throws IOException {
final int targetBlock = target & 0xFFFF0000;
if (block != targetBlock) {
advanceBlock(targetBlock);
}
if (block == targetBlock) {
if (method.advanceWithinBlock(this, target)) {
return doc;
}
readBlockHeader();
}
return doc = method.readFirstDoc(this);
}
private void advanceBlock(int targetBlock) throws IOException {
do {
slice.seek(blockEnd);
readBlockHeader();
} while (block < targetBlock);
}
private void readBlockHeader() throws IOException {
block = Short.toUnsignedInt(slice.readShort()) << 16;
assert block >= 0;
final int numValues = 1 + Short.toUnsignedInt(slice.readShort());
index = nextBlockIndex;
nextBlockIndex = index + numValues;
if (numValues <= MAX_ARRAY_LENGTH) {
method = Method.SPARSE;
blockEnd = slice.getFilePointer() + (numValues << 1);
} else if (numValues == 65536) {
method = Method.ALL;
blockEnd = slice.getFilePointer();
gap = block - index - 1;
} else {
method = Method.DENSE;
blockEnd = slice.getFilePointer() + (1 << 13);
wordIndex = -1;
numberOfOnes = index + 1;
}
}
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
public int index() {
return index;
}
@Override
public long cost() {
return cost;
}
enum Method {
SPARSE {
@Override
int readFirstDoc(IndexedDISI disi) throws IOException {
disi.index++;
return disi.block | Short.toUnsignedInt(disi.slice.readShort());
}
@Override
boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
final int targetInBlock = target & 0xFFFF;
// TODO: binary search
for (; disi.index < disi.nextBlockIndex;) {
int doc = Short.toUnsignedInt(disi.slice.readShort());
disi.index++;
if (doc >= targetInBlock) {
disi.doc = disi.block | doc;
return true;
}
}
return false;
}
},
DENSE {
@Override
int readFirstDoc(IndexedDISI disi) throws IOException {
do {
disi.word = disi.slice.readLong();
disi.wordIndex++;
} while (disi.word == 0L);
disi.index = disi.numberOfOnes;
disi.numberOfOnes += Long.bitCount(disi.word);
return disi.block | (disi.wordIndex << 6) | Long.numberOfTrailingZeros(disi.word);
}
@Override
boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
final int targetInBlock = target & 0xFFFF;
final int targetWordIndex = targetInBlock >>> 6;
for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
disi.word = disi.slice.readLong();
disi.numberOfOnes += Long.bitCount(disi.word);
}
disi.wordIndex = targetWordIndex;
long leftBits = disi.word >>> target;
if (leftBits != 0L) {
disi.doc = target + Long.numberOfTrailingZeros(leftBits);
disi.index = disi.numberOfOnes - Long.bitCount(leftBits);
return true;
}
while (++disi.wordIndex < 1024) {
disi.word = disi.slice.readLong();
if (disi.word != 0) {
disi.index = disi.numberOfOnes;
disi.numberOfOnes += Long.bitCount(disi.word);
disi.doc = disi.block | (disi.wordIndex << 6) | Long.numberOfTrailingZeros(disi.word);
return true;
}
}
return false;
}
},
ALL {
@Override
int readFirstDoc(IndexedDISI disi) {
return disi.block;
}
@Override
boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
disi.doc = target;
disi.index = target - disi.gap;
return true;
}
};
/** Read the first document of the current block. */
abstract int readFirstDoc(IndexedDISI disi) throws IOException;
/** Advance to the first doc from the block that is equal to or greater than {@code target}.
* Return true if there is such a doc and false otherwise. */
abstract boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException;
}
}

View File

@ -96,12 +96,16 @@ final class Lucene70NormsConsumer extends NormsConsumer {
if (numDocsWithValue == 0) {
meta.writeLong(-2);
meta.writeLong(0L);
} else if (numDocsWithValue == maxDoc) {
meta.writeLong(-1);
meta.writeLong(0L);
} else {
meta.writeLong(data.getFilePointer());
long offset = data.getFilePointer();
meta.writeLong(offset);
values = normsProducer.getNorms(field);
SparseDISI.writeBitSet(values, maxDoc, data);
IndexedDISI.writeBitSet(values, data);
meta.writeLong(data.getFilePointer() - offset);
}
meta.writeInt(numDocsWithValue);

View File

@ -45,9 +45,10 @@ import org.apache.lucene.store.DataOutput;
* <p>Norms metadata (.dvm) --&gt; Header,&lt;Entry&gt;<sup>NumFields</sup>,Footer</p>
* <ul>
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
* <li>Entry --&gt; FieldNumber, DocsWithFieldAddress, NumDocsWithField, BytesPerNorm, NormsAddress</li>
* <li>Entry --&gt; FieldNumber, DocsWithFieldAddress, DocsWithFieldLength, NumDocsWithField, BytesPerNorm, NormsAddress</li>
* <li>FieldNumber --&gt; {@link DataOutput#writeInt Int32}</li>
* <li>DocsWithFieldAddress --&gt; {@link DataOutput#writeLong Int64}</li>
* <li>DocsWithFieldLength --&gt; {@link DataOutput#writeLong Int64}</li>
* <li>NumDocsWithField --&gt; {@link DataOutput#writeInt Int32}</li>
* <li>BytesPerNorm --&gt; {@link DataOutput#writeByte byte}</li>
* <li>NormsAddress --&gt; {@link DataOutput#writeLong Int64}</li>
@ -60,6 +61,7 @@ import org.apache.lucene.store.DataOutput;
* <p>DocsWithFieldAddress is the pointer to the start of the bit set containing documents that have a norm
* in the norms data (.nvd), or -2 if no documents have a norm value, or -1 if all documents have a norm
* value.</p>
* <p>DocsWithFieldLength is the number of bytes used to encode the set of documents that have a norm.</p>
* <li><a name="nvd"></a>
* <p>The Norms data or .nvd file.</p>
* <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>

View File

@ -90,6 +90,7 @@ final class Lucene70NormsProducer extends NormsProducer {
static class NormsEntry {
byte bytesPerNorm;
long docsWithFieldOffset;
long docsWithFieldLength;
int numDocsWithField;
long normsOffset;
}
@ -108,6 +109,7 @@ final class Lucene70NormsProducer extends NormsProducer {
}
NormsEntry entry = new NormsEntry();
entry.docsWithFieldOffset = meta.readLong();
entry.docsWithFieldLength = meta.readLong();
entry.numDocsWithField = meta.readInt();
entry.bytesPerNorm = meta.readByte();
switch (entry.bytesPerNorm) {
@ -166,7 +168,7 @@ final class Lucene70NormsProducer extends NormsProducer {
} else {
// sparse
final LongValues normValues = getNormValues(entry);
final SparseDISI disi = new SparseDISI(maxDoc, data, entry.docsWithFieldOffset, entry.numDocsWithField);
final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numDocsWithField);
return new NumericDocValues() {
@Override

View File

@ -1,114 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene70;
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
final class SparseDISI extends DocIdSetIterator {
static void writeBitSet(DocIdSetIterator it, int maxDoc, IndexOutput out) throws IOException {
int currentIndex = 0;
long currentBits = 0;
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
final int index = doc >>> 6;
if (index > currentIndex) {
out.writeLong(currentBits);
for (int i = currentIndex + 1; i < index; ++i) {
out.writeLong(0L);
}
currentIndex = index;
currentBits = 0L;
}
currentBits |= 1L << doc;
}
out.writeLong(currentBits);
final int maxIndex = (maxDoc - 1) >>> 6;
for (int i = currentIndex + 1; i <= maxIndex; ++i) {
out.writeLong(0L);
}
}
final int maxDoc;
final int numWords;
final long cost;
final IndexInput slice;
int doc = -1;
int wordIndex = -1;
long word;
int index = -1;
SparseDISI(int maxDoc, IndexInput in, long offset, long cost) throws IOException {
this.maxDoc = maxDoc;
this.numWords = (int) ((maxDoc + 63L) >>> 6);
this.slice = in.slice("docs", offset, numWords * 8L);
this.cost = cost;
}
@Override
public int advance(int target) throws IOException {
if (target >= maxDoc) {
return doc = NO_MORE_DOCS;
}
final int targetWordIndex = target >>> 6;
for (int i = wordIndex + 1; i <= targetWordIndex; ++i) {
word = slice.readLong();
index += Long.bitCount(word);
}
wordIndex = targetWordIndex;
long leftBits = word >>> target;
if (leftBits != 0L) {
return doc = target + Long.numberOfTrailingZeros(leftBits);
}
while (++wordIndex < numWords) {
word = slice.readLong();
if (word != 0) {
index += Long.bitCount(word);
return doc = (wordIndex << 6) + Long.numberOfTrailingZeros(word);
}
}
return doc = NO_MORE_DOCS;
}
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
return cost;
}
public int index() {
return index - Long.bitCount(word >>> doc) + 1;
}
}

View File

@ -0,0 +1,223 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene70;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestIndexedDISI extends LuceneTestCase {
public void testEmpty() throws IOException {
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
FixedBitSet set = new FixedBitSet(maxDoc);
try (Directory dir = newDirectory()) {
doTest(set, dir);
}
}
public void testOneDoc() throws IOException {
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
FixedBitSet set = new FixedBitSet(maxDoc);
set.set(random().nextInt(maxDoc));
try (Directory dir = newDirectory()) {
doTest(set, dir);
}
}
public void testTwoDocs() throws IOException {
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
FixedBitSet set = new FixedBitSet(maxDoc);
set.set(random().nextInt(maxDoc));
set.set(random().nextInt(maxDoc));
try (Directory dir = newDirectory()) {
doTest(set, dir);
}
}
public void testAllDocs() throws IOException {
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
FixedBitSet set = new FixedBitSet(maxDoc);
set.set(1, maxDoc);
try (Directory dir = newDirectory()) {
doTest(set, dir);
}
}
public void testHalfFull() throws IOException {
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
FixedBitSet set = new FixedBitSet(maxDoc);
for (int i = random().nextInt(2); i < maxDoc; i += TestUtil.nextInt(random(), 1, 3)) {
set.set(i);
}
try (Directory dir = newDirectory()) {
doTest(set, dir);
}
}
public void testDocRange() throws IOException {
try (Directory dir = newDirectory()) {
for (int iter = 0; iter < 10; ++iter) {
int maxDoc = TestUtil.nextInt(random(), 1, 1000000);
FixedBitSet set = new FixedBitSet(maxDoc);
final int start = random().nextInt(maxDoc);
final int end = TestUtil.nextInt(random(), start + 1, maxDoc);
set.set(start, end);
doTest(set, dir);
}
}
}
public void testSparseDenseBoundary() throws IOException {
try (Directory dir = newDirectory()) {
FixedBitSet set = new FixedBitSet(200000);
int start = 65536 + random().nextInt(100);
// we set MAX_ARRAY_LENGTH bits so the encoding will be sparse
set.set(start, start + IndexedDISI.MAX_ARRAY_LENGTH);
long length;
try (IndexOutput out = dir.createOutput("sparse", IOContext.DEFAULT)) {
IndexedDISI.writeBitSet(new BitSetIterator(set, IndexedDISI.MAX_ARRAY_LENGTH), out);
length = out.getFilePointer();
}
try (IndexInput in = dir.openInput("sparse", IOContext.DEFAULT)) {
IndexedDISI disi = new IndexedDISI(in, 0L, length, IndexedDISI.MAX_ARRAY_LENGTH);
assertEquals(start, disi.nextDoc());
assertEquals(IndexedDISI.Method.SPARSE, disi.method);
}
doTest(set, dir);
// now we set one more bit so the encoding will be dense
set.set(start + IndexedDISI.MAX_ARRAY_LENGTH + random().nextInt(100));
try (IndexOutput out = dir.createOutput("bar", IOContext.DEFAULT)) {
IndexedDISI.writeBitSet(new BitSetIterator(set, IndexedDISI.MAX_ARRAY_LENGTH + 1), out);
length = out.getFilePointer();
}
try (IndexInput in = dir.openInput("bar", IOContext.DEFAULT)) {
IndexedDISI disi = new IndexedDISI(in, 0L, length, IndexedDISI.MAX_ARRAY_LENGTH + 1);
assertEquals(start, disi.nextDoc());
assertEquals(IndexedDISI.Method.DENSE, disi.method);
}
doTest(set, dir);
}
}
public void testOneDocMissing() throws IOException {
int maxDoc = TestUtil.nextInt(random(), 1, 1000000);
FixedBitSet set = new FixedBitSet(maxDoc);
set.set(0, maxDoc);
set.clear(random().nextInt(maxDoc));
try (Directory dir = newDirectory()) {
doTest(set, dir);
}
}
public void testFewMissingDocs() throws IOException {
try (Directory dir = newDirectory()) {
for (int iter = 0; iter < 100; ++iter) {
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
FixedBitSet set = new FixedBitSet(maxDoc);
set.set(0, maxDoc);
final int numMissingDocs = TestUtil.nextInt(random(), 2, 1000);
for (int i = 0; i < numMissingDocs; ++i) {
set.clear(random().nextInt(maxDoc));
}
doTest(set, dir);
}
}
}
public void testRandom() throws IOException {
try (Directory dir = newDirectory()) {
for (int i = 0; i < 100; ++i) {
doTestRandom(dir);
}
}
}
private void doTestRandom(Directory dir) throws IOException {
List<Integer> docs = new ArrayList<>();
final int maxStep = TestUtil.nextInt(random(), 1, 1 << TestUtil.nextInt(random(), 2, 20));
final int numDocs = TestUtil.nextInt(random(), 1, Math.min(100000, Integer.MAX_VALUE / maxStep));
for (int doc = -1, i = 0; i < numDocs; ++i) {
doc += TestUtil.nextInt(random(), 1, maxStep);
docs.add(doc);
}
final int maxDoc = docs.get(docs.size() - 1) + TestUtil.nextInt(random(), 1, 100);
FixedBitSet set = new FixedBitSet(maxDoc);
for (int doc : docs) {
set.set(doc);
}
doTest(set, dir);
}
private void doTest(FixedBitSet set, Directory dir) throws IOException {
final int cardinality = set.cardinality();
long length;
try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
IndexedDISI.writeBitSet(new BitSetIterator(set, cardinality), out);
length = out.getFilePointer();
}
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
IndexedDISI disi = new IndexedDISI(in, 0L, length, cardinality);
BitSetIterator disi2 = new BitSetIterator(set, cardinality);
int i = 0;
for (int doc = disi2.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi2.nextDoc()) {
assertEquals(doc, disi.nextDoc());
assertEquals(i++, disi.index());
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
}
for (int step : new int[] {1, 10, 100, 1000, 10000, 100000}) {
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
IndexedDISI disi = new IndexedDISI(in, 0L, length, cardinality);
BitSetIterator disi2 = new BitSetIterator(set, cardinality);
int index = -1;
while (true) {
int target = disi2.docID() + step;
int doc;
do {
doc = disi2.nextDoc();
index++;
} while (doc < target);
assertEquals(doc, disi.advance(target));
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
assertEquals(index, disi.index());
}
}
}
dir.deleteFile("foo");
}
}

View File

@ -1,94 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene70;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestSparseDISI extends LuceneTestCase {
public void testRandom() throws IOException {
try (Directory dir = newDirectory()) {
for (int i = 0; i < 1000; ++i) {
doTestRandom(dir);
}
}
}
private void doTestRandom(Directory dir) throws IOException {
List<Integer> docs = new ArrayList<>();
final int maxStep = TestUtil.nextInt(random(), 1, 1 << TestUtil.nextInt(random(), 2, 10));
final int numDocs = TestUtil.nextInt(random(), 1, 1000);
for (int doc = -1, i = 0; i < numDocs; ++i) {
doc += TestUtil.nextInt(random(), 1, maxStep);
docs.add(doc);
}
final int maxDoc = docs.get(docs.size() - 1) + TestUtil.nextInt(random(), 1, 100);
FixedBitSet set = new FixedBitSet(maxDoc);
for (int doc : docs) {
set.set(doc);
}
try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
SparseDISI.writeBitSet(new BitSetIterator(set, docs.size()), maxDoc, out);
}
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
SparseDISI disi = new SparseDISI(maxDoc, in, 0L, docs.size());
BitSetIterator disi2 = new BitSetIterator(set, docs.size());
int i = 0;
for (int doc = disi2.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi2.nextDoc()) {
assertEquals(doc, disi.nextDoc());
assertEquals(i++, disi.index());
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
}
for (int step : new int[] {1, 20, maxStep, maxStep * 10}) {
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
SparseDISI disi = new SparseDISI(maxDoc, in, 0L, docs.size());
BitSetIterator disi2 = new BitSetIterator(set, docs.size());
while (true) {
int target = disi2.docID() + step;
int doc = disi2.advance(target);
assertEquals(doc, disi.advance(target));
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
int index = Collections.binarySearch(docs, doc);
assertEquals(index, disi.index());
}
}
}
dir.deleteFile("foo");
}
}