mirror of https://github.com/apache/lucene.git
LUCENE-7485: Better storage of sparse values in Lucene70NormsFormat.
This commit is contained in:
parent
63ef459022
commit
2fbbcac580
|
@ -0,0 +1,269 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene70;
|
||||
|
||||
import java.io.DataInput;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.BitSetIterator;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.RoaringDocIdSet;
|
||||
|
||||
/**
|
||||
* Disk-based implementation of a {@link DocIdSetIterator} which can return
|
||||
* the index of the current document, i.e. the ordinal of the current document
|
||||
* among the list of documents that this iterator can return. This is useful
|
||||
* to implement sparse doc values by only having to encode values for documents
|
||||
* that actually have a value.
|
||||
* <p>Implementation-wise, this {@link DocIdSetIterator} is inspired of
|
||||
* {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536}
|
||||
* documents independently and picks between 3 encodings depending on the
|
||||
* density of the range:<ul>
|
||||
* <li>{@code ALL} if the range contains 65536 documents exactly,
|
||||
* <li>{@code DENSE} if the range contains 4096 documents or more; in that
|
||||
* case documents are stored in a bit set,
|
||||
* <li>{@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are
|
||||
* stored in a {@link DataInput#readShort() short}.
|
||||
* </ul>
|
||||
* <p>Only ranges that contain at least one value are encoded.
|
||||
* <p>This implementation uses 6 bytes per document in the worst-case, which happens
|
||||
* in the case that all ranges contain exactly one document.
|
||||
* @lucene.internal
|
||||
*/
|
||||
final class IndexedDISI extends DocIdSetIterator {
|
||||
|
||||
static final int MAX_ARRAY_LENGTH = (1 << 12) - 1;
|
||||
|
||||
private static void flush(int block, FixedBitSet buffer, int cardinality, IndexOutput out) throws IOException {
|
||||
assert block >= 0 && block < 65536;
|
||||
out.writeShort((short) block);
|
||||
assert cardinality > 0 && cardinality <= 65536;
|
||||
out.writeShort((short) (cardinality - 1));
|
||||
if (cardinality > MAX_ARRAY_LENGTH) {
|
||||
if (cardinality != 65536) { // all docs are set
|
||||
for (long word : buffer.getBits()) {
|
||||
out.writeLong(word);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
BitSetIterator it = new BitSetIterator(buffer, cardinality);
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
out.writeShort((short) doc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void writeBitSet(DocIdSetIterator it, IndexOutput out) throws IOException {
|
||||
int i = 0;
|
||||
final FixedBitSet buffer = new FixedBitSet(1<<16);
|
||||
int prevBlock = -1;
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
final int block = doc >>> 16;
|
||||
if (prevBlock != -1 && block != prevBlock) {
|
||||
flush(prevBlock, buffer, i, out);
|
||||
buffer.clear(0, buffer.length());
|
||||
prevBlock = block;
|
||||
i = 0;
|
||||
}
|
||||
buffer.set(doc & 0xFFFF);
|
||||
i++;
|
||||
prevBlock = block;
|
||||
}
|
||||
if (i > 0) {
|
||||
flush(prevBlock, buffer, i, out);
|
||||
buffer.clear(0, buffer.length());
|
||||
}
|
||||
// NO_MORE_DOCS is stored explicitly
|
||||
buffer.set(DocIdSetIterator.NO_MORE_DOCS & 0xFFFF);
|
||||
flush(DocIdSetIterator.NO_MORE_DOCS >>> 16, buffer, 1, out);
|
||||
}
|
||||
|
||||
/** The slice that stores the {@link DocIdSetIterator}. */
|
||||
private final IndexInput slice;
|
||||
private final long cost;
|
||||
|
||||
IndexedDISI(IndexInput in, long offset, long length, long cost) throws IOException {
|
||||
this.slice = in.slice("docs", offset, length);
|
||||
this.cost = cost;
|
||||
}
|
||||
|
||||
private int block = -1;
|
||||
private long blockEnd;
|
||||
private int nextBlockIndex = -1;
|
||||
Method method;
|
||||
|
||||
private int doc = -1;
|
||||
private int index = -1;
|
||||
|
||||
// DENSE variables
|
||||
private long word;
|
||||
private int wordIndex = -1;
|
||||
// number of one bits encountered so far, including those of `word`
|
||||
private int numberOfOnes;
|
||||
|
||||
// ALL variables
|
||||
private int gap;
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
final int targetBlock = target & 0xFFFF0000;
|
||||
if (block != targetBlock) {
|
||||
advanceBlock(targetBlock);
|
||||
}
|
||||
if (block == targetBlock) {
|
||||
if (method.advanceWithinBlock(this, target)) {
|
||||
return doc;
|
||||
}
|
||||
readBlockHeader();
|
||||
}
|
||||
return doc = method.readFirstDoc(this);
|
||||
}
|
||||
|
||||
private void advanceBlock(int targetBlock) throws IOException {
|
||||
do {
|
||||
slice.seek(blockEnd);
|
||||
readBlockHeader();
|
||||
} while (block < targetBlock);
|
||||
}
|
||||
|
||||
private void readBlockHeader() throws IOException {
|
||||
block = Short.toUnsignedInt(slice.readShort()) << 16;
|
||||
assert block >= 0;
|
||||
final int numValues = 1 + Short.toUnsignedInt(slice.readShort());
|
||||
index = nextBlockIndex;
|
||||
nextBlockIndex = index + numValues;
|
||||
if (numValues <= MAX_ARRAY_LENGTH) {
|
||||
method = Method.SPARSE;
|
||||
blockEnd = slice.getFilePointer() + (numValues << 1);
|
||||
} else if (numValues == 65536) {
|
||||
method = Method.ALL;
|
||||
blockEnd = slice.getFilePointer();
|
||||
gap = block - index - 1;
|
||||
} else {
|
||||
method = Method.DENSE;
|
||||
blockEnd = slice.getFilePointer() + (1 << 13);
|
||||
wordIndex = -1;
|
||||
numberOfOnes = index + 1;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
return advance(doc + 1);
|
||||
}
|
||||
|
||||
public int index() {
|
||||
return index;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return cost;
|
||||
}
|
||||
|
||||
enum Method {
|
||||
SPARSE {
|
||||
@Override
|
||||
int readFirstDoc(IndexedDISI disi) throws IOException {
|
||||
disi.index++;
|
||||
return disi.block | Short.toUnsignedInt(disi.slice.readShort());
|
||||
}
|
||||
@Override
|
||||
boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
|
||||
final int targetInBlock = target & 0xFFFF;
|
||||
// TODO: binary search
|
||||
for (; disi.index < disi.nextBlockIndex;) {
|
||||
int doc = Short.toUnsignedInt(disi.slice.readShort());
|
||||
disi.index++;
|
||||
if (doc >= targetInBlock) {
|
||||
disi.doc = disi.block | doc;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
},
|
||||
DENSE {
|
||||
@Override
|
||||
int readFirstDoc(IndexedDISI disi) throws IOException {
|
||||
do {
|
||||
disi.word = disi.slice.readLong();
|
||||
disi.wordIndex++;
|
||||
} while (disi.word == 0L);
|
||||
disi.index = disi.numberOfOnes;
|
||||
disi.numberOfOnes += Long.bitCount(disi.word);
|
||||
return disi.block | (disi.wordIndex << 6) | Long.numberOfTrailingZeros(disi.word);
|
||||
}
|
||||
@Override
|
||||
boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
|
||||
final int targetInBlock = target & 0xFFFF;
|
||||
final int targetWordIndex = targetInBlock >>> 6;
|
||||
for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
|
||||
disi.word = disi.slice.readLong();
|
||||
disi.numberOfOnes += Long.bitCount(disi.word);
|
||||
}
|
||||
disi.wordIndex = targetWordIndex;
|
||||
|
||||
long leftBits = disi.word >>> target;
|
||||
if (leftBits != 0L) {
|
||||
disi.doc = target + Long.numberOfTrailingZeros(leftBits);
|
||||
disi.index = disi.numberOfOnes - Long.bitCount(leftBits);
|
||||
return true;
|
||||
}
|
||||
|
||||
while (++disi.wordIndex < 1024) {
|
||||
disi.word = disi.slice.readLong();
|
||||
if (disi.word != 0) {
|
||||
disi.index = disi.numberOfOnes;
|
||||
disi.numberOfOnes += Long.bitCount(disi.word);
|
||||
disi.doc = disi.block | (disi.wordIndex << 6) | Long.numberOfTrailingZeros(disi.word);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
},
|
||||
ALL {
|
||||
@Override
|
||||
int readFirstDoc(IndexedDISI disi) {
|
||||
return disi.block;
|
||||
}
|
||||
@Override
|
||||
boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
|
||||
disi.doc = target;
|
||||
disi.index = target - disi.gap;
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
/** Read the first document of the current block. */
|
||||
abstract int readFirstDoc(IndexedDISI disi) throws IOException;
|
||||
|
||||
/** Advance to the first doc from the block that is equal to or greater than {@code target}.
|
||||
* Return true if there is such a doc and false otherwise. */
|
||||
abstract boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException;
|
||||
}
|
||||
|
||||
}
|
|
@ -96,12 +96,16 @@ final class Lucene70NormsConsumer extends NormsConsumer {
|
|||
|
||||
if (numDocsWithValue == 0) {
|
||||
meta.writeLong(-2);
|
||||
meta.writeLong(0L);
|
||||
} else if (numDocsWithValue == maxDoc) {
|
||||
meta.writeLong(-1);
|
||||
meta.writeLong(0L);
|
||||
} else {
|
||||
meta.writeLong(data.getFilePointer());
|
||||
long offset = data.getFilePointer();
|
||||
meta.writeLong(offset);
|
||||
values = normsProducer.getNorms(field);
|
||||
SparseDISI.writeBitSet(values, maxDoc, data);
|
||||
IndexedDISI.writeBitSet(values, data);
|
||||
meta.writeLong(data.getFilePointer() - offset);
|
||||
}
|
||||
|
||||
meta.writeInt(numDocsWithValue);
|
||||
|
|
|
@ -45,9 +45,10 @@ import org.apache.lucene.store.DataOutput;
|
|||
* <p>Norms metadata (.dvm) --> Header,<Entry><sup>NumFields</sup>,Footer</p>
|
||||
* <ul>
|
||||
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
|
||||
* <li>Entry --> FieldNumber, DocsWithFieldAddress, NumDocsWithField, BytesPerNorm, NormsAddress</li>
|
||||
* <li>Entry --> FieldNumber, DocsWithFieldAddress, DocsWithFieldLength, NumDocsWithField, BytesPerNorm, NormsAddress</li>
|
||||
* <li>FieldNumber --> {@link DataOutput#writeInt Int32}</li>
|
||||
* <li>DocsWithFieldAddress --> {@link DataOutput#writeLong Int64}</li>
|
||||
* <li>DocsWithFieldLength --> {@link DataOutput#writeLong Int64}</li>
|
||||
* <li>NumDocsWithField --> {@link DataOutput#writeInt Int32}</li>
|
||||
* <li>BytesPerNorm --> {@link DataOutput#writeByte byte}</li>
|
||||
* <li>NormsAddress --> {@link DataOutput#writeLong Int64}</li>
|
||||
|
@ -60,6 +61,7 @@ import org.apache.lucene.store.DataOutput;
|
|||
* <p>DocsWithFieldAddress is the pointer to the start of the bit set containing documents that have a norm
|
||||
* in the norms data (.nvd), or -2 if no documents have a norm value, or -1 if all documents have a norm
|
||||
* value.</p>
|
||||
* <p>DocsWithFieldLength is the number of bytes used to encode the set of documents that have a norm.</p>
|
||||
* <li><a name="nvd"></a>
|
||||
* <p>The Norms data or .nvd file.</p>
|
||||
* <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>
|
||||
|
|
|
@ -90,6 +90,7 @@ final class Lucene70NormsProducer extends NormsProducer {
|
|||
static class NormsEntry {
|
||||
byte bytesPerNorm;
|
||||
long docsWithFieldOffset;
|
||||
long docsWithFieldLength;
|
||||
int numDocsWithField;
|
||||
long normsOffset;
|
||||
}
|
||||
|
@ -108,6 +109,7 @@ final class Lucene70NormsProducer extends NormsProducer {
|
|||
}
|
||||
NormsEntry entry = new NormsEntry();
|
||||
entry.docsWithFieldOffset = meta.readLong();
|
||||
entry.docsWithFieldLength = meta.readLong();
|
||||
entry.numDocsWithField = meta.readInt();
|
||||
entry.bytesPerNorm = meta.readByte();
|
||||
switch (entry.bytesPerNorm) {
|
||||
|
@ -166,7 +168,7 @@ final class Lucene70NormsProducer extends NormsProducer {
|
|||
} else {
|
||||
// sparse
|
||||
final LongValues normValues = getNormValues(entry);
|
||||
final SparseDISI disi = new SparseDISI(maxDoc, data, entry.docsWithFieldOffset, entry.numDocsWithField);
|
||||
final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numDocsWithField);
|
||||
return new NumericDocValues() {
|
||||
|
||||
@Override
|
||||
|
|
|
@ -1,114 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene70;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
|
||||
final class SparseDISI extends DocIdSetIterator {
|
||||
|
||||
static void writeBitSet(DocIdSetIterator it, int maxDoc, IndexOutput out) throws IOException {
|
||||
int currentIndex = 0;
|
||||
long currentBits = 0;
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
final int index = doc >>> 6;
|
||||
if (index > currentIndex) {
|
||||
out.writeLong(currentBits);
|
||||
for (int i = currentIndex + 1; i < index; ++i) {
|
||||
out.writeLong(0L);
|
||||
}
|
||||
currentIndex = index;
|
||||
currentBits = 0L;
|
||||
}
|
||||
currentBits |= 1L << doc;
|
||||
}
|
||||
|
||||
out.writeLong(currentBits);
|
||||
final int maxIndex = (maxDoc - 1) >>> 6;
|
||||
for (int i = currentIndex + 1; i <= maxIndex; ++i) {
|
||||
out.writeLong(0L);
|
||||
}
|
||||
}
|
||||
|
||||
final int maxDoc;
|
||||
final int numWords;
|
||||
final long cost;
|
||||
final IndexInput slice;
|
||||
int doc = -1;
|
||||
int wordIndex = -1;
|
||||
long word;
|
||||
int index = -1;
|
||||
|
||||
SparseDISI(int maxDoc, IndexInput in, long offset, long cost) throws IOException {
|
||||
this.maxDoc = maxDoc;
|
||||
this.numWords = (int) ((maxDoc + 63L) >>> 6);
|
||||
this.slice = in.slice("docs", offset, numWords * 8L);
|
||||
this.cost = cost;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
if (target >= maxDoc) {
|
||||
return doc = NO_MORE_DOCS;
|
||||
}
|
||||
|
||||
final int targetWordIndex = target >>> 6;
|
||||
for (int i = wordIndex + 1; i <= targetWordIndex; ++i) {
|
||||
word = slice.readLong();
|
||||
index += Long.bitCount(word);
|
||||
}
|
||||
wordIndex = targetWordIndex;
|
||||
|
||||
long leftBits = word >>> target;
|
||||
if (leftBits != 0L) {
|
||||
return doc = target + Long.numberOfTrailingZeros(leftBits);
|
||||
}
|
||||
|
||||
while (++wordIndex < numWords) {
|
||||
word = slice.readLong();
|
||||
if (word != 0) {
|
||||
index += Long.bitCount(word);
|
||||
return doc = (wordIndex << 6) + Long.numberOfTrailingZeros(word);
|
||||
}
|
||||
}
|
||||
|
||||
return doc = NO_MORE_DOCS;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
return advance(doc + 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return cost;
|
||||
}
|
||||
|
||||
public int index() {
|
||||
return index - Long.bitCount(word >>> doc) + 1;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,223 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene70;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.BitSetIterator;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
public class TestIndexedDISI extends LuceneTestCase {
|
||||
|
||||
public void testEmpty() throws IOException {
|
||||
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
|
||||
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||
try (Directory dir = newDirectory()) {
|
||||
doTest(set, dir);
|
||||
}
|
||||
}
|
||||
|
||||
public void testOneDoc() throws IOException {
|
||||
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
|
||||
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||
set.set(random().nextInt(maxDoc));
|
||||
try (Directory dir = newDirectory()) {
|
||||
doTest(set, dir);
|
||||
}
|
||||
}
|
||||
|
||||
public void testTwoDocs() throws IOException {
|
||||
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
|
||||
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||
set.set(random().nextInt(maxDoc));
|
||||
set.set(random().nextInt(maxDoc));
|
||||
try (Directory dir = newDirectory()) {
|
||||
doTest(set, dir);
|
||||
}
|
||||
}
|
||||
|
||||
public void testAllDocs() throws IOException {
|
||||
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
|
||||
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||
set.set(1, maxDoc);
|
||||
try (Directory dir = newDirectory()) {
|
||||
doTest(set, dir);
|
||||
}
|
||||
}
|
||||
|
||||
public void testHalfFull() throws IOException {
|
||||
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
|
||||
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||
for (int i = random().nextInt(2); i < maxDoc; i += TestUtil.nextInt(random(), 1, 3)) {
|
||||
set.set(i);
|
||||
}
|
||||
try (Directory dir = newDirectory()) {
|
||||
doTest(set, dir);
|
||||
}
|
||||
}
|
||||
|
||||
public void testDocRange() throws IOException {
|
||||
try (Directory dir = newDirectory()) {
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
int maxDoc = TestUtil.nextInt(random(), 1, 1000000);
|
||||
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||
final int start = random().nextInt(maxDoc);
|
||||
final int end = TestUtil.nextInt(random(), start + 1, maxDoc);
|
||||
set.set(start, end);
|
||||
doTest(set, dir);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testSparseDenseBoundary() throws IOException {
|
||||
try (Directory dir = newDirectory()) {
|
||||
FixedBitSet set = new FixedBitSet(200000);
|
||||
int start = 65536 + random().nextInt(100);
|
||||
|
||||
// we set MAX_ARRAY_LENGTH bits so the encoding will be sparse
|
||||
set.set(start, start + IndexedDISI.MAX_ARRAY_LENGTH);
|
||||
long length;
|
||||
try (IndexOutput out = dir.createOutput("sparse", IOContext.DEFAULT)) {
|
||||
IndexedDISI.writeBitSet(new BitSetIterator(set, IndexedDISI.MAX_ARRAY_LENGTH), out);
|
||||
length = out.getFilePointer();
|
||||
}
|
||||
try (IndexInput in = dir.openInput("sparse", IOContext.DEFAULT)) {
|
||||
IndexedDISI disi = new IndexedDISI(in, 0L, length, IndexedDISI.MAX_ARRAY_LENGTH);
|
||||
assertEquals(start, disi.nextDoc());
|
||||
assertEquals(IndexedDISI.Method.SPARSE, disi.method);
|
||||
}
|
||||
doTest(set, dir);
|
||||
|
||||
// now we set one more bit so the encoding will be dense
|
||||
set.set(start + IndexedDISI.MAX_ARRAY_LENGTH + random().nextInt(100));
|
||||
try (IndexOutput out = dir.createOutput("bar", IOContext.DEFAULT)) {
|
||||
IndexedDISI.writeBitSet(new BitSetIterator(set, IndexedDISI.MAX_ARRAY_LENGTH + 1), out);
|
||||
length = out.getFilePointer();
|
||||
}
|
||||
try (IndexInput in = dir.openInput("bar", IOContext.DEFAULT)) {
|
||||
IndexedDISI disi = new IndexedDISI(in, 0L, length, IndexedDISI.MAX_ARRAY_LENGTH + 1);
|
||||
assertEquals(start, disi.nextDoc());
|
||||
assertEquals(IndexedDISI.Method.DENSE, disi.method);
|
||||
}
|
||||
doTest(set, dir);
|
||||
}
|
||||
}
|
||||
|
||||
public void testOneDocMissing() throws IOException {
|
||||
int maxDoc = TestUtil.nextInt(random(), 1, 1000000);
|
||||
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||
set.set(0, maxDoc);
|
||||
set.clear(random().nextInt(maxDoc));
|
||||
try (Directory dir = newDirectory()) {
|
||||
doTest(set, dir);
|
||||
}
|
||||
}
|
||||
|
||||
public void testFewMissingDocs() throws IOException {
|
||||
try (Directory dir = newDirectory()) {
|
||||
for (int iter = 0; iter < 100; ++iter) {
|
||||
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
|
||||
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||
set.set(0, maxDoc);
|
||||
final int numMissingDocs = TestUtil.nextInt(random(), 2, 1000);
|
||||
for (int i = 0; i < numMissingDocs; ++i) {
|
||||
set.clear(random().nextInt(maxDoc));
|
||||
}
|
||||
doTest(set, dir);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testRandom() throws IOException {
|
||||
try (Directory dir = newDirectory()) {
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
doTestRandom(dir);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void doTestRandom(Directory dir) throws IOException {
|
||||
List<Integer> docs = new ArrayList<>();
|
||||
final int maxStep = TestUtil.nextInt(random(), 1, 1 << TestUtil.nextInt(random(), 2, 20));
|
||||
final int numDocs = TestUtil.nextInt(random(), 1, Math.min(100000, Integer.MAX_VALUE / maxStep));
|
||||
for (int doc = -1, i = 0; i < numDocs; ++i) {
|
||||
doc += TestUtil.nextInt(random(), 1, maxStep);
|
||||
docs.add(doc);
|
||||
}
|
||||
final int maxDoc = docs.get(docs.size() - 1) + TestUtil.nextInt(random(), 1, 100);
|
||||
|
||||
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||
for (int doc : docs) {
|
||||
set.set(doc);
|
||||
}
|
||||
|
||||
doTest(set, dir);
|
||||
}
|
||||
|
||||
private void doTest(FixedBitSet set, Directory dir) throws IOException {
|
||||
final int cardinality = set.cardinality();
|
||||
long length;
|
||||
try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
|
||||
IndexedDISI.writeBitSet(new BitSetIterator(set, cardinality), out);
|
||||
length = out.getFilePointer();
|
||||
}
|
||||
|
||||
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
|
||||
IndexedDISI disi = new IndexedDISI(in, 0L, length, cardinality);
|
||||
BitSetIterator disi2 = new BitSetIterator(set, cardinality);
|
||||
int i = 0;
|
||||
for (int doc = disi2.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi2.nextDoc()) {
|
||||
assertEquals(doc, disi.nextDoc());
|
||||
assertEquals(i++, disi.index());
|
||||
}
|
||||
assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
|
||||
}
|
||||
|
||||
for (int step : new int[] {1, 10, 100, 1000, 10000, 100000}) {
|
||||
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
|
||||
IndexedDISI disi = new IndexedDISI(in, 0L, length, cardinality);
|
||||
BitSetIterator disi2 = new BitSetIterator(set, cardinality);
|
||||
int index = -1;
|
||||
while (true) {
|
||||
int target = disi2.docID() + step;
|
||||
int doc;
|
||||
do {
|
||||
doc = disi2.nextDoc();
|
||||
index++;
|
||||
} while (doc < target);
|
||||
assertEquals(doc, disi.advance(target));
|
||||
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
|
||||
break;
|
||||
}
|
||||
assertEquals(index, disi.index());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dir.deleteFile("foo");
|
||||
}
|
||||
|
||||
}
|
|
@ -1,94 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene70;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.BitSetIterator;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
public class TestSparseDISI extends LuceneTestCase {
|
||||
|
||||
public void testRandom() throws IOException {
|
||||
try (Directory dir = newDirectory()) {
|
||||
for (int i = 0; i < 1000; ++i) {
|
||||
doTestRandom(dir);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void doTestRandom(Directory dir) throws IOException {
|
||||
List<Integer> docs = new ArrayList<>();
|
||||
final int maxStep = TestUtil.nextInt(random(), 1, 1 << TestUtil.nextInt(random(), 2, 10));
|
||||
final int numDocs = TestUtil.nextInt(random(), 1, 1000);
|
||||
for (int doc = -1, i = 0; i < numDocs; ++i) {
|
||||
doc += TestUtil.nextInt(random(), 1, maxStep);
|
||||
docs.add(doc);
|
||||
}
|
||||
final int maxDoc = docs.get(docs.size() - 1) + TestUtil.nextInt(random(), 1, 100);
|
||||
|
||||
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||
for (int doc : docs) {
|
||||
set.set(doc);
|
||||
}
|
||||
|
||||
try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
|
||||
SparseDISI.writeBitSet(new BitSetIterator(set, docs.size()), maxDoc, out);
|
||||
}
|
||||
|
||||
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
|
||||
SparseDISI disi = new SparseDISI(maxDoc, in, 0L, docs.size());
|
||||
BitSetIterator disi2 = new BitSetIterator(set, docs.size());
|
||||
int i = 0;
|
||||
for (int doc = disi2.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi2.nextDoc()) {
|
||||
assertEquals(doc, disi.nextDoc());
|
||||
assertEquals(i++, disi.index());
|
||||
}
|
||||
assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
|
||||
}
|
||||
|
||||
for (int step : new int[] {1, 20, maxStep, maxStep * 10}) {
|
||||
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
|
||||
SparseDISI disi = new SparseDISI(maxDoc, in, 0L, docs.size());
|
||||
BitSetIterator disi2 = new BitSetIterator(set, docs.size());
|
||||
while (true) {
|
||||
int target = disi2.docID() + step;
|
||||
int doc = disi2.advance(target);
|
||||
assertEquals(doc, disi.advance(target));
|
||||
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
|
||||
break;
|
||||
}
|
||||
int index = Collections.binarySearch(docs, doc);
|
||||
assertEquals(index, disi.index());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dir.deleteFile("foo");
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue