LUCENE-5983: CachingWrapperFilter now uses RoaringDocIdSet.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1629605 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2014-10-06 10:28:20 +00:00
parent 1a834a153a
commit bfc1b3deee
8 changed files with 576 additions and 12 deletions

View File

@ -206,6 +206,9 @@ Optimizations
per-segment/per-producer, and norms and doc values merging no longer cause
RAM spikes for latent fields. (Mike McCandless, Robert Muir)
* LUCENE-5983: CachingWrapperFilter now uses a new DocIdSet implementation
called RoaringDocIdSet instead of WAH8DocIdSet. (Adrien Grand)
Build
* LUCENE-5909: Smoke tester now has better command line parsing and

View File

@ -31,6 +31,7 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.RoaringDocIdSet;
import org.apache.lucene.util.WAH8DocIdSet;
/**
@ -86,12 +87,10 @@ public class CachingWrapperFilter extends Filter implements Accountable {
}
/**
* Default cache implementation: uses {@link WAH8DocIdSet}.
* Default cache implementation: uses {@link RoaringDocIdSet}.
*/
protected DocIdSet cacheImpl(DocIdSetIterator iterator, LeafReader reader) throws IOException {
WAH8DocIdSet.Builder builder = new WAH8DocIdSet.Builder();
builder.add(iterator);
return builder.build();
return new RoaringDocIdSet.Builder(reader.maxDoc()).add(iterator).build();
}
// for testing

View File

@ -0,0 +1,124 @@
package org.apache.lucene.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
/**
* This {@link DocIdSet} encodes the negation of another {@link DocIdSet}.
* It is cacheable and supports random-access if the underlying set is
* cacheable and supports random-access.
* @lucene.internal
*/
public final class NotDocIdSet extends DocIdSet {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(NotDocIdSet.class);
private final int maxDoc;
private final DocIdSet in;
/** Sole constructor. */
public NotDocIdSet(int maxDoc, DocIdSet in) {
this.maxDoc = maxDoc;
this.in = in;
}
@Override
public boolean isCacheable() {
return in.isCacheable();
}
@Override
public Bits bits() throws IOException {
final Bits inBits = in.bits();
if (inBits == null) {
return null;
}
return new Bits() {
@Override
public boolean get(int index) {
return !inBits.get(index);
}
@Override
public int length() {
return inBits.length();
}
};
}
@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED + in.ramBytesUsed();
}
@Override
public DocIdSetIterator iterator() throws IOException {
final DocIdSetIterator inIterator = in.iterator();
return new DocIdSetIterator() {
int doc = -1;
int nextSkippedDoc = -1;
@Override
public int nextDoc() throws IOException {
if (doc == NO_MORE_DOCS) {
return NO_MORE_DOCS;
}
return advance(doc + 1);
}
@Override
public int advance(int target) throws IOException {
doc = target;
if (doc > nextSkippedDoc) {
nextSkippedDoc = inIterator.advance(doc);
}
while (true) {
if (doc >= maxDoc) {
return doc = NO_MORE_DOCS;
}
assert doc <= nextSkippedDoc;
if (doc != nextSkippedDoc) {
return doc;
}
doc += 1;
nextSkippedDoc = inIterator.nextDoc();
}
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
// even if there are few docs in this set, iterating over all documents
// costs O(maxDoc) in all cases
return maxDoc;
}
};
}
}

View File

@ -0,0 +1,343 @@
package org.apache.lucene.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
/**
* {@link DocIdSet} implementation inspired from http://roaringbitmap.org/
*
* The space is divided into blocks of 2^16 bits and each block is encoded
* independently. In each block, if less than 2^12 bits are set, then
* documents are simply stored in a short[]. If more than 2^16-2^12 bits are
* set, then the inverse of the set is encoded in a simple short[]. Otherwise
* a {@link FixedBitSet} is used.
*
* @lucene.internal
*/
public class RoaringDocIdSet extends DocIdSet {
// Number of documents in a block
private static final int BLOCK_SIZE = 1 << 16;
// The maximum length for an array, beyond that point we switch to a bitset
private static final int MAX_ARRAY_LENGTH = 1 << 12;
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(RoaringDocIdSet.class);
/** A builder of {@link RoaringDocIdSet}s. */
public static class Builder {
private final int maxDoc;
private final DocIdSet[] sets;
private int cardinality;
private int lastDocId;
private int currentBlock;
private int currentBlockCardinality;
// We start by filling the buffer and when it's full we copy the content of
// the buffer to the FixedBitSet and put further documents in that bitset
private final short[] buffer;
private FixedBitSet denseBuffer;
/** Sole constructor. */
public Builder(int maxDoc) {
this.maxDoc = maxDoc;
sets = new DocIdSet[(maxDoc + (1 << 16) - 1) >>> 16];
lastDocId = -1;
currentBlock = -1;
buffer = new short[MAX_ARRAY_LENGTH];
}
private void flush() {
assert currentBlockCardinality <= BLOCK_SIZE;
if (currentBlockCardinality <= MAX_ARRAY_LENGTH) {
// Use sparse encoding
assert denseBuffer == null;
if (currentBlockCardinality > 0) {
sets[currentBlock] = new ShortArrayDocIdSet(Arrays.copyOf(buffer, currentBlockCardinality));
}
} else {
assert denseBuffer != null;
assert denseBuffer.cardinality() == currentBlockCardinality;
if (denseBuffer.length() == BLOCK_SIZE && BLOCK_SIZE - currentBlockCardinality < MAX_ARRAY_LENGTH) {
// Doc ids are very dense, inverse the encoding
final short[] excludedDocs = new short[BLOCK_SIZE - currentBlockCardinality];
denseBuffer.flip(0, denseBuffer.length());
int excludedDoc = -1;
for (int i = 0; i < excludedDocs.length; ++i) {
excludedDoc = denseBuffer.nextSetBit(excludedDoc + 1);
assert excludedDoc != -1;
excludedDocs[i] = (short) excludedDoc;
}
assert excludedDoc + 1 == denseBuffer.length() || denseBuffer.nextSetBit(excludedDoc + 1) == -1;
sets[currentBlock] = new NotDocIdSet(BLOCK_SIZE, new ShortArrayDocIdSet(excludedDocs));
} else {
// Neither sparse nor super dense, use a fixed bit set
sets[currentBlock] = denseBuffer;
}
denseBuffer = null;
}
cardinality += currentBlockCardinality;
denseBuffer = null;
currentBlockCardinality = 0;
}
/**
* Add a new doc-id to this builder.
* NOTE: doc ids must be added in order.
*/
public Builder add(int docId) {
if (docId <= lastDocId) {
throw new IllegalArgumentException("Doc ids must be added in-order, got " + docId + " which is <= lastDocID=" + lastDocId);
}
final int block = docId >>> 16;
if (block != currentBlock) {
// we went to a different block, let's flush what we buffered and start from fresh
flush();
currentBlock = block;
}
if (currentBlockCardinality < MAX_ARRAY_LENGTH) {
buffer[currentBlockCardinality] = (short) docId;
} else {
if (denseBuffer == null) {
// the buffer is full, let's move to a fixed bit set
final int numBits = Math.min(1 << 16, maxDoc - (block << 16));
denseBuffer = new FixedBitSet(numBits);
for (short doc : buffer) {
denseBuffer.set(doc & 0xFFFF);
}
}
denseBuffer.set(docId & 0xFFFF);
}
lastDocId = docId;
currentBlockCardinality += 1;
return this;
}
/** Add the content of the provided {@link DocIdSetIterator}. */
public Builder add(DocIdSetIterator disi) throws IOException {
for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
add(doc);
}
return this;
}
/** Build an instance. */
public RoaringDocIdSet build() {
flush();
return new RoaringDocIdSet(sets, cardinality);
}
}
/**
* {@link DocIdSet} implementation that can store documents up to 2^16-1 in a short[].
*/
private static class ShortArrayDocIdSet extends DocIdSet {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(ShortArrayDocIdSet.class);
private final short[] docIDs;
private ShortArrayDocIdSet(short[] docIDs) {
this.docIDs = docIDs;
}
@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(docIDs);
}
@Override
public DocIdSetIterator iterator() throws IOException {
return new DocIdSetIterator() {
int i = -1; // this is the index of the current document in the array
int doc = -1;
private int docId(int i) {
return docIDs[i] & 0xFFFF;
}
@Override
public int nextDoc() throws IOException {
if (++i >= docIDs.length) {
return doc = NO_MORE_DOCS;
}
return doc = docId(i);
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
return docIDs.length;
}
@Override
public int advance(int target) throws IOException {
// binary search
int lo = i + 1;
int hi = docIDs.length - 1;
while (lo <= hi) {
final int mid = (lo + hi) >>> 1;
final int midDoc = docId(mid);
if (midDoc < target) {
lo = mid + 1;
} else {
hi = mid - 1;
}
}
if (lo == docIDs.length) {
i = docIDs.length;
return doc = NO_MORE_DOCS;
} else {
i = lo;
return doc = docId(i);
}
}
};
}
}
private final DocIdSet[] docIdSets;
private final int cardinality;
private final long ramBytesUsed;
private RoaringDocIdSet(DocIdSet[] docIdSets, int cardinality) {
this.docIdSets = docIdSets;
long ramBytesUsed = BASE_RAM_BYTES_USED + RamUsageEstimator.shallowSizeOf(docIdSets);
for (DocIdSet set : this.docIdSets) {
if (set != null) {
ramBytesUsed += set.ramBytesUsed();
}
}
this.ramBytesUsed = ramBytesUsed;
this.cardinality = cardinality;
}
@Override
public boolean isCacheable() {
return true;
}
@Override
public long ramBytesUsed() {
return ramBytesUsed;
}
@Override
public DocIdSetIterator iterator() throws IOException {
if (cardinality == 0) {
return null;
}
return new Iterator();
}
private class Iterator extends DocIdSetIterator {
int block;
DocIdSetIterator sub = null;
int doc;
Iterator() throws IOException {
doc = -1;
block = 0;
while (docIdSets[block] == null) {
block += 1;
}
sub = docIdSets[block].iterator();
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() throws IOException {
if (doc == NO_MORE_DOCS) {
return NO_MORE_DOCS;
}
final int subNext = sub.nextDoc();
if (subNext == NO_MORE_DOCS) {
return firstDocFromNextBlock();
}
return doc = (block << 16) | subNext;
}
@Override
public int advance(int target) throws IOException {
final int targetBlock = target >>> 16;
if (targetBlock != block) {
block = targetBlock;
if (block >= docIdSets.length) {
sub = null;
return doc = NO_MORE_DOCS;
}
if (docIdSets[block] == null) {
return firstDocFromNextBlock();
}
sub = docIdSets[block].iterator();
}
final int subNext = sub.advance(target & 0xFFFF);
if (subNext == NO_MORE_DOCS) {
return firstDocFromNextBlock();
}
return doc = (block << 16) | subNext;
}
private int firstDocFromNextBlock() throws IOException {
while (true) {
block += 1;
if (block >= docIdSets.length) {
sub = null;
return doc = NO_MORE_DOCS;
} else if (docIdSets[block] != null) {
sub = docIdSets[block].iterator();
final int subNext = sub.nextDoc();
assert subNext != NO_MORE_DOCS;
return doc = (block << 16) | subNext;
}
}
}
@Override
public long cost() {
return cardinality;
}
}
/** Return the exact number of documents that are contained in this set. */
public int cardinality() {
return cardinality;
}
}

View File

@ -23,9 +23,9 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
@ -35,7 +35,7 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.WAH8DocIdSet;
import org.apache.lucene.util.RoaringDocIdSet;
public class TestCachingWrapperFilter extends LuceneTestCase {
Directory dir;
@ -241,7 +241,7 @@ public class TestCachingWrapperFilter extends LuceneTestCase {
if (originalSet.isCacheable()) {
assertEquals("Cached DocIdSet must be of same class like uncached, if cacheable", originalSet.getClass(), cachedSet.getClass());
} else {
assertTrue("Cached DocIdSet must be an WAH8DocIdSet if the original one was not cacheable", cachedSet instanceof WAH8DocIdSet || cachedSet == null);
assertTrue("Cached DocIdSet must be a RoaringDocIdSet if the original one was not cacheable", cachedSet instanceof RoaringDocIdSet || cachedSet == null);
}
}
}

View File

@ -0,0 +1,54 @@
package org.apache.lucene.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.BitSet;
import org.apache.lucene.search.DocIdSet;
public class TestNotDocIdSet extends BaseDocIdSetTestCase<NotDocIdSet> {
@Override
public NotDocIdSet copyOf(BitSet bs, int length) throws IOException {
final FixedBitSet set = new FixedBitSet(length);
for (int doc = bs.nextClearBit(0); doc < length; doc = bs.nextClearBit(doc + 1)) {
set.set(doc);
}
return new NotDocIdSet(length, set);
}
@Override
public void assertEquals(int numBits, BitSet ds1, NotDocIdSet ds2)
throws IOException {
super.assertEquals(numBits, ds1, ds2);
final Bits bits2 = ds2.bits();
assertTrue(ds2.isCacheable()); // since we wrapped a FixedBitSet
assertNotNull(bits2); // since we wrapped a FixedBitSet
assertEquals(numBits, bits2.length());
for (int i = 0; i < numBits; ++i) {
assertEquals(ds1.get(i), bits2.get(i));
}
}
public void testBits() throws IOException {
assertNull(new NotDocIdSet(3, DocIdSet.EMPTY).bits());
assertNotNull(new NotDocIdSet(3, new FixedBitSet(3)).bits());
}
}

View File

@ -0,0 +1,41 @@
package org.apache.lucene.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.BitSet;
public class TestRoaringDocIdSet extends BaseDocIdSetTestCase<RoaringDocIdSet> {
@Override
public RoaringDocIdSet copyOf(BitSet bs, int length) throws IOException {
final RoaringDocIdSet.Builder builder = new RoaringDocIdSet.Builder(length);
for (int i = bs.nextSetBit(0); i != -1; i = bs.nextSetBit(i + 1)) {
builder.add(i);
}
return builder.build();
}
@Override
public void assertEquals(int numBits, BitSet ds1, RoaringDocIdSet ds2)
throws IOException {
super.assertEquals(numBits, ds1, ds2);
assertEquals(ds1.cardinality(), ds2.cardinality());
}
}

View File

@ -57,8 +57,8 @@ public abstract class BaseDocIdSetTestCase<T extends DocIdSet> extends LuceneTes
/** Test length=0. */
public void testNoBit() throws IOException {
final BitSet bs = new BitSet(1);
final T copy = copyOf(bs, TestUtil.nextInt(random(), 1, 10000));
assertEquals(0, bs, copy);
final T copy = copyOf(bs, 1);
assertEquals(1, bs, copy);
}
/** Test length=1. */
@ -67,7 +67,7 @@ public abstract class BaseDocIdSetTestCase<T extends DocIdSet> extends LuceneTes
if (random().nextBoolean()) {
bs.set(0);
}
final T copy = copyOf(bs, TestUtil.nextInt(random(), 1, 10000));
final T copy = copyOf(bs, 1);
assertEquals(1, bs, copy);
}
@ -80,7 +80,7 @@ public abstract class BaseDocIdSetTestCase<T extends DocIdSet> extends LuceneTes
if (random().nextBoolean()) {
bs.set(1);
}
final T copy = copyOf(bs, TestUtil.nextInt(random(), 1, 10000));
final T copy = copyOf(bs, 2);
assertEquals(2, bs, copy);
}
@ -88,7 +88,7 @@ public abstract class BaseDocIdSetTestCase<T extends DocIdSet> extends LuceneTes
public void testAgainstBitSet() throws IOException {
final int numBits = TestUtil.nextInt(random(), 100, 1 << 20);
// test various random sets with various load factors
for (float percentSet : new float[] {0f, 0.0001f, random().nextFloat() / 2, 0.9f, 1f}) {
for (float percentSet : new float[] {0f, 0.0001f, random().nextFloat(), 0.9f, 1f}) {
final BitSet set = randomSet(numBits, percentSet);
final T copy = copyOf(set, numBits);
assertEquals(numBits, set, copy);