LUCENE-584: Changed Filter API to return a DocIdSet instead of a java.util.BitSet. This allows using more efficient data structures for Filters and makes them more flexible.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@617859 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Busch 2008-02-02 19:04:03 +00:00
parent 1caf5cb9ce
commit 18b61286fa
40 changed files with 2857 additions and 175 deletions

View File

@ -16,11 +16,15 @@ API Changes
this was accidentally lost with LUCENE-966. (Nicolas Lalevée via
Mike McCandless)
3. LUCENE-584: Changed Filter API to return a DocIdSet instead of a
java.util.BitSet. This allows using more efficient data structures
for Filters and makes them more flexible. (Paul Elschot, Michael Busch)
Bug fixes
New features
1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis
1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis
process. The flag is not indexed/stored and is thus only used by analysis.
2. LUCENE-1147: Add -segment option to CheckIndex tool so you can
@ -29,6 +33,12 @@ New features
3. LUCENE-1045: Reopened this issue to add support for short and bytes.
4. LUCENE-584: Added new data structures to o.a.l.util, such as
OpenBitSet and SortedVIntList. These extend DocIdSet and can
directly be used for Filters with the new Filter API. Also changed
the core Filters to use OpenBitSet instead of java.util.BitSet.
(Paul Elschot, Michael Busch)
Optimizations
1. LUCENE-705: When building a compound file, use

View File

@ -37,8 +37,8 @@ public class ChainedFilterTest extends TestCase {
private Query query;
// private DateFilter dateFilter; DateFilter was deprecated and removed
private RangeFilter dateFilter;
private QueryFilter bobFilter;
private QueryFilter sueFilter;
private QueryWrapperFilter bobFilter;
private QueryWrapperFilter sueFilter;
public void setUp() throws Exception {
directory = new RAMDirectory();
@ -74,9 +74,9 @@ public class ChainedFilterTest extends TestCase {
// just treat dates as strings and select the whole range for now...
dateFilter = new RangeFilter("date","","ZZZZ",true,true);
bobFilter = new QueryFilter(
bobFilter = new QueryWrapperFilter(
new TermQuery(new Term("owner", "bob")));
sueFilter = new QueryFilter(
sueFilter = new QueryWrapperFilter(
new TermQuery(new Term("owner", "sue")));
}

View File

@ -8,7 +8,7 @@ import java.util.Map.Entry;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryFilter;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.FilterBuilder;
import org.apache.lucene.xmlparser.FilterBuilderFactory;
@ -105,7 +105,7 @@ public class CachedFilterBuilder implements FilterBuilder {
//cache miss
if (qb != null)
{
cachedFilter = new QueryFilter(q);
cachedFilter = new QueryWrapperFilter(q);
} else
{
cachedFilter = new CachingWrapperFilter(f);

View File

@ -43,11 +43,19 @@ public class CachingSpanFilter extends SpanFilter {
this.filter = filter;
}
/**
 * Returns the bits of the cached {@link SpanFilterResult} for the given
 * reader, or <code>null</code> when no result is available.
 *
 * @deprecated Use {@link #getDocIdSet(IndexReader)} instead.
 */
public BitSet bits(IndexReader reader) throws IOException {
  final SpanFilterResult cachedResult = getCachedResult(reader);
  if (cachedResult == null) {
    return null;
  }
  return cachedResult.getBits();
}
/**
 * Returns the DocIdSet of the cached {@link SpanFilterResult} for the given
 * reader, or <code>null</code> when no result is available.
 */
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
  final SpanFilterResult cachedResult = getCachedResult(reader);
  if (cachedResult == null) {
    return null;
  }
  return cachedResult.getDocIdSet();
}
private SpanFilterResult getCachedResult(IndexReader reader) throws IOException {
SpanFilterResult result = null;
if (cache == null) {

View File

@ -43,6 +43,9 @@ public class CachingWrapperFilter extends Filter {
this.filter = filter;
}
/**
* @deprecated Use {@link #getDocIdSet(IndexReader)} instead.
*/
public BitSet bits(IndexReader reader) throws IOException {
if (cache == null) {
cache = new WeakHashMap();
@ -64,6 +67,28 @@ public class CachingWrapperFilter extends Filter {
return bits;
}
/**
 * Returns the DocIdSet produced by the wrapped filter for the given reader,
 * caching the result per reader.
 * <p>
 * NOTE(review): the cache map appears to be shared with the deprecated
 * {@link #bits(IndexReader)}, which caches java.util.BitSet values. A cached
 * entry is therefore checked for its type before use: a BitSet entry is
 * adapted through DocIdBitSet instead of being cast (which would throw a
 * ClassCastException).
 *
 * @param reader the index reader the filter is evaluated against
 * @return the cached or freshly computed DocIdSet of the wrapped filter
 * @throws IOException if the wrapped filter fails to read the index
 */
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
  if (cache == null) {
    cache = new WeakHashMap();
  }

  synchronized (cache) {  // check cache
    final Object cached = cache.get(reader);
    if (cached != null) {
      if (cached instanceof DocIdSet) {
        return (DocIdSet) cached;
      }
      // Entry was stored by the deprecated bits(): adapt it rather than cast it.
      return new org.apache.lucene.util.DocIdBitSet((BitSet) cached);
    }
  }

  // Computed outside the lock so a slow filter does not block other readers.
  final DocIdSet docIdSet = filter.getDocIdSet(reader);

  synchronized (cache) {  // update cache
    cache.put(reader, docIdSet);
  }

  return docIdSet;
}
public String toString() {
return "CachingWrapperFilter("+filter+")";
}

View File

@ -85,7 +85,7 @@ public class ConstantScoreQuery extends Query {
public Explanation explain(IndexReader reader, int doc) throws IOException {
ConstantScorer cs = (ConstantScorer)scorer(reader);
boolean exists = cs.bits.get(doc);
boolean exists = cs.docIdSetIterator.skipTo(doc) && (cs.docIdSetIterator.doc() == doc);
ComplexExplanation result = new ComplexExplanation();
@ -107,23 +107,22 @@ public class ConstantScoreQuery extends Query {
}
protected class ConstantScorer extends Scorer {
final BitSet bits;
final DocIdSetIterator docIdSetIterator;
final float theScore;
int doc=-1;
public ConstantScorer(Similarity similarity, IndexReader reader, Weight w) throws IOException {
super(similarity);
theScore = w.getValue();
bits = filter.bits(reader);
docIdSetIterator = filter.getDocIdSet(reader).iterator();
}
public boolean next() throws IOException {
doc = bits.nextSetBit(doc+1);
return doc >= 0;
return docIdSetIterator.next();
}
public int doc() {
return doc;
return docIdSetIterator.doc();
}
public float score() throws IOException {
@ -131,8 +130,7 @@ public class ConstantScoreQuery extends Query {
}
public boolean skipTo(int target) throws IOException {
doc = bits.nextSetBit(target); // requires JDK 1.4
return doc >= 0;
return docIdSetIterator.skipTo(target);
}
public Explanation explain(int doc) throws IOException {
@ -170,3 +168,4 @@ public class ConstantScoreQuery extends Query {
}

View File

@ -0,0 +1,27 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * A DocIdSet contains a set of doc ids. Implementing classes must provide
 * a {@link DocIdSetIterator} to access the set.
 */
public abstract class DocIdSet {
/** Returns a {@link DocIdSetIterator} that gives access to the doc ids of this set. */
public abstract DocIdSetIterator iterator();
}

View File

@ -0,0 +1,49 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
/**
 * This abstract class defines methods to iterate over a set of
 * non-decreasing doc ids.
 */
public abstract class DocIdSetIterator {
/** Returns the current document number.
 * <p>This is invalid until {@link #next()} is called for the first time
 * (or {@link #skipTo(int)}, which is specified to behave as if it called
 * {@link #next()}). */
public abstract int doc();
/** Moves to the next docId in the set.
 * @return true iff there is such a docId.
 * @throws IOException declared for implementations that read the index. */
public abstract boolean next() throws IOException;
/** Skips entries to the first one beyond the current entry whose document
 * number is greater than or equal to <i>target</i>.
 * <p>Returns true iff there is such an entry.
 * <p>Behaves as if written: <pre>
 * boolean skipTo(int target) {
 *   do {
 *     if (!next())
 *       return false;
 *   } while (target > doc());
 *   return true;
 * }
 * </pre>
 * Note that, as written, this always advances by at least one entry.
 * Some implementations are considerably more efficient than that.
 * @param target the lowest document number that is acceptable.
 * @return true iff there is such an entry.
 */
public abstract boolean skipTo(int target) throws IOException;
}

View File

@ -20,11 +20,32 @@ package org.apache.lucene.search;
import java.util.BitSet;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.DocIdBitSet;
/** Abstract base class providing a mechanism to restrict searches to a subset
of an index. */
/** Abstract base class providing a mechanism to use a subset of an index
* for restriction or permission of index search results.
* <p>
* <b>Note:</b> In Lucene 3.0 {@link #bits(IndexReader)} will be removed
* and {@link #getDocIdSet(IndexReader)} will be defined as abstract.
* All implementing classes must therefore implement {@link #getDocIdSet(IndexReader)}
* in order to work with Lucene 3.0.
*/
public abstract class Filter implements java.io.Serializable {
/** Returns a BitSet with true for documents which should be permitted in
search results, and false for those that should not. */
public abstract BitSet bits(IndexReader reader) throws IOException;
/**
 * Legacy BitSet-based entry point. This default implementation returns
 * <code>null</code>; it is not abstract, so subclasses need not override it.
 *
 * @return A BitSet with true for documents which should be permitted in
 * search results, and false for those that should not.
 * @deprecated Use {@link #getDocIdSet(IndexReader)} instead.
 */
public BitSet bits(IndexReader reader) throws IOException {
return null;
}
/**
 * Default implementation: obtains the legacy BitSet from
 * {@link #bits(IndexReader)} and wraps it in a {@link DocIdBitSet}.
 *
 * @return a DocIdSet that provides the documents which should be
 * permitted or prohibited in search results.
 * @see DocIdBitSet
 */
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
  final BitSet legacyBits = bits(reader);
  return new DocIdBitSet(legacyBits);
}
}

View File

@ -21,7 +21,6 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
import java.util.BitSet;
import java.util.Set;
@ -47,7 +46,7 @@ extends Query {
/**
* Constructs a new query which applies a filter to the results of the original query.
* Filter.bits() will be called every time this query is used in a search.
* Filter.getDocIdSet() will be called every time this query is used in a search.
* @param query Query to be filtered, cannot be <code>null</code>.
* @param filter Filter to apply to query results, cannot be <code>null</code>.
*/
@ -86,64 +85,65 @@ extends Query {
inner.addDetail(preBoost);
}
Filter f = FilteredQuery.this.filter;
BitSet matches = f.bits(ir);
if (matches.get(i))
DocIdSetIterator docIdSetIterator = f.getDocIdSet(ir).iterator();
if (docIdSetIterator.skipTo(i) && (docIdSetIterator.doc() == i)) {
return inner;
} else {
Explanation result = new Explanation
(0.0f, "failure to match filter: " + f.toString());
result.addDetail(inner);
return result;
}
}
// return this query
public Query getQuery() { return FilteredQuery.this; }
// return a filtering scorer
public Scorer scorer (IndexReader indexReader) throws IOException {
final Scorer scorer = weight.scorer (indexReader);
final BitSet bitset = filter.bits (indexReader);
return new Scorer (similarity) {
final Scorer scorer = weight.scorer(indexReader);
final DocIdSetIterator docIdSetIterator = filter.getDocIdSet(indexReader).iterator();
public boolean next() throws IOException {
do {
if (! scorer.next()) {
return new Scorer(similarity) {
private boolean advanceToCommon() throws IOException {
while (scorer.doc() != docIdSetIterator.doc()) {
if (scorer.doc() < docIdSetIterator.doc()) {
if (!scorer.skipTo(docIdSetIterator.doc())) {
return false;
}
} while (! bitset.get(scorer.doc()));
/* When skipTo() is allowed on scorer it should be used here
* in combination with bitset.nextSetBit(...)
* See the while loop in skipTo() below.
*/
} else if (!docIdSetIterator.skipTo(scorer.doc())) {
return false;
}
}
return true;
}
public boolean next() throws IOException {
return docIdSetIterator.next() && scorer.next() && advanceToCommon();
}
public int doc() { return scorer.doc(); }
public boolean skipTo(int i) throws IOException {
if (! scorer.skipTo(i)) {
return false;
}
while (! bitset.get(scorer.doc())) {
int nextFiltered = bitset.nextSetBit(scorer.doc() + 1);
if (nextFiltered == -1) {
return false;
} else if (! scorer.skipTo(nextFiltered)) {
return false;
}
}
return true;
return docIdSetIterator.skipTo(i)
&& scorer.skipTo(docIdSetIterator.doc())
&& advanceToCommon();
}
public float score() throws IOException { return getBoost() * scorer.score(); }
// add an explanation about whether the document was filtered
public Explanation explain (int i) throws IOException {
Explanation exp = scorer.explain (i);
exp.setValue(getBoost() * exp.getValue());
Explanation exp = scorer.explain(i);
if (bitset.get(i))
if (docIdSetIterator.skipTo(i) && (docIdSetIterator.doc() == i)) {
exp.setDescription ("allowed by filter: "+exp.getDescription());
else
exp.setValue(getBoost() * exp.getValue());
} else {
exp.setDescription ("removed by filter: "+exp.getDescription());
exp.setValue(0.0f);
}
return exp;
}
};

View File

@ -128,22 +128,33 @@ public class IndexSearcher extends Searcher {
// inherit javadoc
public void search(Weight weight, Filter filter,
final HitCollector results) throws IOException {
HitCollector collector = results;
if (filter != null) {
final BitSet bits = filter.bits(reader);
collector = new HitCollector() {
public final void collect(int doc, float score) {
if (bits.get(doc)) { // skip docs not in bits
results.collect(doc, score);
}
}
};
}
Scorer scorer = weight.scorer(reader);
if (scorer == null)
return;
scorer.score(collector);
if (filter == null) {
scorer.score(results);
return;
}
DocIdSetIterator docIdSetIterator = filter.getDocIdSet(reader).iterator(); // CHECKME: use ConjunctionScorer here?
boolean more = docIdSetIterator.next();
while (more) {
int filterDocId = docIdSetIterator.doc();
if (! scorer.skipTo(filterDocId)) {
more = false;
} else {
int scorerDocId = scorer.doc();
if (scorerDocId == filterDocId) { // permitted by filter
results.collect(scorerDocId, scorer.score());
more = docIdSetIterator.skipTo(scorerDocId + 1);
} else {
more = docIdSetIterator.skipTo(scorerDocId);
}
}
}
}
public Query rewrite(Query original) throws IOException {

View File

@ -18,6 +18,7 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermEnum;
@ -39,6 +40,9 @@ public class PrefixFilter extends Filter {
public Term getPrefix() { return prefix; }
/**
* @deprecated Use {@link #getDocIdSet(IndexReader)} instead.
*/
public BitSet bits(IndexReader reader) throws IOException {
final BitSet bitSet = new BitSet(reader.maxDoc());
new PrefixGenerator(prefix) {
@ -49,6 +53,16 @@ public class PrefixFilter extends Filter {
return bitSet;
}
/**
 * Builds an OpenBitSet sized to <code>reader.maxDoc()</code> and sets one bit
 * for every document the PrefixGenerator reports for this filter's prefix.
 */
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
  final OpenBitSet matches = new OpenBitSet(reader.maxDoc());
  final PrefixGenerator generator = new PrefixGenerator(prefix) {
    public void handleDoc(int docId) {
      matches.set(docId);
    }
  };
  generator.generate(reader);
  return matches;
}
/** Prints a user-readable version of this query. */
public String toString () {
StringBuffer buffer = new StringBuffer();
@ -105,3 +119,4 @@ abstract class PrefixGenerator implements IdGenerator {
}

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.util.BitSet;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.OpenBitSet;
/**
* Constrains search results to only match those which also match a provided
@ -44,6 +45,9 @@ public class QueryWrapperFilter extends Filter {
this.query = query;
}
/**
* @deprecated Use {@link #getDocIdSet(IndexReader)} instead.
*/
public BitSet bits(IndexReader reader) throws IOException {
final BitSet bits = new BitSet(reader.maxDoc());
@ -55,6 +59,17 @@ public class QueryWrapperFilter extends Filter {
return bits;
}
/**
 * Runs the wrapped query against the given reader and returns an OpenBitSet
 * with one bit set for every document that was collected as a hit.
 */
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
  final OpenBitSet hits = new OpenBitSet(reader.maxDoc());
  final IndexSearcher searcher = new IndexSearcher(reader);
  final HitCollector hitMarker = new HitCollector() {
    public final void collect(int docId, float score) {
      hits.set(docId);  // set bit for hit
    }
  };
  searcher.search(query, hitMarker);
  return hits;
}
public String toString() {
return "QueryWrapperFilter(" + query + ")";
}

View File

@ -21,6 +21,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.util.OpenBitSet;
import java.io.IOException;
import java.util.BitSet;
@ -94,6 +95,7 @@ public class RangeFilter extends Filter {
* Returns a BitSet with true for documents which should be
* permitted in search results, and false for those that should
* not.
* @deprecated Use {@link #getDocIdSet(IndexReader)} instead.
*/
public BitSet bits(IndexReader reader) throws IOException {
BitSet bits = new BitSet(reader.maxDoc());
@ -152,6 +154,68 @@ public class RangeFilter extends Filter {
return bits;
}
/**
 * Returns a DocIdSet with documents that should be
 * permitted in search results.
 * <p>
 * Walks the terms of the field starting at the lower term (or at the empty
 * string when there is no lower term) and sets a bit for every document of
 * each term inside the range, stopping at the first term past the upper
 * bound or outside the field.
 */
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
  final OpenBitSet permitted = new OpenBitSet(reader.maxDoc());
  final String startText = (lowerTerm == null) ? "" : lowerTerm;
  final TermEnum termEnum = reader.terms(new Term(fieldName, startText));

  try {
    if (termEnum.term() == null) {
      return permitted;
    }

    // Only an exclusive lower bound requires comparing against lowerTerm.
    boolean lowerBoundPending = !includeLower;

    final TermDocs postings = reader.termDocs();
    try {
      do {
        final Term current = termEnum.term();
        if (current == null || !current.field().equals(fieldName)) {
          break;  // ran off the end of this field's terms
        }

        if (lowerBoundPending
            && lowerTerm != null
            && current.text().compareTo(lowerTerm) <= 0) {
          continue;  // still at or below the exclusive lower bound
        }
        lowerBoundPending = false;

        if (upperTerm != null) {
          final int cmp = upperTerm.compareTo(current.text());
          /* stop when beyond the upper term, or when the bound is
           * exclusive and this term equals the upper term */
          if (cmp < 0 || (cmp == 0 && !includeUpper)) {
            break;
          }
        }

        /* we have a good term, find the docs */
        postings.seek(termEnum.term());
        while (postings.next()) {
          permitted.set(postings.doc());
        }
      } while (termEnum.next());
    } finally {
      postings.close();
    }
  } finally {
    termEnum.close();
  }

  return permitted;
}
public String toString() {
StringBuffer buffer = new StringBuffer();
buffer.append(fieldName);

View File

@ -50,9 +50,21 @@ public class RemoteCachingWrapperFilter extends Filter {
* searcher side of a remote connection.
* @param reader the index reader for the Filter
* @return the bitset
* @deprecated Use {@link #getDocIdSet(IndexReader)} instead.
*/
public BitSet bits(IndexReader reader) throws IOException {
Filter cachedFilter = FilterManager.getInstance().getFilter(filter);
return cachedFilter.bits(reader);
}
/**
 * Looks up the cached version of the wrapped filter in the
 * {@link FilterManager}, which keeps the filter cache on the searcher side
 * of a remote connection, and delegates to it.
 * @param reader the index reader for the Filter
 * @return the DocIdSet
 */
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
  final FilterManager manager = FilterManager.getInstance();
  return manager.getFilter(filter).getDocIdSet(reader);
}
}

View File

@ -33,7 +33,7 @@ import java.io.IOException;
* </p>
* @see BooleanQuery#setAllowDocsOutOfOrder
*/
public abstract class Scorer {
public abstract class Scorer extends DocIdSetIterator {
private Similarity similarity;
/** Constructs a Scorer.
@ -76,65 +76,12 @@ public abstract class Scorer {
return true;
}
/**
* Advances to the document matching this Scorer with the lowest doc Id
* greater than the current value of {@link #doc()} (or to the matching
* document with the lowest doc Id if next has never been called on
* this Scorer).
*
* <p>
* When this method is used the {@link #explain(int)} method should not
* be used.
* </p>
*
* @return true iff there is another document matching the query.
* @see BooleanQuery#setAllowDocsOutOfOrder
*/
public abstract boolean next() throws IOException;
/** Returns the current document number matching the query.
* Initially invalid, until {@link #next()} is called the first time.
*/
public abstract int doc();
/** Returns the score of the current document matching the query.
* Initially invalid, until {@link #next()} or {@link #skipTo(int)}
* is called the first time.
*/
public abstract float score() throws IOException;
/**
* Skips to the document matching this Scorer with the lowest doc Id
* greater than or equal to a given target.
*
* <p>
* The behavior of this method is undefined if the target specified is
* less than or equal to the current value of {@link #doc()}.
* <p>
* Behaves as if written:
* <pre>
* boolean skipTo(int target) {
* do {
* if (!next())
* return false;
* } while (target > doc());
* return true;
* }
* </pre>
* Most implementations are considerably more efficient than that.
* </p>
*
* <p>
* When this method is used the {@link #explain(int)} method should not
* be used.
* </p>
*
* @param target The target document number.
* @return true iff there is such a match.
* @see BooleanQuery#setAllowDocsOutOfOrder
*/
public abstract boolean skipTo(int target) throws IOException;
/** Returns an explanation of the score for a document.
* <br>When this method is used, the {@link #next()}, {@link #skipTo(int)} and
* {@link #score(HitCollector)} methods should not be used.

View File

@ -48,7 +48,7 @@ public interface Searchable extends java.rmi.Remote {
* non-high-scoring hits.
*
* @param weight to match documents
* @param filter if non-null, a bitset used to eliminate some documents
* @param filter if non-null, used to permit documents to be collected.
* @param results to receive hits
* @throws BooleanQuery.TooManyClauses
*/

View File

@ -109,7 +109,7 @@ public abstract class Searcher implements Searchable {
* non-high-scoring hits.
*
* @param query to match documents
* @param filter if non-null, a bitset used to eliminate some documents
* @param filter if non-null, used to permit documents to be collected.
* @param results to receive hits
* @throws BooleanQuery.TooManyClauses
*/

View File

@ -30,7 +30,7 @@ import java.io.IOException;
public abstract class SpanFilter extends Filter{
/** Returns a SpanFilterResult with true for documents which should be permitted in
search results, and false for those that should not and Spans for where the true docs match.
* @param reader The {@link org.apache.lucene.index.IndexReader} to load position and bitset information from
* @param reader The {@link org.apache.lucene.index.IndexReader} to load position and DocIdSet information from
* @return A {@link SpanFilterResult}
* @throws java.io.IOException if there was an issue accessing the necessary information
* */

View File

@ -28,19 +28,33 @@ import java.util.List;
*
**/
public class SpanFilterResult {
/** @deprecated */
private BitSet bits;
private DocIdSet docIdSet;
private List positions;//Spans spans;
/**
*
* @param bits The bits for the Filter
* @param positions A List of {@link org.apache.lucene.search.SpanFilterResult.PositionInfo} objects
* @deprecated Use {@link #SpanFilterResult(DocIdSet, List)} instead
*/
public SpanFilterResult(BitSet bits, List positions) {
this.bits = bits;
this.positions = positions;
}
/**
*
* @param docIdSet The DocIdSet for the Filter
* @param positions A List of {@link org.apache.lucene.search.SpanFilterResult.PositionInfo} objects
*/
public SpanFilterResult(DocIdSet docIdSet, List positions) {
this.docIdSet = docIdSet;
this.positions = positions;
}
/**
* The first entry in the array corresponds to the first "on" bit.
* Entries are increasing by document order
@ -50,11 +64,17 @@ public class SpanFilterResult {
return positions;
}
/**
* @deprecated Use {@link #getDocIdSet()}
*/
public BitSet getBits() {
return bits;
}
/** Returns the docIdSet */
public DocIdSet getDocIdSet() {
return docIdSet;
}
public static class PositionInfo {
private int doc;
@ -115,3 +135,4 @@ public class SpanFilterResult {
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.OpenBitSet;
import java.io.IOException;
import java.util.ArrayList;
@ -54,15 +55,14 @@ public class SpanQueryFilter extends SpanFilter {
this.query = query;
}
public BitSet bits(IndexReader reader) throws IOException {
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
SpanFilterResult result = bitSpans(reader);
return result.getBits();
return result.getDocIdSet();
}
public SpanFilterResult bitSpans(IndexReader reader) throws IOException {
final BitSet bits = new BitSet(reader.maxDoc());
final OpenBitSet bits = new OpenBitSet(reader.maxDoc());
Spans spans = query.getSpans(reader);
List tmp = new ArrayList(20);
int currentDoc = -1;

View File

@ -0,0 +1,799 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util; // from org.apache.solr.util rev 555343
/** A variety of high efficiency bit twiddling routines.
*
* @version $Id$
*/
public class BitUtil {
/** Returns the number of bits set in the long */
public static int pop(long x) {
  /* 64 bit adaptation of the Hacker's Delight 32 bit pop function
   * (http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.cc):
   * sum adjacent bits into 2 bit fields, then 4 bit fields, then bytes,
   * and finally fold all bytes into the lowest one.
   */
  final long pairSums = x - ((x >>> 1) & 0x5555555555555555L);
  final long nibbleSums = (pairSums & 0x3333333333333333L)
                        + ((pairSums >>> 2) & 0x3333333333333333L);
  long byteSums = (nibbleSums + (nibbleSums >>> 4)) & 0x0F0F0F0F0F0F0F0FL;
  byteSums += byteSums >>> 8;
  byteSums += byteSums >>> 16;
  byteSums += byteSums >>> 32;
  // the count is at most 64, so the low 7 bits hold it completely
  return (int) (byteSums & 0x7FL);
}
/** Returns the number of set bits in an array of longs.
 * <p>Counts the words <code>A[wordOffset]</code> up to, but not including,
 * <code>A[wordOffset+numWords]</code>.
 * @param A the words to count; not modified
 * @param wordOffset index of the first word to count
 * @param numWords number of words to count
 * @return the total number of set bits in the counted words
 */
public static long pop_array(long A[], int wordOffset, int numWords) {
/*
* Robert Harley and David Seal's bit counting algorithm, as documented
* in the revisions of Hacker's Delight
* http://www.hackersdelight.org/revisions.pdf
* http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.cc
*
* This function was adapted to Java, and extended to use 64 bit words.
* if only we had access to wider registers like SSE from java...
*
* This function can be transformed to compute the popcount of other functions
* on bitsets via something like this:
* sed 's/A\[\([^]]*\)\]/\(A[\1] \& B[\1]\)/g'
*
*/
int n = wordOffset+numWords;
// tot counts bits of weight 1 directly; tot8 counts groups of eight (scaled by 8 at the end)
long tot=0, tot8=0;
// carry-save accumulators: each set bit in ones/twos/fours stands for 1/2/4 counted bits
long ones=0, twos=0, fours=0;
int i;
// main loop: fold eight words per iteration through a tree of carry-save adders
for (i = wordOffset; i <= n - 8; i+=8) {
/*** C macro from Hacker's Delight
#define CSA(h,l, a,b,c) \
{unsigned u = a ^ b; unsigned v = c; \
h = (a & b) | (u & v); l = u ^ v;}
***/
long twosA,twosB,foursA,foursB,eights;
// CSA(twosA, ones, ones, A[i], A[i+1])
{
long b=A[i], c=A[i+1];
long u=ones ^ b;
twosA=(ones & b)|( u & c);
ones=u^c;
}
// CSA(twosB, ones, ones, A[i+2], A[i+3])
{
long b=A[i+2], c=A[i+3];
long u=ones^b;
twosB =(ones&b)|(u&c);
ones=u^c;
}
//CSA(foursA, twos, twos, twosA, twosB)
{
long u=twos^twosA;
foursA=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
//CSA(twosA, ones, ones, A[i+4], A[i+5])
{
long b=A[i+4], c=A[i+5];
long u=ones^b;
twosA=(ones&b)|(u&c);
ones=u^c;
}
// CSA(twosB, ones, ones, A[i+6], A[i+7])
{
long b=A[i+6], c=A[i+7];
long u=ones^b;
twosB=(ones&b)|(u&c);
ones=u^c;
}
//CSA(foursB, twos, twos, twosA, twosB)
{
long u=twos^twosA;
foursB=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
//CSA(eights, fours, fours, foursA, foursB)
{
long u=fours^foursA;
eights=(fours&foursA)|(u&foursB);
fours=u^foursB;
}
// only the carries of weight eight are popcounted inside the loop
tot8 += pop(eights);
}
// handle trailing words in a binary-search manner...
// derived from the loop above by setting specific elements to 0.
// the original method in Hackers Delight used a simple for loop:
// for (i = i; i < n; i++) // Add in the last elements
// tot = tot + pop(A[i]);
// at least four words left: same adder tree with the second half (foursB) taken as 0
if (i<=n-4) {
long twosA, twosB, foursA, eights;
{
long b=A[i], c=A[i+1];
long u=ones ^ b;
twosA=(ones & b)|( u & c);
ones=u^c;
}
{
long b=A[i+2], c=A[i+3];
long u=ones^b;
twosB =(ones&b)|(u&c);
ones=u^c;
}
{
long u=twos^twosA;
foursA=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
eights=fours&foursA;
fours=fours^foursA;
tot8 += pop(eights);
i+=4;
}
// at least two words left: adder tree with twosB and foursB taken as 0
if (i<=n-2) {
long b=A[i], c=A[i+1];
long u=ones ^ b;
long twosA=(ones & b)|( u & c);
ones=u^c;
long foursA=twos&twosA;
twos=twos^twosA;
long eights=fours&foursA;
fours=fours^foursA;
tot8 += pop(eights);
i+=2;
}
// a single word left: count it directly
if (i<n) {
tot += pop(A[i]);
}
// weight each accumulator by what one of its bits stands for (4, 2, 1 and 8)
tot += (pop(fours)<<2)
+ (pop(twos)<<1)
+ pop(ones)
+ (tot8<<3);
return tot;
}
/** Returns the popcount or cardinality of the two sets after an intersection.
* Neither array is modified.
* <p>Counts the set bits of <code>A[k] &amp; B[k]</code> for every word index
* k from <code>wordOffset</code> up to, but not including,
* <code>wordOffset+numWords</code>.
* @param A words of the first set
* @param B words of the second set
* @param wordOffset index of the first word to count
* @param numWords number of words to count
* @return the number of bits set in both A and B within the counted words
*/
public static long pop_intersect(long A[], long B[], int wordOffset, int numWords) {
// generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \& B[\1]\)/g'
int n = wordOffset+numWords;
// tot counts bits of weight 1 directly; tot8 counts groups of eight (scaled by 8 at the end)
long tot=0, tot8=0;
// carry-save accumulators: each set bit in ones/twos/fours stands for 1/2/4 counted bits
long ones=0, twos=0, fours=0;
int i;
// main loop: fold eight intersected words per iteration through a tree of carry-save adders
for (i = wordOffset; i <= n - 8; i+=8) {
long twosA,twosB,foursA,foursB,eights;
// CSA(twosA, ones, ones, (A[i] & B[i]), (A[i+1] & B[i+1]))
{
long b=(A[i] & B[i]), c=(A[i+1] & B[i+1]);
long u=ones ^ b;
twosA=(ones & b)|( u & c);
ones=u^c;
}
// CSA(twosB, ones, ones, (A[i+2] & B[i+2]), (A[i+3] & B[i+3]))
{
long b=(A[i+2] & B[i+2]), c=(A[i+3] & B[i+3]);
long u=ones^b;
twosB =(ones&b)|(u&c);
ones=u^c;
}
//CSA(foursA, twos, twos, twosA, twosB)
{
long u=twos^twosA;
foursA=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
//CSA(twosA, ones, ones, (A[i+4] & B[i+4]), (A[i+5] & B[i+5]))
{
long b=(A[i+4] & B[i+4]), c=(A[i+5] & B[i+5]);
long u=ones^b;
twosA=(ones&b)|(u&c);
ones=u^c;
}
// CSA(twosB, ones, ones, (A[i+6] & B[i+6]), (A[i+7] & B[i+7]))
{
long b=(A[i+6] & B[i+6]), c=(A[i+7] & B[i+7]);
long u=ones^b;
twosB=(ones&b)|(u&c);
ones=u^c;
}
//CSA(foursB, twos, twos, twosA, twosB)
{
long u=twos^twosA;
foursB=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
//CSA(eights, fours, fours, foursA, foursB)
{
long u=fours^foursA;
eights=(fours&foursA)|(u&foursB);
fours=u^foursB;
}
// only the carries of weight eight are popcounted inside the loop
tot8 += pop(eights);
}
// at least four words left: same adder tree with the second half (foursB) taken as 0
if (i<=n-4) {
long twosA, twosB, foursA, eights;
{
long b=(A[i] & B[i]), c=(A[i+1] & B[i+1]);
long u=ones ^ b;
twosA=(ones & b)|( u & c);
ones=u^c;
}
{
long b=(A[i+2] & B[i+2]), c=(A[i+3] & B[i+3]);
long u=ones^b;
twosB =(ones&b)|(u&c);
ones=u^c;
}
{
long u=twos^twosA;
foursA=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
eights=fours&foursA;
fours=fours^foursA;
tot8 += pop(eights);
i+=4;
}
// at least two words left: adder tree with twosB and foursB taken as 0
if (i<=n-2) {
long b=(A[i] & B[i]), c=(A[i+1] & B[i+1]);
long u=ones ^ b;
long twosA=(ones & b)|( u & c);
ones=u^c;
long foursA=twos&twosA;
twos=twos^twosA;
long eights=fours&foursA;
fours=fours^foursA;
tot8 += pop(eights);
i+=2;
}
// a single word left: count its intersection directly
if (i<n) {
tot += pop((A[i] & B[i]));
}
// weight each accumulator by what one of its bits stands for (4, 2, 1 and 8)
tot += (pop(fours)<<2)
+ (pop(twos)<<1)
+ pop(ones)
+ (tot8<<3);
return tot;
}
/** Returns the popcount or cardinality of the union of two sets.
 * Neither array is modified.
 * <p>
 * Only the words at indexes <code>wordOffset</code> (inclusive) to
 * <code>wordOffset+numWords</code> (exclusive) are read, from both arrays.
 *
 * @param A first array of 64 bit words
 * @param B second array of 64 bit words, read at the same indexes as A
 * @param wordOffset index of the first word to process
 * @param numWords number of consecutive words to process
 * @return the accumulated bit count of <code>A[i] | B[i]</code> over that range
 */
public static long pop_union(long A[], long B[], int wordOffset, int numWords) {
// generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \| B[\1]\)/g'
int n = wordOffset+numWords;
// ones/twos/fours are carry-save accumulators: a set bit in them is worth
// 1, 2 or 4 in the final sum. tot8 collects pop() of every "eights" carry and
// is scaled by 8 at the end; tot receives the single leftover word directly.
// NOTE(review): pop(long) is defined outside this block; presumably the
// population count of one word -- confirm.
long tot=0, tot8=0;
long ones=0, twos=0, fours=0;
int i;
// main loop: eight words per iteration, only one pop() call per iteration
for (i = wordOffset; i <= n - 8; i+=8) {
/*** C macro from Hacker's Delight
#define CSA(h,l, a,b,c) \
{unsigned u = a ^ b; unsigned v = c; \
h = (a & b) | (u & v); l = u ^ v;}
***/
long twosA,twosB,foursA,foursB,eights;
// CSA(twosA, ones, ones, (A[i] | B[i]), (A[i+1] | B[i+1]))
{
long b=(A[i] | B[i]), c=(A[i+1] | B[i+1]);
long u=ones ^ b;
twosA=(ones & b)|( u & c);
ones=u^c;
}
// CSA(twosB, ones, ones, (A[i+2] | B[i+2]), (A[i+3] | B[i+3]))
{
long b=(A[i+2] | B[i+2]), c=(A[i+3] | B[i+3]);
long u=ones^b;
twosB =(ones&b)|(u&c);
ones=u^c;
}
//CSA(foursA, twos, twos, twosA, twosB)
{
long u=twos^twosA;
foursA=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
//CSA(twosA, ones, ones, (A[i+4] | B[i+4]), (A[i+5] | B[i+5]))
{
long b=(A[i+4] | B[i+4]), c=(A[i+5] | B[i+5]);
long u=ones^b;
twosA=(ones&b)|(u&c);
ones=u^c;
}
// CSA(twosB, ones, ones, (A[i+6] | B[i+6]), (A[i+7] | B[i+7]))
{
long b=(A[i+6] | B[i+6]), c=(A[i+7] | B[i+7]);
long u=ones^b;
twosB=(ones&b)|(u&c);
ones=u^c;
}
//CSA(foursB, twos, twos, twosA, twosB)
{
long u=twos^twosA;
foursB=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
//CSA(eights, fours, fours, foursA, foursB)
{
long u=fours^foursA;
eights=(fours&foursA)|(u&foursB);
fours=u^foursB;
}
tot8 += pop(eights);
}
// at least four words left: same reduction as the first half of the main loop
if (i<=n-4) {
long twosA, twosB, foursA, eights;
// CSA(twosA, ones, ones, (A[i] | B[i]), (A[i+1] | B[i+1]))
{
long b=(A[i] | B[i]), c=(A[i+1] | B[i+1]);
long u=ones ^ b;
twosA=(ones & b)|( u & c);
ones=u^c;
}
// CSA(twosB, ones, ones, (A[i+2] | B[i+2]), (A[i+3] | B[i+3]))
{
long b=(A[i+2] | B[i+2]), c=(A[i+3] | B[i+3]);
long u=ones^b;
twosB =(ones&b)|(u&c);
ones=u^c;
}
// CSA(foursA, twos, twos, twosA, twosB)
{
long u=twos^twosA;
foursA=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
// CSA(eights, fours, fours, foursA, 0): with a zero third input the
// carry is a plain AND and the sum a plain XOR
eights=fours&foursA;
fours=fours^foursA;
tot8 += pop(eights);
i+=4;
}
// at least two words left
if (i<=n-2) {
// CSA(twosA, ones, ones, (A[i] | B[i]), (A[i+1] | B[i+1]))
long b=(A[i] | B[i]), c=(A[i+1] | B[i+1]);
long u=ones ^ b;
long twosA=(ones & b)|( u & c);
ones=u^c;
// propagate the carry upwards with zero third inputs (AND = carry, XOR = sum)
long foursA=twos&twosA;
twos=twos^twosA;
long eights=fours&foursA;
fours=fours^foursA;
tot8 += pop(eights);
i+=2;
}
// a single leftover word is counted directly
if (i<n) {
tot += pop((A[i] | B[i]));
}
// weight each accumulator by the value of its bits: fours*4 + twos*2 + ones + eights*8
tot += (pop(fours)<<2)
+ (pop(twos)<<1)
+ pop(ones)
+ (tot8<<3);
return tot;
}
/** Returns the popcount or cardinality of A & ~B
 * Neither array is modified.
 * <p>
 * Only the words at indexes <code>wordOffset</code> (inclusive) to
 * <code>wordOffset+numWords</code> (exclusive) are read, from both arrays.
 *
 * @param A array of 64 bit words whose bits are kept
 * @param B array of 64 bit words whose set bits are masked out of A, read at the same indexes as A
 * @param wordOffset index of the first word to process
 * @param numWords number of consecutive words to process
 * @return the accumulated bit count of <code>A[i] & ~B[i]</code> over that range
 */
public static long pop_andnot(long A[], long B[], int wordOffset, int numWords) {
// generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \& ~B[\1]\)/g'
int n = wordOffset+numWords;
// ones/twos/fours are carry-save accumulators: a set bit in them is worth
// 1, 2 or 4 in the final sum. tot8 collects pop() of every "eights" carry and
// is scaled by 8 at the end; tot receives the single leftover word directly.
// NOTE(review): pop(long) is defined outside this block; presumably the
// population count of one word -- confirm.
long tot=0, tot8=0;
long ones=0, twos=0, fours=0;
int i;
// main loop: eight words per iteration, only one pop() call per iteration
for (i = wordOffset; i <= n - 8; i+=8) {
/*** C macro from Hacker's Delight
#define CSA(h,l, a,b,c) \
{unsigned u = a ^ b; unsigned v = c; \
h = (a & b) | (u & v); l = u ^ v;}
***/
long twosA,twosB,foursA,foursB,eights;
// CSA(twosA, ones, ones, (A[i] & ~B[i]), (A[i+1] & ~B[i+1]))
{
long b=(A[i] & ~B[i]), c=(A[i+1] & ~B[i+1]);
long u=ones ^ b;
twosA=(ones & b)|( u & c);
ones=u^c;
}
// CSA(twosB, ones, ones, (A[i+2] & ~B[i+2]), (A[i+3] & ~B[i+3]))
{
long b=(A[i+2] & ~B[i+2]), c=(A[i+3] & ~B[i+3]);
long u=ones^b;
twosB =(ones&b)|(u&c);
ones=u^c;
}
//CSA(foursA, twos, twos, twosA, twosB)
{
long u=twos^twosA;
foursA=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
//CSA(twosA, ones, ones, (A[i+4] & ~B[i+4]), (A[i+5] & ~B[i+5]))
{
long b=(A[i+4] & ~B[i+4]), c=(A[i+5] & ~B[i+5]);
long u=ones^b;
twosA=(ones&b)|(u&c);
ones=u^c;
}
// CSA(twosB, ones, ones, (A[i+6] & ~B[i+6]), (A[i+7] & ~B[i+7]))
{
long b=(A[i+6] & ~B[i+6]), c=(A[i+7] & ~B[i+7]);
long u=ones^b;
twosB=(ones&b)|(u&c);
ones=u^c;
}
//CSA(foursB, twos, twos, twosA, twosB)
{
long u=twos^twosA;
foursB=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
//CSA(eights, fours, fours, foursA, foursB)
{
long u=fours^foursA;
eights=(fours&foursA)|(u&foursB);
fours=u^foursB;
}
tot8 += pop(eights);
}
// at least four words left: same reduction as the first half of the main loop
if (i<=n-4) {
long twosA, twosB, foursA, eights;
// CSA(twosA, ones, ones, (A[i] & ~B[i]), (A[i+1] & ~B[i+1]))
{
long b=(A[i] & ~B[i]), c=(A[i+1] & ~B[i+1]);
long u=ones ^ b;
twosA=(ones & b)|( u & c);
ones=u^c;
}
// CSA(twosB, ones, ones, (A[i+2] & ~B[i+2]), (A[i+3] & ~B[i+3]))
{
long b=(A[i+2] & ~B[i+2]), c=(A[i+3] & ~B[i+3]);
long u=ones^b;
twosB =(ones&b)|(u&c);
ones=u^c;
}
// CSA(foursA, twos, twos, twosA, twosB)
{
long u=twos^twosA;
foursA=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
// CSA(eights, fours, fours, foursA, 0): with a zero third input the
// carry is a plain AND and the sum a plain XOR
eights=fours&foursA;
fours=fours^foursA;
tot8 += pop(eights);
i+=4;
}
// at least two words left
if (i<=n-2) {
// CSA(twosA, ones, ones, (A[i] & ~B[i]), (A[i+1] & ~B[i+1]))
long b=(A[i] & ~B[i]), c=(A[i+1] & ~B[i+1]);
long u=ones ^ b;
long twosA=(ones & b)|( u & c);
ones=u^c;
// propagate the carry upwards with zero third inputs (AND = carry, XOR = sum)
long foursA=twos&twosA;
twos=twos^twosA;
long eights=fours&foursA;
fours=fours^foursA;
tot8 += pop(eights);
i+=2;
}
// a single leftover word is counted directly
if (i<n) {
tot += pop((A[i] & ~B[i]));
}
// weight each accumulator by the value of its bits: fours*4 + twos*2 + ones + eights*8
tot += (pop(fours)<<2)
+ (pop(twos)<<1)
+ pop(ones)
+ (tot8<<3);
return tot;
}
/** Returns the popcount or cardinality of A ^ B
 * Neither array is modified.
 * <p>
 * Only the words at indexes <code>wordOffset</code> (inclusive) to
 * <code>wordOffset+numWords</code> (exclusive) are read, from both arrays.
 *
 * @param A first array of 64 bit words
 * @param B second array of 64 bit words, read at the same indexes as A
 * @param wordOffset index of the first word to process
 * @param numWords number of consecutive words to process
 * @return the accumulated bit count of <code>A[i] ^ B[i]</code> over that range
 */
public static long pop_xor(long A[], long B[], int wordOffset, int numWords) {
int n = wordOffset+numWords;
// ones/twos/fours are carry-save accumulators: a set bit in them is worth
// 1, 2 or 4 in the final sum. tot8 collects pop() of every "eights" carry and
// is scaled by 8 at the end; tot receives the single leftover word directly.
// NOTE(review): pop(long) is defined outside this block; presumably the
// population count of one word -- confirm.
long tot=0, tot8=0;
long ones=0, twos=0, fours=0;
int i;
// main loop: eight words per iteration, only one pop() call per iteration
for (i = wordOffset; i <= n - 8; i+=8) {
/*** C macro from Hacker's Delight
#define CSA(h,l, a,b,c) \
{unsigned u = a ^ b; unsigned v = c; \
h = (a & b) | (u & v); l = u ^ v;}
***/
long twosA,twosB,foursA,foursB,eights;
// CSA(twosA, ones, ones, (A[i] ^ B[i]), (A[i+1] ^ B[i+1]))
{
long b=(A[i] ^ B[i]), c=(A[i+1] ^ B[i+1]);
long u=ones ^ b;
twosA=(ones & b)|( u & c);
ones=u^c;
}
// CSA(twosB, ones, ones, (A[i+2] ^ B[i+2]), (A[i+3] ^ B[i+3]))
{
long b=(A[i+2] ^ B[i+2]), c=(A[i+3] ^ B[i+3]);
long u=ones^b;
twosB =(ones&b)|(u&c);
ones=u^c;
}
//CSA(foursA, twos, twos, twosA, twosB)
{
long u=twos^twosA;
foursA=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
//CSA(twosA, ones, ones, (A[i+4] ^ B[i+4]), (A[i+5] ^ B[i+5]))
{
long b=(A[i+4] ^ B[i+4]), c=(A[i+5] ^ B[i+5]);
long u=ones^b;
twosA=(ones&b)|(u&c);
ones=u^c;
}
// CSA(twosB, ones, ones, (A[i+6] ^ B[i+6]), (A[i+7] ^ B[i+7]))
{
long b=(A[i+6] ^ B[i+6]), c=(A[i+7] ^ B[i+7]);
long u=ones^b;
twosB=(ones&b)|(u&c);
ones=u^c;
}
//CSA(foursB, twos, twos, twosA, twosB)
{
long u=twos^twosA;
foursB=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
//CSA(eights, fours, fours, foursA, foursB)
{
long u=fours^foursA;
eights=(fours&foursA)|(u&foursB);
fours=u^foursB;
}
tot8 += pop(eights);
}
// at least four words left: same reduction as the first half of the main loop
if (i<=n-4) {
long twosA, twosB, foursA, eights;
// CSA(twosA, ones, ones, (A[i] ^ B[i]), (A[i+1] ^ B[i+1]))
{
long b=(A[i] ^ B[i]), c=(A[i+1] ^ B[i+1]);
long u=ones ^ b;
twosA=(ones & b)|( u & c);
ones=u^c;
}
// CSA(twosB, ones, ones, (A[i+2] ^ B[i+2]), (A[i+3] ^ B[i+3]))
{
long b=(A[i+2] ^ B[i+2]), c=(A[i+3] ^ B[i+3]);
long u=ones^b;
twosB =(ones&b)|(u&c);
ones=u^c;
}
// CSA(foursA, twos, twos, twosA, twosB)
{
long u=twos^twosA;
foursA=(twos&twosA)|(u&twosB);
twos=u^twosB;
}
// CSA(eights, fours, fours, foursA, 0): with a zero third input the
// carry is a plain AND and the sum a plain XOR
eights=fours&foursA;
fours=fours^foursA;
tot8 += pop(eights);
i+=4;
}
// at least two words left
if (i<=n-2) {
// CSA(twosA, ones, ones, (A[i] ^ B[i]), (A[i+1] ^ B[i+1]))
long b=(A[i] ^ B[i]), c=(A[i+1] ^ B[i+1]);
long u=ones ^ b;
long twosA=(ones & b)|( u & c);
ones=u^c;
// propagate the carry upwards with zero third inputs (AND = carry, XOR = sum)
long foursA=twos&twosA;
twos=twos^twosA;
long eights=fours&foursA;
fours=fours^foursA;
tot8 += pop(eights);
i+=2;
}
// a single leftover word is counted directly
if (i<n) {
tot += pop((A[i] ^ B[i]));
}
// weight each accumulator by the value of its bits: fours*4 + twos*2 + ones + eights*8
tot += (pop(fours)<<2)
+ (pop(twos)<<1)
+ pop(ones)
+ (tot8<<3);
return tot;
}
/* python code to generate ntzTable
def ntz(val):
  if val==0: return 8
  i=0
  while (val&0x01)==0:
    i = i+1
    val >>= 1
  return i
print ','.join([ str(ntz(i)) for i in range(256) ])
***/
/** table of number of trailing zeros in a byte.
 * Indexed by the unsigned byte value (0..255); the entry for 0 is 8,
 * i.e. all eight bits of a zero byte count as trailing zeros.
 */
public static final byte[] ntzTable = {8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};
/** Returns the number of trailing zero bits in the 64 bit long value
 * (64 when the value is zero).
 */
public static int ntz(long val) {
  // Design notes kept from the original author:
  // a full binary search for the low byte measured slower than a linear
  // search when used from nextSetBit(), most likely because nextSetBit()
  // shifts bits to the right, which makes it more probable that the first
  // non-zero byte is one of the low ones.
  //
  // Only the top level split (low 32 bits vs. high 32 bits) is a binary
  // decision; everything else works on ints to stay friendly to 32 bit
  // architectures. The lowest byte is tested first because a non-zero
  // first byte is the most common case in dense bit arrays.
  final int lo = (int) val;
  int b = lo & 0xff;
  if (b != 0) {
    return ntzTable[b];
  }
  if (lo == 0) {
    // nothing set in the low half: look at the upper 32 bits
    final int hi = (int) (val >> 32);
    b = hi & 0xff;
    if (b != 0) {
      return ntzTable[b] + 32;
    }
    b = (hi >>> 8) & 0xff;
    if (b != 0) {
      return ntzTable[b] + 40;
    }
    b = (hi >>> 16) & 0xff;
    if (b != 0) {
      return ntzTable[b] + 48;
    }
    // top byte: the unsigned shift already isolates it, and a zero here
    // (val == 0) is covered by the table entry for 0.
    return ntzTable[hi >>> 24] + 56;
  }
  // low half is non-zero but its lowest byte is zero
  b = (lo >>> 8) & 0xff;
  if (b != 0) {
    return ntzTable[b] + 8;
  }
  b = (lo >>> 16) & 0xff;
  if (b != 0) {
    return ntzTable[b] + 16;
  }
  // top byte of the low half: no mask and no zero test needed
  return ntzTable[lo >>> 24] + 24;
}
/** Returns the 0 based index of the first (lowest) set bit
 * (only works for x!=0)
 * <br/> This is an alternate implementation of ntz()
 */
public static int ntz2(long x) {
  int skipped = 0;
  int word = (int) x;
  if (word == 0) {
    // low half empty: continue with the high half (the only 64 bit shift necessary)
    word = (int) (x >>> 32);
    skipped = 32;
  }
  if ((word & 0x0000FFFF) == 0) {
    word >>>= 16;
    skipped += 16;
  }
  if ((word & 0x000000FF) == 0) {
    word >>>= 8;
    skipped += 8;
  }
  // the remaining low byte is resolved through the lookup table
  return skipped + ntzTable[word & 0xff];
}
/** Returns the 0 based index of the first (lowest) set bit.
 * <br/> This is an alternate implementation of ntz()
 */
public static int ntz3(long x) {
  // Another implementation taken from Hackers Delight, extended to 64 bits
  // and converted to Java.
  // Many 32 bit ntz algorithms are at http://www.hackersdelight.org/HDcode/ntz.cc
  int count = 1;
  // only the first step looks at the long; all others work on an int
  int word = (int) x;
  if (word == 0) {
    word = (int) (x >>> 32);
    count += 32;
  }
  // halve the inspected width each round: masks 0xFFFF, 0xFF, 0xF, 0x3
  for (int width = 16; width >= 2; width >>= 1) {
    final int lowMask = (1 << width) - 1;
    if ((word & lowMask) == 0) {
      word >>>= width;
      count += width;
    }
  }
  // count started at 1, so subtract the lowest bit to settle the last position
  return count - (word & 1);
}
/** Returns true if v is a power of two or zero, i.e. if at most one bit of v is set. */
public static boolean isPowerOfTwo(int v) {
  // v & (v-1) clears the lowest set bit; nothing may remain afterwards
  final int withoutLowestBit = v & (v - 1);
  return withoutLowestBit == 0;
}
/** Returns true if v is a power of two or zero, i.e. if at most one bit of v is set. */
public static boolean isPowerOfTwo(long v) {
  // v & (v-1) clears the lowest set bit; nothing may remain afterwards
  final long withoutLowestBit = v & (v - 1);
  return withoutLowestBit == 0;
}
/** Returns the next highest power of two, or the current value if it's already a power of two or zero. */
public static int nextHighestPowerOfTwo(int v) {
  int smeared = v - 1;
  // copy the highest set bit into every lower position (shifts 1, 2, 4, 8, 16)
  for (int shift = 1; shift <= 16; shift <<= 1) {
    smeared |= smeared >> shift;
  }
  // an all-ones tail plus one is the next power of two
  return smeared + 1;
}
/** Returns the next highest power of two, or the current value if it's already a power of two or zero. */
public static long nextHighestPowerOfTwo(long v) {
  long smeared = v - 1;
  // copy the highest set bit into every lower position (shifts 1, 2, 4, 8, 16, 32)
  for (int shift = 1; shift <= 32; shift <<= 1) {
    smeared |= smeared >> shift;
  }
  // an all-ones tail plus one is the next power of two
  return smeared + 1;
}
}

View File

@ -0,0 +1,77 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.BitSet;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
/** Simple DocIdSet and DocIdSetIterator backed by a BitSet */
public class DocIdBitSet extends DocIdSet {
  private final BitSet bitSet;

  /**
   * Wraps the given BitSet. The BitSet is not copied, so later changes to it
   * are visible through this DocIdSet.
   */
  public DocIdBitSet(BitSet bitSet) {
    this.bitSet = bitSet;
  }

  /** Returns a new iterator over the set bits of the underlying BitSet. */
  public DocIdSetIterator iterator() {
    return new DocIdBitSetIterator(bitSet);
  }

  /**
   * Returns the underlying BitSet.
   */
  public BitSet getBitSet() {
    return this.bitSet;
  }

  /** Iterates over the indexes of the set bits in increasing order. */
  private static class DocIdBitSetIterator extends DocIdSetIterator {
    /** Current doc id: -1 before the first call to next()/skipTo(),
     *  Integer.MAX_VALUE once the iterator is exhausted. */
    private int docId;
    private final BitSet bitSet;

    DocIdBitSetIterator(BitSet bitSet) {
      this.bitSet = bitSet;
      this.docId = -1;
    }

    /** Returns the current doc id; only meaningful after next() or skipTo() was called. */
    public int doc() {
      assert docId != -1;
      return docId;
    }

    /** Advances to the next set bit; returns false when there is none. */
    public boolean next() {
      // Once exhausted, docId is Integer.MAX_VALUE and (docId + 1) would
      // overflow to a negative index, which BitSet.nextSetBit() rejects with
      // an IndexOutOfBoundsException. Stay exhausted instead.
      if (docId == Integer.MAX_VALUE) {
        return false;
      }
      // (docId + 1) on next line requires -1 initial value for docId:
      return checkNextDocId(bitSet.nextSetBit(docId + 1));
    }

    /** Advances to the first set bit at or after skipDocNr; returns false when there is none. */
    public boolean skipTo(int skipDocNr) {
      return checkNextDocId(bitSet.nextSetBit(skipDocNr));
    }

    /** Records the result of BitSet.nextSetBit() and reports whether a doc was found. */
    private boolean checkNextDocId(int d) {
      if (d == -1) { // -1 returned by BitSet.nextSetBit() when exhausted
        docId = Integer.MAX_VALUE;
        return false;
      } else {
        docId = d;
        return true;
      }
    }
  }
}

View File

@ -0,0 +1,773 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
import java.util.Arrays;
import java.io.Serializable;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
/** An "open" BitSet implementation that allows direct access to the array of words
* storing the bits.
* <p/>
* Unlike java.util.BitSet, the fact that bits are packed into an array of longs
* is part of the interface. This allows efficient implementation of other algorithms
* by someone other than the author. It also allows one to efficiently implement
* alternate serialization or interchange formats.
* <p/>
* <code>OpenBitSet</code> is faster than <code>java.util.BitSet</code> in most operations
* and *much* faster at calculating cardinality of sets and results of set operations.
* It can also handle sets of larger cardinality (up to 64 * 2**32-1)
* <p/>
* The goals of <code>OpenBitSet</code> are the fastest implementation possible, and
* maximum code reuse. Extra safety and encapsulation
* may always be built on top, but if that's built in, the cost can never be removed (and
* hence people re-implement their own version in order to get better performance).
* If you want a "safe", totally encapsulated (and slower and limited) BitSet
* class, use <code>java.util.BitSet</code>.
* <p/>
* <h3>Performance Results</h3>
*
Test system: Pentium 4, Sun Java 1.5_06 -server -Xbatch -Xmx64M
<br/>BitSet size = 1,000,000
<br/>Results are java.util.BitSet time divided by OpenBitSet time.
<table border="1">
<tr>
<th></th> <th>cardinality</th> <th>intersect_count</th> <th>union</th> <th>nextSetBit</th> <th>get</th> <th>iterator</th>
</tr>
<tr>
<th>50% full</th> <td>3.36</td> <td>3.96</td> <td>1.44</td> <td>1.46</td> <td>1.99</td> <td>1.58</td>
</tr>
<tr>
<th>1% full</th> <td>3.31</td> <td>3.90</td> <td>&nbsp;</td> <td>1.04</td> <td>&nbsp;</td> <td>0.99</td>
</tr>
</table>
<br/>
Test system: AMD Opteron, 64 bit linux, Sun Java 1.5_06 -server -Xbatch -Xmx64M
<br/>BitSet size = 1,000,000
<br/>Results are java.util.BitSet time divided by OpenBitSet time.
<table border="1">
<tr>
<th></th> <th>cardinality</th> <th>intersect_count</th> <th>union</th> <th>nextSetBit</th> <th>get</th> <th>iterator</th>
</tr>
<tr>
<th>50% full</th> <td>2.50</td> <td>3.50</td> <td>1.00</td> <td>1.03</td> <td>1.12</td> <td>1.25</td>
</tr>
<tr>
<th>1% full</th> <td>2.51</td> <td>3.49</td> <td>&nbsp;</td> <td>1.00</td> <td>&nbsp;</td> <td>1.02</td>
</tr>
</table>
* @version $Id$
*/
public class OpenBitSet extends DocIdSet implements Cloneable, Serializable {
protected long[] bits;
protected int wlen; // number of words (elements) used in the array
/** Constructs an OpenBitSet large enough to hold numBits.
*
* @param numBits
*/
public OpenBitSet(long numBits) {
bits = new long[bits2words(numBits)];
wlen = bits.length;
}
public OpenBitSet() {
this(64);
}
/** Constructs an OpenBitSet from an existing long[].
* <br/>
* The first 64 bits are in long[0],
* with bit index 0 at the least significant bit, and bit index 63 at the most significant.
* Given a bit index,
* the word containing it is long[index/64], and it is at bit number index%64 within that word.
* <p>
* numWords are the number of elements in the array that contain
* set bits (non-zero longs).
* numWords should be &lt= bits.length, and
* any existing words in the array at position &gt= numWords should be zero.
*
*/
public OpenBitSet(long[] bits, int numWords) {
this.bits = bits;
this.wlen = numWords;
}
public DocIdSetIterator iterator() {
return new OpenBitSetIterator(bits, wlen);
}
/** Returns the current capacity in bits (1 greater than the index of the last bit) */
public long capacity() { return bits.length << 6; }
/**
* Returns the current capacity of this set. Included for
* compatibility. This is *not* equal to {@link #cardinality}
*/
public long size() {
return capacity();
}
/** Returns true if there are no set bits */
public boolean isEmpty() { return cardinality()==0; }
/** Expert: returns the long[] storing the bits */
public long[] getBits() { return bits; }
/** Expert: sets a new long[] to use as the bit storage */
public void setBits(long[] bits) { this.bits = bits; }
/** Expert: gets the number of longs in the array that are in use */
public int getNumWords() { return wlen; }
/** Expert: sets the number of longs in the array that are in use */
public void setNumWords(int nWords) { this.wlen=nWords; }
/** Returns true or false for the specified bit index. */
public boolean get(int index) {
int i = index >> 6; // div 64
// signed shift will keep a negative index and force an
// array-index-out-of-bounds-exception, removing the need for an explicit check.
if (i>=bits.length) return false;
int bit = index & 0x3f; // mod 64
long bitmask = 1L << bit;
return (bits[i] & bitmask) != 0;
}
/** Returns true or false for the specified bit index.
* The index should be less than the OpenBitSet size
*/
public boolean fastGet(int index) {
int i = index >> 6; // div 64
// signed shift will keep a negative index and force an
// array-index-out-of-bounds-exception, removing the need for an explicit check.
int bit = index & 0x3f; // mod 64
long bitmask = 1L << bit;
return (bits[i] & bitmask) != 0;
}
/** Returns true or false for the specified bit index
* The index should be less than the OpenBitSet size
*/
public boolean get(long index) {
int i = (int)(index >> 6); // div 64
if (i>=bits.length) return false;
int bit = (int)index & 0x3f; // mod 64
long bitmask = 1L << bit;
return (bits[i] & bitmask) != 0;
}
/** Returns true or false for the specified bit index. Allows specifying
* an index outside the current size. */
public boolean fastGet(long index) {
int i = (int)(index >> 6); // div 64
int bit = (int)index & 0x3f; // mod 64
long bitmask = 1L << bit;
return (bits[i] & bitmask) != 0;
}
/*
// alternate implementation of get()
public boolean get1(int index) {
int i = index >> 6; // div 64
int bit = index & 0x3f; // mod 64
return ((bits[i]>>>bit) & 0x01) != 0;
// this does a long shift and a bittest (on x86) vs
// a long shift, and a long AND, (the test for zero is prob a no-op)
// testing on a P4 indicates this is slower than (bits[i] & bitmask) != 0;
}
*/
/** returns 1 if the bit is set, 0 if not.
* The index should be less than the OpenBitSet size
*/
public int getBit(int index) {
int i = index >> 6; // div 64
int bit = index & 0x3f; // mod 64
return ((int)(bits[i]>>>bit)) & 0x01;
}
/*
public boolean get2(int index) {
int word = index >> 6; // div 64
int bit = index & 0x0000003f; // mod 64
return (bits[word] << bit) < 0; // hmmm, this would work if bit order were reversed
// we could right shift and check for parity bit, if it was available to us.
}
*/
/** sets a bit, expanding the set size if necessary */
public void set(long index) {
int wordNum = expandingWordNum(index);
int bit = (int)index & 0x3f;
long bitmask = 1L << bit;
bits[wordNum] |= bitmask;
}
/** Sets the bit at the specified index.
* The index should be less than the OpenBitSet size.
*/
public void fastSet(int index) {
int wordNum = index >> 6; // div 64
int bit = index & 0x3f; // mod 64
long bitmask = 1L << bit;
bits[wordNum] |= bitmask;
}
/** Sets the bit at the specified index.
* The index should be less than the OpenBitSet size.
*/
public void fastSet(long index) {
int wordNum = (int)(index >> 6);
int bit = (int)index & 0x3f;
long bitmask = 1L << bit;
bits[wordNum] |= bitmask;
}
/** Sets a range of bits, expanding the set size if necessary
*
* @param startIndex lower index
* @param endIndex one-past the last bit to set
*/
public void set(long startIndex, long endIndex) {
if (endIndex <= startIndex) return;
int startWord = (int)(startIndex>>6);
// since endIndex is one past the end, this is index of the last
// word to be changed.
int endWord = expandingWordNum(endIndex-1);
long startmask = -1L << startIndex;
long endmask = -1L >>> -endIndex; // 64-(endIndex&0x3f) is the same as -endIndex due to wrap
if (startWord == endWord) {
bits[startWord] |= (startmask & endmask);
return;
}
bits[startWord] |= startmask;
Arrays.fill(bits, startWord+1, endWord, -1L);
bits[endWord] |= endmask;
}
protected int expandingWordNum(long index) {
int wordNum = (int)(index >> 6);
if (wordNum>=wlen) {
ensureCapacity(index+1);
wlen = wordNum+1;
}
return wordNum;
}
/** clears a bit.
* The index should be less than the OpenBitSet size.
*/
public void fastClear(int index) {
int wordNum = index >> 6;
int bit = index & 0x03f;
long bitmask = 1L << bit;
bits[wordNum] &= ~bitmask;
// hmmm, it takes one more instruction to clear than it does to set... any
// way to work around this? If there were only 63 bits per word, we could
// use a right shift of 10111111...111 in binary to position the 0 in the
// correct place (using sign extension).
// Could also use Long.rotateRight() or rotateLeft() *if* they were converted
// by the JVM into a native instruction.
// bits[word] &= Long.rotateLeft(0xfffffffe,bit);
}
/** clears a bit.
* The index should be less than the OpenBitSet size.
*/
public void fastClear(long index) {
int wordNum = (int)(index >> 6); // div 64
int bit = (int)index & 0x3f; // mod 64
long bitmask = 1L << bit;
bits[wordNum] &= ~bitmask;
}
/** clears a bit, allowing access beyond the current set size without changing the size.*/
public void clear(long index) {
int wordNum = (int)(index >> 6); // div 64
if (wordNum>=wlen) return;
int bit = (int)index & 0x3f; // mod 64
long bitmask = 1L << bit;
bits[wordNum] &= ~bitmask;
}
/** Clears a range of bits. Clearing past the end does not change the size of the set.
*
* @param startIndex lower index
* @param endIndex one-past the last bit to clear
*/
public void clear(long startIndex, long endIndex) {
if (endIndex <= startIndex) return;
int startWord = (int)(startIndex>>6);
if (startWord >= wlen) return;
// since endIndex is one past the end, this is index of the last
// word to be changed.
int endWord = (int)((endIndex-1)>>6);
long startmask = -1L << startIndex;
long endmask = -1L >>> -endIndex; // 64-(endIndex&0x3f) is the same as -endIndex due to wrap
// invert masks since we are clearing
startmask = ~startmask;
endmask = ~endmask;
if (startWord == endWord) {
bits[startWord] &= (startmask | endmask);
return;
}
bits[startWord] &= startmask;
int middle = Math.min(wlen, endWord);
Arrays.fill(bits, startWord+1, middle, 0L);
if (endWord < wlen) {
bits[endWord] &= endmask;
}
}
/** Sets a bit and returns the previous value.
* The index should be less than the OpenBitSet size.
*/
public boolean getAndSet(int index) {
int wordNum = index >> 6; // div 64
int bit = index & 0x3f; // mod 64
long bitmask = 1L << bit;
boolean val = (bits[wordNum] & bitmask) != 0;
bits[wordNum] |= bitmask;
return val;
}
/** Sets a bit and returns the previous value.
* The index should be less than the OpenBitSet size.
*/
public boolean getAndSet(long index) {
int wordNum = (int)(index >> 6); // div 64
int bit = (int)index & 0x3f; // mod 64
long bitmask = 1L << bit;
boolean val = (bits[wordNum] & bitmask) != 0;
bits[wordNum] |= bitmask;
return val;
}
/** flips a bit.
* The index should be less than the OpenBitSet size.
*/
public void fastFlip(int index) {
int wordNum = index >> 6; // div 64
int bit = index & 0x3f; // mod 64
long bitmask = 1L << bit;
bits[wordNum] ^= bitmask;
}
/** flips a bit.
* The index should be less than the OpenBitSet size.
*/
public void fastFlip(long index) {
int wordNum = (int)(index >> 6); // div 64
int bit = (int)index & 0x3f; // mod 64
long bitmask = 1L << bit;
bits[wordNum] ^= bitmask;
}
/** flips a bit, expanding the set size if necessary */
public void flip(long index) {
int wordNum = expandingWordNum(index);
int bit = (int)index & 0x3f; // mod 64
long bitmask = 1L << bit;
bits[wordNum] ^= bitmask;
}
/** flips a bit and returns the resulting bit value.
* The index should be less than the OpenBitSet size.
*/
public boolean flipAndGet(int index) {
int wordNum = index >> 6; // div 64
int bit = index & 0x3f; // mod 64
long bitmask = 1L << bit;
bits[wordNum] ^= bitmask;
return (bits[wordNum] & bitmask) != 0;
}
/** flips a bit and returns the resulting bit value.
* The index should be less than the OpenBitSet size.
*/
public boolean flipAndGet(long index) {
int wordNum = (int)(index >> 6); // div 64
int bit = (int)index & 0x3f; // mod 64
long bitmask = 1L << bit;
bits[wordNum] ^= bitmask;
return (bits[wordNum] & bitmask) != 0;
}
/** Flips a range of bits, expanding the set size if necessary
*
* @param startIndex lower index
* @param endIndex one-past the last bit to flip
*/
public void flip(long startIndex, long endIndex) {
if (endIndex <= startIndex) return;
int oldlen = wlen;
int startWord = (int)(startIndex>>6);
// since endIndex is one past the end, this is index of the last
// word to be changed.
int endWord = expandingWordNum(endIndex-1);
/*** Grrr, java shifting wraps around so -1L>>>64 == -1
* for that reason, make sure not to use endmask if the bits to flip will
* be zero in the last word (redefine endWord to be the last changed...)
long startmask = -1L << (startIndex & 0x3f); // example: 11111...111000
long endmask = -1L >>> (64-(endIndex & 0x3f)); // example: 00111...111111
***/
long startmask = -1L << startIndex;
long endmask = -1L >>> -endIndex; // 64-(endIndex&0x3f) is the same as -endIndex due to wrap
if (startWord == endWord) {
bits[startWord] ^= (startmask & endmask);
return;
}
bits[startWord] ^= startmask;
for (int i=startWord+1; i<endWord; i++) {
bits[i] = ~bits[i];
}
bits[endWord] ^= endmask;
}
/*
public static int pop(long v0, long v1, long v2, long v3) {
// derived from pop_array by setting last four elems to 0.
// exchanges one pop() call for 10 elementary operations
// saving about 7 instructions... is there a better way?
long twosA=v0 & v1;
long ones=v0^v1;
long u2=ones^v2;
long twosB =(ones&v2)|(u2&v3);
ones=u2^v3;
long fours=(twosA&twosB);
long twos=twosA^twosB;
return (pop(fours)<<2)
+ (pop(twos)<<1)
+ pop(ones);
}
*/
/** @return the number of set bits */
public long cardinality() {
return BitUtil.pop_array(bits,0,wlen);
}
/** Returns the popcount or cardinality of the intersection of the two sets.
* Neither set is modified.
*/
public static long intersectionCount(OpenBitSet a, OpenBitSet b) {
return BitUtil.pop_intersect(a.bits, b.bits, 0, Math.min(a.wlen, b.wlen));
}
/** Returns the popcount or cardinality of the union of the two sets.
* Neither set is modified.
*/
public static long unionCount(OpenBitSet a, OpenBitSet b) {
long tot = BitUtil.pop_union(a.bits, b.bits, 0, Math.min(a.wlen, b.wlen));
if (a.wlen < b.wlen) {
tot += BitUtil.pop_array(b.bits, a.wlen, b.wlen-a.wlen);
} else if (a.wlen > b.wlen) {
tot += BitUtil.pop_array(a.bits, b.wlen, a.wlen-b.wlen);
}
return tot;
}
/** Returns the popcount or cardinality of "a and not b"
* or "intersection(a, not(b))".
* Neither set is modified.
*/
public static long andNotCount(OpenBitSet a, OpenBitSet b) {
long tot = BitUtil.pop_andnot(a.bits, b.bits, 0, Math.min(a.wlen, b.wlen));
if (a.wlen > b.wlen) {
tot += BitUtil.pop_array(a.bits, b.wlen, a.wlen-b.wlen);
}
return tot;
}
/** Returns the popcount or cardinality of the exclusive-or of the two sets.
* Neither set is modified.
*/
public static long xorCount(OpenBitSet a, OpenBitSet b) {
long tot = BitUtil.pop_xor(a.bits, b.bits, 0, Math.min(a.wlen, b.wlen));
if (a.wlen < b.wlen) {
tot += BitUtil.pop_array(b.bits, a.wlen, b.wlen-a.wlen);
} else if (a.wlen > b.wlen) {
tot += BitUtil.pop_array(a.bits, b.wlen, a.wlen-b.wlen);
}
return tot;
}
/** Returns the index of the first set bit starting at the index specified.
* -1 is returned if there are no more set bits.
*/
public int nextSetBit(int index) {
int i = index>>6;
if (i>=wlen) return -1;
int subIndex = index & 0x3f; // index within the word
long word = bits[i] >> subIndex; // skip all the bits to the right of index
if (word!=0) {
return (i<<6) + subIndex + BitUtil.ntz(word);
}
while(++i < wlen) {
word = bits[i];
if (word!=0) return (i<<6) + BitUtil.ntz(word);
}
return -1;
}
/** Returns the index of the first set bit starting at the index specified.
* -1 is returned if there are no more set bits.
*/
public long nextSetBit(long index) {
int i = (int)(index>>>6);
if (i>=wlen) return -1;
int subIndex = (int)index & 0x3f; // index within the word
long word = bits[i] >>> subIndex; // skip all the bits to the right of index
if (word!=0) {
return (((long)i)<<6) + (subIndex + BitUtil.ntz(word));
}
while(++i < wlen) {
word = bits[i];
if (word!=0) return (((long)i)<<6) + BitUtil.ntz(word);
}
return -1;
}
public Object clone() {
try {
OpenBitSet obs = (OpenBitSet)super.clone();
obs.bits = (long[]) obs.bits.clone(); // hopefully an array clone is as fast(er) than arraycopy
return obs;
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
}
}
/** this = this AND other */
public void intersect(OpenBitSet other) {
int newLen= Math.min(this.wlen,other.wlen);
long[] thisArr = this.bits;
long[] otherArr = other.bits;
// testing against zero can be more efficient
int pos=newLen;
while(--pos>=0) {
thisArr[pos] &= otherArr[pos];
}
if (this.wlen > newLen) {
// fill zeros from the new shorter length to the old length
Arrays.fill(bits,newLen,this.wlen,0);
}
this.wlen = newLen;
}
/** this = this OR other */
public void union(OpenBitSet other) {
int newLen = Math.max(wlen,other.wlen);
ensureCapacityWords(newLen);
long[] thisArr = this.bits;
long[] otherArr = other.bits;
int pos=Math.min(wlen,other.wlen);
while(--pos>=0) {
thisArr[pos] |= otherArr[pos];
}
if (this.wlen < newLen) {
System.arraycopy(otherArr, this.wlen, thisArr, this.wlen, newLen-this.wlen);
}
this.wlen = newLen;
}
/** Remove all elements set in other. this = this AND_NOT other */
public void remove(OpenBitSet other) {
int idx = Math.min(wlen,other.wlen);
long[] thisArr = this.bits;
long[] otherArr = other.bits;
while(--idx>=0) {
thisArr[idx] &= ~otherArr[idx];
}
}
/** this = this XOR other */
public void xor(OpenBitSet other) {
int newLen = Math.max(wlen,other.wlen);
ensureCapacityWords(newLen);
long[] thisArr = this.bits;
long[] otherArr = other.bits;
int pos=Math.min(wlen,other.wlen);
while(--pos>=0) {
thisArr[pos] ^= otherArr[pos];
}
if (this.wlen < newLen) {
System.arraycopy(otherArr, this.wlen, thisArr, this.wlen, newLen-this.wlen);
}
this.wlen = newLen;
}
// some BitSet compatability methods
//** see {@link intersect} */
public void and(OpenBitSet other) {
intersect(other);
}
//** see {@link union} */
public void or(OpenBitSet other) {
union(other);
}
//** see {@link andNot} */
public void andNot(OpenBitSet other) {
remove(other);
}
/**
 * Tests whether this set and {@code other} share at least one set bit.
 * Only the words both sets have in use are compared.
 *
 * @return true if some bit is set in both sets
 */
public boolean intersects(OpenBitSet other) {
  final long[] mine = this.bits;
  final long[] theirs = other.bits;
  for (int w = Math.min(this.wlen, other.wlen) - 1; w >= 0; w--) {
    if ((mine[w] & theirs[w]) != 0) {
      return true;
    }
  }
  return false;
}
/**
 * Makes sure the backing long[] holds at least {@code numWords} words
 * (64 bit longs), replacing it with a larger array when it is too short.
 * Only the words currently in use are carried over to the new array.
 * getNumWords() is unchanged by this call.
 */
public void ensureCapacityWords(int numWords) {
  if (numWords <= bits.length) {
    return; // already large enough
  }
  final long[] grown = new long[numWords];
  System.arraycopy(bits, 0, grown, 0, wlen);
  bits = grown;
}
/**
 * Makes sure the backing long[] can hold {@code numBits} bits, growing it
 * if necessary. getNumWords() is unchanged by this call.
 */
public void ensureCapacity(long numBits) {
  final int wordsNeeded = bits2words(numBits);
  ensureCapacityWords(wordsNeeded);
}
/**
 * Lowers numWords, the number of words in use, so that it no longer
 * counts all-zero words at the end of the used range.
 */
public void trimTrailingZeros() {
  int used = wlen;
  while (used > 0 && bits[used - 1] == 0) {
    used--;
  }
  wlen = used;
}
/**
 * Computes how many 64 bit words are needed to hold {@code numBits} bits.
 *
 * @return the word count; 0 when numBits is 0
 */
public static int bits2words(long numBits) {
  // Index of the word holding the last bit; for numBits==0 the unsigned
  // shift yields 2^58-1, so the +1 below wraps to 0 after the int cast.
  final long lastWordIndex = (numBits - 1) >>> 6;
  return (int) (lastWordIndex + 1);
}
/**
 * Compares the set bits of two OpenBitSets. The sets may have different
 * numbers of words in use; words that only one set has must all be zero.
 *
 * @return true if {@code o} is an OpenBitSet with exactly the same bits set
 */
public boolean equals(Object o) {
  if (o == this) {
    return true;
  }
  if (!(o instanceof OpenBitSet)) {
    return false;
  }
  final OpenBitSet that = (OpenBitSet) o;
  // Pick the set with more words in use as "longer"; ties go to this set.
  final boolean thatIsLonger = that.wlen > this.wlen;
  final OpenBitSet longer = thatIsLonger ? that : this;
  final OpenBitSet shorter = thatIsLonger ? this : that;

  // Words only the longer set has must not contain any set bit.
  for (int w = shorter.wlen; w < longer.wlen; w++) {
    if (longer.bits[w] != 0) {
      return false;
    }
  }
  // The shared words must match exactly.
  for (int w = 0; w < shorter.wlen; w++) {
    if (longer.bits[w] != shorter.bits[w]) {
      return false;
    }
  }
  return true;
}
/**
 * Computes a hash code that is consistent with {@link #equals(Object)}:
 * two sets with the same bits set hash alike even when they differ in
 * trailing zero words or in the length of their backing arrays.
 */
public int hashCode() {
  // Start from zero and use a mix that maps zero to zero, so trailing zero
  // words (which equals() ignores) contribute nothing to the result.
  long h = 0;
  // Walk the same range of words that equals() inspects.
  for (int i = wlen; --i >= 0;) {
    h ^= bits[i];
    h = (h << 1) | (h >>> 63); // rotate the 64 bit value left by one
  }
  // Fold leftmost bits into right, then add a constant so that the
  // (very common) empty set does not hash to 0.
  return (int) ((h >> 32) ^ h) + 0x98761234;
}
}

View File

@ -0,0 +1,173 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
/** An iterator to iterate over set bits in an OpenBitSet.
* This is faster than nextSetBit() for iterating over the complete set of bits,
* especially when the density of the bits set is high.
*
* @version $Id$
*/
public class OpenBitSetIterator extends DocIdSetIterator {
// The General Idea: instead of having an array per byte that has
// the offsets of the next set bit, that array could be
// packed inside a 32 bit integer (8 4 bit numbers). That
// should be faster than accessing an array for each index, and
// the total array size is kept smaller (256*sizeof(int))=1K
protected final static int[] bitlist={
0x0,0x1,0x2,0x21,0x3,0x31,0x32,0x321,0x4,0x41,0x42,0x421,0x43,0x431,0x432,0x4321,0x5,0x51,0x52,0x521,0x53,0x531,0x532,0x5321,0x54,0x541,0x542,0x5421,0x543,0x5431,0x5432,0x54321,0x6,0x61,0x62,0x621,0x63,0x631,0x632,0x6321,0x64,0x641,0x642,0x6421,0x643,0x6431,0x6432,0x64321,0x65,0x651,0x652,0x6521,0x653,0x6531,0x6532,0x65321,0x654,0x6541,0x6542,0x65421,0x6543,0x65431,0x65432,0x654321,0x7,0x71,0x72,0x721,0x73,0x731,0x732,0x7321,0x74,0x741,0x742,0x7421,0x743,0x7431,0x7432,0x74321,0x75,0x751,0x752,0x7521,0x753,0x7531,0x7532,0x75321,0x754,0x7541,0x7542,0x75421,0x7543,0x75431,0x75432,0x754321,0x76,0x761,0x762,0x7621,0x763,0x7631,0x7632,0x76321,0x764,0x7641,0x7642,0x76421,0x7643,0x76431,0x76432,0x764321,0x765,0x7651,0x7652,0x76521,0x7653,0x76531,0x76532,0x765321,0x7654,0x76541,0x76542,0x765421,0x76543,0x765431,0x765432,0x7654321,0x8,0x81,0x82,0x821,0x83,0x831,0x832,0x8321,0x84,0x841,0x842,0x8421,0x843,0x8431,0x8432,0x84321,0x85,0x851,0x852,0x8521,0x853,0x8531,0x8532,0x85321,0x854,0x8541,0x8542,0x85421,0x8543,0x85431,0x85432,0x854321,0x86,0x861,0x862,0x8621,0x863,0x8631,0x8632,0x86321,0x864,0x8641,0x8642,0x86421,0x8643,0x86431,0x86432,0x864321,0x865,0x8651,0x8652,0x86521,0x8653,0x86531,0x86532,0x865321,0x8654,0x86541,0x86542,0x865421,0x86543,0x865431,0x865432,0x8654321,0x87,0x871,0x872,0x8721,0x873,0x8731,0x8732,0x87321,0x874,0x8741,0x8742,0x87421,0x8743,0x87431,0x87432,0x874321,0x875,0x8751,0x8752,0x87521,0x8753,0x87531,0x87532,0x875321,0x8754,0x87541,0x87542,0x875421,0x87543,0x875431,0x875432,0x8754321,0x876,0x8761,0x8762,0x87621,0x8763,0x87631,0x87632,0x876321,0x8764,0x87641,0x87642,0x876421,0x87643,0x876431,0x876432,0x8764321,0x8765,0x87651,0x87652,0x876521,0x87653,0x876531,0x876532,0x8765321,0x87654,0x876541,0x876542,0x8765421,0x876543,0x8765431,0x8765432,0x87654321
};
/***** the python code that generated bitlist
def bits2int(val):
arr=0
for shift in range(8,0,-1):
if val & 0x80:
arr = (arr << 4) | shift
val = val << 1
return arr
def int_table():
tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ]
return ','.join(tbl)
******/
// hmmm, what about an iterator that finds zeros though,
// or a reverse iterator... should they be separate classes
// for efficiency, or have a common root interface? (or
// maybe both? could ask for a SetBitsIterator, etc...
private final long[] arr;
private final int words;
private int i=-1;
private long word;
private int wordShift;
private int indexArray;
private int curDocId;
/** Creates an iterator over {@code obs}, using the array and word count it reports via getBits() and getNumWords(). */
public OpenBitSetIterator(OpenBitSet obs) {
this(obs.getBits(), obs.getNumWords());
}
/**
 * Creates an iterator over the first {@code numWords} words of {@code bits}.
 * The array is referenced, not copied.
 */
public OpenBitSetIterator(long[] bits, int numWords) {
  this.arr = bits;
  this.words = numWords;
}
/**
 * Moves the lowest non-zero byte of {@code word} into its low byte and loads
 * the packed bit positions of that byte into {@code indexArray}.
 * An all-zero low 32 bits, low 16 bits and low 8 bits are skipped in turn,
 * and the skipped distance is added to {@code wordShift}. The callers in
 * this class only invoke it with a non-zero {@code word}. Uses 64 bit shifts.
 */
private void shift() {
if ((int)word ==0) {wordShift +=32; word = word >>>32; }
if ((word & 0x0000FFFF) == 0) { wordShift +=16; word >>>=16; }
if ((word & 0x000000FF) == 0) { wordShift +=8; word >>>=8; }
// each 4 bit nibble of the table entry is the 1-based position of a set bit in the low byte
indexArray = bitlist[(int)word & 0xff];
}
/***** alternate shift implementations
// 32 bit shifts, but a long shift needed at the end
private void shift2() {
int y = (int)word;
if (y==0) {wordShift +=32; y = (int)(word >>>32); }
if ((y & 0x0000FFFF) == 0) { wordShift +=16; y>>>=16; }
if ((y & 0x000000FF) == 0) { wordShift +=8; y>>>=8; }
indexArray = bitlist[y & 0xff];
word >>>= (wordShift +1);
}
private void shift3() {
int lower = (int)word;
int lowByte = lower & 0xff;
if (lowByte != 0) {
indexArray=bitlist[lowByte];
return;
}
shift();
}
******/
/**
 * Advances to the next set bit and stores its index in {@code curDocId}.
 *
 * @return true if a set bit was found; false once all words have been
 *         consumed, in which case {@code curDocId} is set to -1
 */
public boolean next() {
// indexArray==0 means every set bit of the current byte has been handed out
if (indexArray==0) {
if (word!=0) {
// drop the byte that was just consumed
word >>>= 8;
wordShift += 8;
}
while (word==0) {
if (++i >= words) {
curDocId = -1;
return false;
}
word = arr[i];
// -1 compensates for the 1-based bit positions stored in bitlist
wordShift =-1; // loop invariant code motion should move this
}
// after the first time, should I go with a linear search, or
// stick with the binary search in shift?
shift();
}
// the lowest nibble holds the next bit position; consume it
int bitIndex = (indexArray & 0x0f) + wordShift;
indexArray >>>= 4;
// should i<<6 be cached as a separate variable?
// it would only save one cycle in the best circumstances.
curDocId = (i<<6) + bitIndex;
return true;
}
/**
 * Positions the iterator on the first set bit whose index is at or after
 * {@code target} and stores that index in {@code curDocId}.
 * NOTE(review): a negative target produces a negative index into the word
 * array; callers presumably pass non-negative targets only - confirm.
 *
 * @return true if such a bit exists; false otherwise, in which case
 *         {@code curDocId} is set to -1
 */
public boolean skipTo(int target) {
// discard any bit positions still pending from the previous byte
indexArray=0;
i = target >> 6;
if (i>=words) {
word =0; // setup so next() will also return -1
curDocId = -1;
return false;
}
wordShift = target & 0x3f;
// drop the bits of this word that lie below target
word = arr[i] >>> wordShift;
if (word !=0) {
wordShift--; // compensate for 1 based arrIndex
} else {
// nothing left in the target word: scan forward for the next non-zero word
while (word ==0) {
if (++i >= words) {
curDocId = -1;
return false;
}
word = arr[i];
}
wordShift =-1;
}
shift();
// the lowest nibble holds the next bit position; consume it
int bitIndex = (indexArray & 0x0f) + wordShift;
indexArray >>>= 4;
// should i<<6 be cached as a separate variable?
// it would only save one cycle in the best circumstances.
curDocId = (i<<6) + bitIndex;
return true;
}
/** Returns the bit index stored by the last call to next() or skipTo(); -1 after either of them returned false. */
public int doc() {
return this.curDocId;
}
}

View File

@ -0,0 +1,218 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.BitSet;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
/**
* Store and iterate sorted integers in compressed form in RAM.
* <br>The code for compressing the differences between ascending integers was
* borrowed from {@link org.apache.lucene.store.IndexInput} and
* {@link org.apache.lucene.store.IndexOutput}.
*/
public class SortedVIntList extends DocIdSet {
/** When a BitSet has fewer than 1 in BITS2VINTLIST_SIZE bits set,
* a SortedVIntList representing the index numbers of the set bits
* will be smaller than that BitSet.
*/
final static int BITS2VINTLIST_SIZE = 8;
private int size;
private byte[] bytes;
private int lastBytePos;
/**
 * Create a SortedVIntList from all elements of an array of integers.
 * Delegates to the two-argument constructor with the full array length.
 *
 * @param sortedInts A sorted array of non negative integers.
 */
public SortedVIntList(int[] sortedInts) {
this(sortedInts, sortedInts.length);
}
/**
* Create a SortedVIntList from an array of integers.
* @param sortedInts An array of sorted non negative integers.
* @param inputSize The number of integers to be used from the array.
*/
public SortedVIntList(int[] sortedInts, int inputSize) {
SortedVIntListBuilder builder = new SortedVIntListBuilder();
for (int i = 0; i < inputSize; i++) {
builder.addInt(sortedInts[i]);
}
builder.done();
}
/**
 * Create a SortedVIntList holding the indexes of the set bits of a BitSet.
 *
 * @param bits A bit set representing a set of integers.
 */
public SortedVIntList(BitSet bits) {
  final SortedVIntListBuilder sink = new SortedVIntListBuilder();
  // nextSetBit reports -1 once there is no further set bit
  for (int bit = bits.nextSetBit(0); bit != -1; bit = bits.nextSetBit(bit + 1)) {
    sink.addInt(bit);
  }
  sink.done();
}
/**
 * Create a SortedVIntList holding the indexes of the set bits of an OpenBitSet.
 *
 * @param bits A bit set representing a set of integers.
 */
public SortedVIntList(OpenBitSet bits) {
  final SortedVIntListBuilder sink = new SortedVIntListBuilder();
  // the scan ends when nextSetBit reports -1
  for (int bit = bits.nextSetBit(0); bit != -1; bit = bits.nextSetBit(bit + 1)) {
    sink.addInt(bit);
  }
  sink.done();
}
/**
* Create a SortedVIntList.
* @param docIdSetIterator An iterator providing document numbers as a set of integers.
* This DocIdSetIterator is iterated completely when this constructor
* is called and it must provide the integers in non
* decreasing order.
*/
public SortedVIntList(DocIdSetIterator docIdSetIterator) throws IOException {
SortedVIntListBuilder builder = new SortedVIntListBuilder();
while (docIdSetIterator.next()) {
builder.addInt(docIdSetIterator.doc());
}
builder.done();
}
/**
 * Fills the enclosing list: every added integer is stored as the
 * VInt-encoded difference from the integer added before it.
 */
private class SortedVIntListBuilder {
  /** The most recently added integer; 0 before the first addition. */
  private int lastInt = 0;

  /** Resets the enclosing list's byte array and counters. */
  SortedVIntListBuilder() {
    initBytes();
    lastInt = 0;
  }

  /**
   * Appends one integer to the list.
   *
   * @param nextInt the integer to add; must not be smaller than the one
   *                added before it (or than 0 for the first one)
   * @throws IllegalArgumentException if nextInt is out of order or negative
   */
  void addInt(int nextInt) {
    // Compare the values directly rather than testing the sign of their
    // difference: for a very negative nextInt the subtraction overflows
    // to a positive value and bad input would slip through.
    if (nextInt < lastInt) {
      throw new IllegalArgumentException(
          "Input not sorted or first element negative.");
    }
    int diff = nextInt - lastInt; // cannot overflow: 0 <= lastInt <= nextInt

    if ((lastBytePos + MAX_BYTES_PER_INT) > bytes.length) {
      // biggest possible int does not fit
      resizeBytes((bytes.length * 2) + MAX_BYTES_PER_INT);
    }

    // See org.apache.lucene.store.IndexOutput.writeVInt()
    while ((diff & ~VB1) != 0) { // The high bit of the next byte needs to be set.
      bytes[lastBytePos++] = (byte) ((diff & VB1) | ~VB1);
      diff >>>= BIT_SHIFT;
    }
    bytes[lastBytePos++] = (byte) diff; // Last byte, high bit not set.
    size++;
    lastInt = nextInt;
  }

  /** Trims the byte array to the number of bytes actually written. */
  void done() {
    resizeBytes(lastBytePos);
  }
}
/** Resets this list to empty with a fresh 128 byte buffer. */
private void initBytes() {
  bytes = new byte[128]; // initial byte size
  lastBytePos = 0;
  size = 0;
}
/**
 * Replaces the byte buffer by one of exactly {@code newSize} bytes,
 * keeping the bytes written so far. Does nothing when the buffer
 * already has that size.
 */
private void resizeBytes(int newSize) {
  if (newSize == bytes.length) {
    return;
  }
  final byte[] resized = new byte[newSize];
  System.arraycopy(bytes, 0, resized, 0, lastBytePos);
  bytes = resized;
}
private static final int VB1 = 0x7F;
private static final int BIT_SHIFT = 7;
private final int MAX_BYTES_PER_INT = (31 / BIT_SHIFT) + 1;
/**
 * @return The total number of sorted integers, i.e. the number of
 *         integers that were added when this list was built.
 */
public int size() {
return size;
}
/**
 * @return The size of the byte array storing the compressed sorted integers.
 *         This is the length of the array itself.
 */
public int getByteSize() {
return bytes.length;
}
/**
 * @return An iterator over the sorted integers. It decodes the stored
 *         differences on the fly, so skipTo() scans linearly.
 */
public DocIdSetIterator iterator() {
  return new DocIdSetIterator() {
    int bytePos = 0;
    int lastInt = 0;

    /** Decodes the next difference and adds it to lastInt. */
    private void advance() {
      // See org.apache.lucene.store.IndexInput.readVInt()
      int shift = 0;
      byte current;
      do {
        current = bytes[bytePos++];
        lastInt += (current & VB1) << shift;
        shift += BIT_SHIFT;
      } while ((current & ~VB1) != 0); // high bit set: another byte follows
    }

    public int doc() {
      return lastInt;
    }

    public boolean next() {
      if (bytePos < lastBytePos) {
        advance();
        return true;
      }
      return false;
    }

    public boolean skipTo(int docNr) {
      // No skipping to docNr available: decode until it is reached.
      while (bytePos < lastBytePos) {
        advance();
        if (lastInt >= docNr) {
          return true;
        }
      }
      return false;
    }
  };
}
}

View File

@ -43,13 +43,13 @@ public class CachingWrapperFilterHelper extends CachingWrapperFilter {
this.shouldHaveCache = shouldHaveCache;
}
public BitSet bits(IndexReader reader) throws IOException {
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
if (cache == null) {
cache = new WeakHashMap();
}
synchronized (cache) { // check cache
BitSet cached = (BitSet) cache.get(reader);
DocIdSet cached = (DocIdSet) cache.get(reader);
if (shouldHaveCache) {
TestCase.assertNotNull("Cache should have data ", cached);
} else {
@ -60,7 +60,7 @@ public class CachingWrapperFilterHelper extends CachingWrapperFilter {
}
}
final BitSet bits = filter.bits(reader);
final DocIdSet bits = filter.getDocIdSet(reader);
synchronized (cache) { // update cache
cache.put(reader, bits);

View File

@ -18,14 +18,15 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.DocIdBitSet;
import java.util.BitSet;
public class MockFilter extends Filter {
private boolean wasCalled;
public BitSet bits(IndexReader reader) {
public DocIdSet getDocIdSet(IndexReader reader) {
wasCalled = true;
return new BitSet();
return new DocIdBitSet(new BitSet());
}
public void clear() {

View File

@ -42,7 +42,7 @@ public class RemoteCachingWrapperFilterHelper extends RemoteCachingWrapperFilter
this.shouldHaveCache = shouldHaveCache;
}
public BitSet bits(IndexReader reader) throws IOException {
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
Filter cachedFilter = FilterManager.getInstance().getFilter(filter);
TestCase.assertNotNull("Filter should not be null", cachedFilter);
@ -55,6 +55,6 @@ public class RemoteCachingWrapperFilterHelper extends RemoteCachingWrapperFilter
if (filter instanceof CachingWrapperFilterHelper) {
((CachingWrapperFilterHelper)cachedFilter).setShouldHaveCache(shouldHaveCache);
}
return cachedFilter.bits(reader);
return cachedFilter.getDocIdSet(reader);
}
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.DocIdBitSet;
import java.util.BitSet;
import java.io.IOException;
@ -29,9 +30,9 @@ public class SingleDocTestFilter extends Filter {
this.doc = doc;
}
public BitSet bits(IndexReader reader) throws IOException {
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
BitSet bits = new BitSet(reader.maxDoc());
bits.set(doc);
return bits;
return new DocIdBitSet(bits);
}
}

View File

@ -36,12 +36,12 @@ public class TestCachingWrapperFilter extends LuceneTestCase {
CachingWrapperFilter cacher = new CachingWrapperFilter(filter);
// first time, nested filter is called
cacher.bits(reader);
cacher.getDocIdSet(reader);
assertTrue("first time", filter.wasCalled());
// second time, nested filter should not be called
filter.clear();
cacher.bits(reader);
cacher.getDocIdSet(reader);
assertFalse("second time", filter.wasCalled());
reader.close();

View File

@ -33,6 +33,7 @@ import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.DocIdBitSet;
import java.util.Random;
import java.util.BitSet;
@ -122,12 +123,12 @@ public class TestExplanations extends LuceneTestCase {
public ItemizedFilter(int[] docs) {
this.docs = docs;
}
public BitSet bits(IndexReader r) {
public DocIdSet getDocIdSet(IndexReader r) {
BitSet b = new BitSet(r.maxDoc());
for (int i = 0; i < docs.length; i++) {
b.set(docs[i]);
}
return b;
return new DocIdBitSet(b);
}
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.DocIdBitSet;
import java.util.BitSet;
@ -82,11 +83,11 @@ extends LuceneTestCase {
// must be static for serialization tests
private static Filter newStaticFilterB() {
return new Filter() {
public BitSet bits (IndexReader reader) {
public DocIdSet getDocIdSet (IndexReader reader) {
BitSet bitset = new BitSet(5);
bitset.set (1);
bitset.set (3);
return bitset;
return new DocIdBitSet(bitset);
}
};
}
@ -150,10 +151,10 @@ extends LuceneTestCase {
// must be static for serialization tests
private static Filter newStaticFilterA() {
return new Filter() {
public BitSet bits (IndexReader reader) {
public DocIdSet getDocIdSet (IndexReader reader) {
BitSet bitset = new BitSet(5);
bitset.set(0, 5);
return bitset;
return new DocIdBitSet(bitset);
}
};
}
@ -200,3 +201,4 @@ extends LuceneTestCase {
}

View File

@ -91,7 +91,7 @@ public class TestRemoteCachingWrapperFilter extends LuceneTestCase {
public void testTermRemoteFilter() throws Exception {
CachingWrapperFilterHelper cwfh = new CachingWrapperFilterHelper(new QueryFilter(new TermQuery(new Term("type", "a"))));
CachingWrapperFilterHelper cwfh = new CachingWrapperFilterHelper(new QueryWrapperFilter(new TermQuery(new Term("type", "a"))));
// This is what we are fixing - if one uses a CachingWrapperFilter(Helper) it will never
// cache the filter on the remote site
@ -112,16 +112,16 @@ public class TestRemoteCachingWrapperFilter extends LuceneTestCase {
// assert that we get the same cached Filter, even if we create a new instance of RemoteCachingWrapperFilter(Helper)
// this should pass because the Filter parameters are the same, and the cache uses Filter's hashCode() as cache keys,
// and Filters' hashCode() builds on Filter parameters, not the Filter instance itself
rcwfh = new RemoteCachingWrapperFilterHelper(new QueryFilter(new TermQuery(new Term("type", "a"))), false);
rcwfh = new RemoteCachingWrapperFilterHelper(new QueryWrapperFilter(new TermQuery(new Term("type", "a"))), false);
rcwfh.shouldHaveCache(false);
search(new TermQuery(new Term("test", "test")), rcwfh, 0, "A");
rcwfh = new RemoteCachingWrapperFilterHelper(new QueryFilter(new TermQuery(new Term("type", "a"))), false);
rcwfh = new RemoteCachingWrapperFilterHelper(new QueryWrapperFilter(new TermQuery(new Term("type", "a"))), false);
rcwfh.shouldHaveCache(true);
search(new TermQuery(new Term("test", "test")), rcwfh, 0, "A");
// assert that we get a non-cached version of the Filter because this is a new Query (type:b)
rcwfh = new RemoteCachingWrapperFilterHelper(new QueryFilter(new TermQuery(new Term("type", "b"))), false);
rcwfh = new RemoteCachingWrapperFilterHelper(new QueryWrapperFilter(new TermQuery(new Term("type", "b"))), false);
rcwfh.shouldHaveCache(false);
search(new TermQuery(new Term("type", "b")), rcwfh, 0, "B");
}

View File

@ -116,11 +116,11 @@ public class TestRemoteSearchable extends LuceneTestCase {
Searcher searcher = new MultiSearcher(searchables);
Hits hits = searcher.search(
new TermQuery(new Term("test", "text")),
new QueryFilter(new TermQuery(new Term("test", "test"))));
new QueryWrapperFilter(new TermQuery(new Term("test", "test"))));
assertEquals(1, hits.length());
Hits nohits = searcher.search(
new TermQuery(new Term("test", "text")),
new QueryFilter(new TermQuery(new Term("test", "non-existent-term"))));
new QueryWrapperFilter(new TermQuery(new Term("test", "non-existent-term"))));
assertEquals(0, nohits.length());
}
@ -129,7 +129,7 @@ public class TestRemoteSearchable extends LuceneTestCase {
Searchable[] searchables = { getRemote() };
Searcher searcher = new MultiSearcher(searchables);
Hits hits = searcher.search(
new ConstantScoreQuery(new QueryFilter(
new ConstantScoreQuery(new QueryWrapperFilter(
new TermQuery(new Term("test", "test")))));
assertEquals(1, hits.length());
}

View File

@ -1,5 +1,6 @@
package org.apache.lucene.search;
import org.apache.lucene.util.DocIdBitSet;
import org.apache.lucene.util.LuceneTestCase;
import java.util.Random;
@ -95,16 +96,6 @@ public class TestScorerPerf extends LuceneTestCase {
return sets;
}
public static class BitSetFilter extends Filter {
public BitSet set;
public BitSetFilter(BitSet set) {
this.set = set;
}
public BitSet bits(IndexReader reader) throws IOException {
return set;
}
}
public static class CountingHitCollector extends HitCollector {
int count=0;
int sum=0;
@ -137,8 +128,12 @@ public class TestScorerPerf extends LuceneTestCase {
BitSet addClause(BooleanQuery bq, BitSet result) {
BitSet rnd = sets[r.nextInt(sets.length)];
Query q = new ConstantScoreQuery(new BitSetFilter(rnd));
final BitSet rnd = sets[r.nextInt(sets.length)];
Query q = new ConstantScoreQuery(new Filter() {
public DocIdSet getDocIdSet(IndexReader reader) {
return new DocIdBitSet(rnd);
};
});
bq.add(q, BooleanClause.Occur.MUST);
if (validate) {
if (result==null) result = (BitSet)rnd.clone();

View File

@ -28,6 +28,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.DocIdBitSet;
import java.io.IOException;
import java.io.Serializable;
@ -571,10 +572,10 @@ implements Serializable {
// a filter that only allows through the first hit
Filter filt = new Filter() {
public BitSet bits(IndexReader reader) throws IOException {
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
BitSet bs = new BitSet(reader.maxDoc());
bs.set(docs1.scoreDocs[0].doc);
return bs;
return new DocIdBitSet(bs);
}
};

View File

@ -56,20 +56,36 @@ public class TestSpanQueryFilter extends LuceneTestCase {
SpanTermQuery query = new SpanTermQuery(new Term("field", English.intToEnglish(10).trim()));
SpanQueryFilter filter = new SpanQueryFilter(query);
SpanFilterResult result = filter.bitSpans(reader);
BitSet bits = result.getBits();
assertTrue("bits is null and it shouldn't be", bits != null);
assertTrue("tenth bit is not on", bits.get(10));
DocIdSet docIdSet = result.getDocIdSet();
assertTrue("docIdSet is null and it shouldn't be", docIdSet != null);
assertContainsDocId("docIdSet doesn't contain docId 10", docIdSet, 10);
List spans = result.getPositions();
assertTrue("spans is null and it shouldn't be", spans != null);
assertTrue("spans Size: " + spans.size() + " is not: " + bits.cardinality(), spans.size() == bits.cardinality());
int size = getDocIdSetSize(docIdSet);
assertTrue("spans Size: " + spans.size() + " is not: " + size, spans.size() == size);
for (Iterator iterator = spans.iterator(); iterator.hasNext();) {
SpanFilterResult.PositionInfo info = (SpanFilterResult.PositionInfo) iterator.next();
assertTrue("info is null and it shouldn't be", info != null);
//The doc should indicate the bit is on
assertTrue("Bit is not on and it should be", bits.get(info.getDoc()));
assertContainsDocId("docIdSet doesn't contain docId " + info.getDoc(), docIdSet, info.getDoc());
//There should be two positions in each
assertTrue("info.getPositions() Size: " + info.getPositions().size() + " is not: " + 2, info.getPositions().size() == 2);
}
reader.close();
}
/** Counts the documents of {@code docIdSet} by iterating over it completely. */
int getDocIdSetSize(DocIdSet docIdSet) throws Exception {
  int count = 0;
  for (DocIdSetIterator it = docIdSet.iterator(); it.next(); ) {
    count++;
  }
  return count;
}
/**
 * Asserts that {@code docIdSet} contains {@code docId}: a fresh iterator must be
 * able to skip to docId and must then be positioned exactly on it.
 */
public void assertContainsDocId(String msg, DocIdSet docIdSet, int docId) throws Exception {
DocIdSetIterator it = docIdSet.iterator();
assertTrue(msg, it.skipTo(docId));
assertTrue(msg, it.doc() == docId);
}
}

View File

@ -0,0 +1,203 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
import junit.framework.TestCase;
import java.util.Random;
import java.util.BitSet;
/**
* @version $Id$
*/
public class TestOpenBitSet extends TestCase {
static Random rand = new Random();
/** Checks that get() agrees for both sets on every index below a.size(). */
void doGet(BitSet a, OpenBitSet b) {
  final int limit = a.size();
  int i = 0;
  while (i < limit) {
    final boolean same = a.get(i) == b.get(i);
    if (!same) {
      fail("mismatch: BitSet=["+i+"]="+a.get(i));
    }
    i++;
  }
}
/** Walks both sets with nextSetBit() in lock step and checks they agree, including on the final -1. */
void doNextSetBit(BitSet a, OpenBitSet b) {
  int expected = -1;
  int actual = -1;
  while (true) {
    expected = a.nextSetBit(expected + 1);
    actual = b.nextSetBit(actual + 1);
    assertEquals(expected, actual);
    if (expected < 0) {
      break;
    }
  }
}
/**
 * Checks an OpenBitSetIterator against BitSet.nextSetBit(), randomly
 * interleaving next() and skipTo() calls on the iterator.
 */
void doIterate(BitSet a, OpenBitSet b) {
  final OpenBitSetIterator iterator = new OpenBitSetIterator(b);
  int expected = -1;
  int actual = -1;
  while (true) {
    expected = a.nextSetBit(expected + 1);
    if (rand.nextBoolean()) {
      iterator.next();
    } else {
      iterator.skipTo(actual + 1);
    }
    actual = iterator.doc();
    assertEquals(expected, actual);
    if (expected < 0) {
      break;
    }
  }
}
/**
 * Randomized comparison of OpenBitSet against java.util.BitSet.
 * Each iteration builds a BitSet and an OpenBitSet of the same random size,
 * applies the same random single-bit and range operations to both and checks
 * that they stay in agreement. From the second iteration on, the sets are
 * also combined with those of the previous iteration.
 *
 * @param maxSize exclusive upper bound for the random size of each set
 * @param iter    number of iterations to run
 */
void doRandomSets(int maxSize, int iter) {
// the sets of the previous iteration, used for the binary operations below
BitSet a0=null;
OpenBitSet b0=null;
for (int i=0; i<iter; i++) {
int sz = rand.nextInt(maxSize);
BitSet a = new BitSet(sz);
OpenBitSet b = new OpenBitSet(sz);
// test the various ways of setting bits
if (sz>0) {
int nOper = rand.nextInt(sz);
for (int j=0; j<nOper; j++) {
int idx;
idx = rand.nextInt(sz);
a.set(idx);
b.fastSet(idx);
idx = rand.nextInt(sz);
a.clear(idx);
b.fastClear(idx);
idx = rand.nextInt(sz);
a.flip(idx);
b.fastFlip(idx);
// two flips cancel out, so b still matches a after this pair
boolean val = b.flipAndGet(idx);
boolean val2 = b.flipAndGet(idx);
assertTrue(val != val2);
val = b.getAndSet(idx);
assertTrue(val2 == val);
assertTrue(b.get(idx));
// undo the set if the bit was clear before getAndSet
if (!val) b.fastClear(idx);
assertTrue(b.get(idx) == val);
}
}
// test that the various ways of accessing the bits are equivalent
doGet(a,b);
// test ranges, including possible extension
int fromIndex, toIndex;
fromIndex = rand.nextInt(sz+80);
toIndex = fromIndex + rand.nextInt((sz>>1)+1);
BitSet aa = (BitSet)a.clone(); aa.flip(fromIndex,toIndex);
OpenBitSet bb = (OpenBitSet)b.clone(); bb.flip(fromIndex,toIndex);
doIterate(aa,bb); // a problem here is from flip or doIterate
fromIndex = rand.nextInt(sz+80);
toIndex = fromIndex + rand.nextInt((sz>>1)+1);
aa = (BitSet)a.clone(); aa.clear(fromIndex,toIndex);
bb = (OpenBitSet)b.clone(); bb.clear(fromIndex,toIndex);
doNextSetBit(aa,bb); // a problem here is from clear() or nextSetBit
fromIndex = rand.nextInt(sz+80);
toIndex = fromIndex + rand.nextInt((sz>>1)+1);
aa = (BitSet)a.clone(); aa.set(fromIndex,toIndex);
bb = (OpenBitSet)b.clone(); bb.set(fromIndex,toIndex);
doNextSetBit(aa,bb); // a problem here is from set() or nextSetBit
// binary operations against the sets of the previous iteration
if (a0 != null) {
assertEquals( a.equals(a0), b.equals(b0));
assertEquals(a.cardinality(), b.cardinality());
BitSet a_and = (BitSet)a.clone(); a_and.and(a0);
BitSet a_or = (BitSet)a.clone(); a_or.or(a0);
BitSet a_xor = (BitSet)a.clone(); a_xor.xor(a0);
BitSet a_andn = (BitSet)a.clone(); a_andn.andNot(a0);
OpenBitSet b_and = (OpenBitSet)b.clone(); assertEquals(b,b_and); b_and.and(b0);
OpenBitSet b_or = (OpenBitSet)b.clone(); b_or.or(b0);
OpenBitSet b_xor = (OpenBitSet)b.clone(); b_xor.xor(b0);
OpenBitSet b_andn = (OpenBitSet)b.clone(); b_andn.andNot(b0);
doIterate(a_and,b_and);
doIterate(a_or,b_or);
doIterate(a_xor,b_xor);
doIterate(a_andn,b_andn);
assertEquals(a_and.cardinality(), b_and.cardinality());
assertEquals(a_or.cardinality(), b_or.cardinality());
assertEquals(a_xor.cardinality(), b_xor.cardinality());
assertEquals(a_andn.cardinality(), b_andn.cardinality());
// test non-mutating popcounts
assertEquals(b_and.cardinality(), OpenBitSet.intersectionCount(b,b0));
assertEquals(b_or.cardinality(), OpenBitSet.unionCount(b,b0));
assertEquals(b_xor.cardinality(), OpenBitSet.xorCount(b,b0));
assertEquals(b_andn.cardinality(), OpenBitSet.andNotCount(b,b0));
}
a0=a;
b0=b;
}
}
// large enough to flush obvious bugs, small enough to run in <.5 sec as part of a
// larger testsuite.
/** Runs 1000 random iterations with sets of fewer than 1200 bits. */
public void testSmall() {
doRandomSets(1200,1000);
}
/** Placeholder for a longer randomized run; the body is intentionally disabled. */
public void testBig() {
// uncomment to run a bigger test (~2 minutes).
// doRandomSets(2000,200000);
}
/**
 * Checks equals() in both directions for two sets created with different
 * sizes, as bits are added to one and then the other.
 */
public void testEquals() {
  final OpenBitSet small = new OpenBitSet(1111);
  final OpenBitSet large = new OpenBitSet(2222);

  // both empty: equal despite the different sizes
  assertTrue(small.equals(large));
  assertTrue(large.equals(small));

  // a bit that fits into both sets
  small.set(10);
  assertFalse(small.equals(large));
  assertFalse(large.equals(small));
  large.set(10);
  assertTrue(small.equals(large));
  assertTrue(large.equals(small));

  // a bit beyond the size the smaller set was created with
  large.set(2221);
  assertFalse(small.equals(large));
  assertFalse(large.equals(small));
  small.set(2221);
  assertTrue(small.equals(large));
  assertTrue(large.equals(small));

  // try different type of object
  assertFalse(small.equals(new Object()));
}
}

View File

@ -0,0 +1,198 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.BitSet;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.textui.TestRunner;
import org.apache.lucene.search.DocIdSetIterator;
public class TestSortedVIntList extends TestCase {
/** Main for running test case by itself: runs all tests of this class with the JUnit text runner. */
public static void main(String args[]) {
TestRunner.run(new TestSuite(TestSortedVIntList.class));
}
/**
 * Checks that the iterator of {@code vintList} yields exactly {@code ints},
 * in order. Inputs with two equal neighbours are not checked at all.
 */
void tstIterator (
    SortedVIntList vintList,
    int[] ints) throws IOException {
  for (int i = 1; i < ints.length; i++) {
    if (ints[i-1] == ints[i]) {
      return; // DocNrSkipper should not skip to same document.
    }
  }
  final DocIdSetIterator m = vintList.iterator();
  int i = 0;
  while (i < ints.length) {
    assertTrue("No end of Matcher at: " + i, m.next());
    assertEquals(ints[i], m.doc());
    i++;
  }
  assertFalse("End of Matcher", m.next());
}
/**
 * Checks the element count, the byte size and the iteration order of
 * {@code vintList} against the expected values.
 *
 * @param vintList         the list under test
 * @param ints             the integers the list is expected to hold
 * @param expectedByteSize the expected result of getByteSize()
 */
void tstVIntList(
SortedVIntList vintList,
int[] ints,
int expectedByteSize) throws IOException {
assertEquals("Size", ints.length, vintList.size());
assertEquals("Byte size", expectedByteSize, vintList.getByteSize());
tstIterator(vintList, ints);
}
public void tstViaBitSet(int [] ints, int expectedByteSize) throws IOException {
final int MAX_INT_FOR_BITSET = 1024 * 1024;
BitSet bs = new BitSet();
for (int i = 0; i < ints.length; i++) {
if (ints[i] > MAX_INT_FOR_BITSET) {
return; // BitSet takes too much memory
}
if ((i > 0) && (ints[i-1] == ints[i])) {
return; // BitSet cannot store duplicate.
}
bs.set(ints[i]);
}
SortedVIntList svil = new SortedVIntList(bs);
tstVIntList(svil, ints, expectedByteSize);
tstVIntList(new SortedVIntList(svil.iterator()), ints, expectedByteSize);
}
private static final int VB1 = 0x7F;
private static final int BIT_SHIFT = 7;
private static final int VB2 = (VB1 << BIT_SHIFT) | VB1;
private static final int VB3 = (VB2 << BIT_SHIFT) | VB1;
private static final int VB4 = (VB3 << BIT_SHIFT) | VB1;
private int vIntByteSize(int i) {
assert i >= 0;
if (i <= VB1) return 1;
if (i <= VB2) return 2;
if (i <= VB3) return 3;
if (i <= VB4) return 4;
return 5;
}
private int vIntListByteSize(int [] ints) {
int byteSize = 0;
int last = 0;
for (int i = 0; i < ints.length; i++) {
byteSize += vIntByteSize(ints[i] - last);
last = ints[i];
}
return byteSize;
}
public void tstInts(int [] ints) {
int expectedByteSize = vIntListByteSize(ints);
try {
tstVIntList(new SortedVIntList(ints), ints, expectedByteSize);
tstViaBitSet(ints, expectedByteSize);
} catch (IOException ioe) {
throw new Error(ioe);
}
}
public void tstIllegalArgExc(int [] ints) {
try {
new SortedVIntList(ints);
}
catch (IllegalArgumentException e) {
return;
}
fail("Expected IllegalArgumentException");
}
private int[] fibArray(int a, int b, int size) {
final int[] fib = new int[size];
fib[0] = a;
fib[1] = b;
for (int i = 2; i < size; i++) {
fib[i] = fib[i-1] + fib[i-2];
}
return fib;
}
private int[] reverseDiffs(int []ints) { // reverse the order of the successive differences
final int[] res = new int[ints.length];
for (int i = 0; i < ints.length; i++) {
res[i] = ints[ints.length - 1] + (ints[0] - ints[ints.length - 1 - i]);
}
return res;
}
public void test01() {
tstInts(new int[] {});
}
public void test02() {
tstInts(new int[] {0});
}
public void test03() {
tstInts(new int[] {0,Integer.MAX_VALUE});
}
public void test04a() {
tstInts(new int[] {0, VB2 - 1});
}
public void test04b() {
tstInts(new int[] {0, VB2});
}
public void test04c() {
tstInts(new int[] {0, VB2 + 1});
}
public void test05() {
tstInts(fibArray(0,1,7)); // includes duplicate value 1
}
public void test05b() {
tstInts(reverseDiffs(fibArray(0,1,7)));
}
public void test06() {
tstInts(fibArray(1,2,45)); // no duplicates, size 46 exceeds max int.
}
public void test06b() {
tstInts(reverseDiffs(fibArray(1,2,45)));
}
public void test07a() {
tstInts(new int[] {0, VB3});
}
public void test07b() {
tstInts(new int[] {1, VB3 + 2});
}
public void test07c() {
tstInts(new int[] {2, VB3 + 4});
}
public void test08a() {
tstInts(new int[] {0, VB4 + 1});
}
public void test08b() {
tstInts(new int[] {1, VB4 + 1});
}
public void test08c() {
tstInts(new int[] {2, VB4 + 1});
}
public void test10() {
tstIllegalArgExc(new int[] {-1});
}
public void test11() {
tstIllegalArgExc(new int[] {1,0});
}
public void test12() {
tstIllegalArgExc(new int[] {0,1,1,2,3,5,8,0});
}
}