diff --git a/CHANGES.txt b/CHANGES.txt index ef1285cc8a5..bd907a2fb6c 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -15,12 +15,16 @@ API Changes 2. LUCENE-1150: Re-expose StandardTokenizer's constants publicly; this was accidentally lost with LUCENE-966. (Nicolas Lalevée via Mike McCandless) + + 3. LUCENE-584: Changed Filter API to return a DocIdSet instead of a + java.util.BitSet. This allows using more efficient data structures + for Filters and makes them more flexible. (Paul Elschot, Michael Busch) Bug fixes New features -1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis + 1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis process. The flag is not indexed/stored and is thus only used by analysis. 2. LUCENE-1147: Add -segment option to CheckIndex tool so you can @@ -28,6 +32,12 @@ New features McCandless) 3. LUCENE-1045: Reopened this issue to add support for short and bytes. + + 4. LUCENE-584: Added new data structures to o.a.l.util, such as + OpenBitSet and SortedVIntList. These extend DocIdSet and can + directly be used for Filters with the new Filter API. Also changed + the core Filters to use OpenBitSet instead of java.util.BitSet. 
+ (Paul Elschot, Michael Busch) Optimizations diff --git a/contrib/miscellaneous/src/test/org/apache/lucene/misc/ChainedFilterTest.java b/contrib/miscellaneous/src/test/org/apache/lucene/misc/ChainedFilterTest.java index 525e8130d9a..734aa66669f 100644 --- a/contrib/miscellaneous/src/test/org/apache/lucene/misc/ChainedFilterTest.java +++ b/contrib/miscellaneous/src/test/org/apache/lucene/misc/ChainedFilterTest.java @@ -37,8 +37,8 @@ public class ChainedFilterTest extends TestCase { private Query query; // private DateFilter dateFilter; DateFilter was deprecated and removed private RangeFilter dateFilter; - private QueryFilter bobFilter; - private QueryFilter sueFilter; + private QueryWrapperFilter bobFilter; + private QueryWrapperFilter sueFilter; public void setUp() throws Exception { directory = new RAMDirectory(); @@ -74,9 +74,9 @@ public class ChainedFilterTest extends TestCase { // just treat dates as strings and select the whole range for now... dateFilter = new RangeFilter("date","","ZZZZ",true,true); - bobFilter = new QueryFilter( + bobFilter = new QueryWrapperFilter( new TermQuery(new Term("owner", "bob"))); - sueFilter = new QueryFilter( + sueFilter = new QueryWrapperFilter( new TermQuery(new Term("owner", "sue"))); } diff --git a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/CachedFilterBuilder.java b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/CachedFilterBuilder.java index ce63169ca8d..08e896b4571 100644 --- a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/CachedFilterBuilder.java +++ b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/CachedFilterBuilder.java @@ -8,7 +8,7 @@ import java.util.Map.Entry; import org.apache.lucene.search.CachingWrapperFilter; import org.apache.lucene.search.Filter; import org.apache.lucene.search.Query; -import org.apache.lucene.search.QueryFilter; +import org.apache.lucene.search.QueryWrapperFilter; import 
org.apache.lucene.xmlparser.DOMUtils; import org.apache.lucene.xmlparser.FilterBuilder; import org.apache.lucene.xmlparser.FilterBuilderFactory; @@ -105,7 +105,7 @@ public class CachedFilterBuilder implements FilterBuilder { //cache miss if (qb != null) { - cachedFilter = new QueryFilter(q); + cachedFilter = new QueryWrapperFilter(q); } else { cachedFilter = new CachingWrapperFilter(f); diff --git a/src/java/org/apache/lucene/search/CachingSpanFilter.java b/src/java/org/apache/lucene/search/CachingSpanFilter.java index dcd9e3fab12..aac5207dc8c 100644 --- a/src/java/org/apache/lucene/search/CachingSpanFilter.java +++ b/src/java/org/apache/lucene/search/CachingSpanFilter.java @@ -43,11 +43,19 @@ public class CachingSpanFilter extends SpanFilter { this.filter = filter; } + /** + * @deprecated Use {@link #getDocIdSet(IndexReader)} instead. + */ public BitSet bits(IndexReader reader) throws IOException { SpanFilterResult result = getCachedResult(reader); return result != null ? result.getBits() : null; } - + + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { + SpanFilterResult result = getCachedResult(reader); + return result != null ? result.getDocIdSet() : null; + } + private SpanFilterResult getCachedResult(IndexReader reader) throws IOException { SpanFilterResult result = null; if (cache == null) { diff --git a/src/java/org/apache/lucene/search/CachingWrapperFilter.java b/src/java/org/apache/lucene/search/CachingWrapperFilter.java index b6c0564fb60..92fe7cd1e3b 100644 --- a/src/java/org/apache/lucene/search/CachingWrapperFilter.java +++ b/src/java/org/apache/lucene/search/CachingWrapperFilter.java @@ -43,6 +43,9 @@ public class CachingWrapperFilter extends Filter { this.filter = filter; } + /** + * @deprecated Use {@link #getDocIdSet(IndexReader)} instead. 
+ */ public BitSet bits(IndexReader reader) throws IOException { if (cache == null) { cache = new WeakHashMap(); @@ -63,6 +66,28 @@ public class CachingWrapperFilter extends Filter { return bits; } + + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { + if (cache == null) { + cache = new WeakHashMap(); + } + + synchronized (cache) { // check cache + DocIdSet cached = (DocIdSet) cache.get(reader); + if (cached != null) { + return cached; + } + } + + final DocIdSet docIdSet = filter.getDocIdSet(reader); + + synchronized (cache) { // update cache + cache.put(reader, docIdSet); + } + + return docIdSet; + + } public String toString() { return "CachingWrapperFilter("+filter+")"; diff --git a/src/java/org/apache/lucene/search/ConstantScoreQuery.java b/src/java/org/apache/lucene/search/ConstantScoreQuery.java index b8658d0dbec..1a28180894c 100644 --- a/src/java/org/apache/lucene/search/ConstantScoreQuery.java +++ b/src/java/org/apache/lucene/search/ConstantScoreQuery.java @@ -85,7 +85,7 @@ public class ConstantScoreQuery extends Query { public Explanation explain(IndexReader reader, int doc) throws IOException { ConstantScorer cs = (ConstantScorer)scorer(reader); - boolean exists = cs.bits.get(doc); + boolean exists = cs.docIdSetIterator.skipTo(doc) && (cs.docIdSetIterator.doc() == doc); ComplexExplanation result = new ComplexExplanation(); @@ -107,23 +107,22 @@ public class ConstantScoreQuery extends Query { } protected class ConstantScorer extends Scorer { - final BitSet bits; + final DocIdSetIterator docIdSetIterator; final float theScore; int doc=-1; public ConstantScorer(Similarity similarity, IndexReader reader, Weight w) throws IOException { super(similarity); theScore = w.getValue(); - bits = filter.bits(reader); + docIdSetIterator = filter.getDocIdSet(reader).iterator(); } public boolean next() throws IOException { - doc = bits.nextSetBit(doc+1); - return doc >= 0; + return docIdSetIterator.next(); } public int doc() { - return doc; + return 
docIdSetIterator.doc(); } public float score() throws IOException { @@ -131,8 +130,7 @@ public class ConstantScoreQuery extends Query { } public boolean skipTo(int target) throws IOException { - doc = bits.nextSetBit(target); // requires JDK 1.4 - return doc >= 0; + return docIdSetIterator.skipTo(target); } public Explanation explain(int doc) throws IOException { @@ -170,3 +168,4 @@ public class ConstantScoreQuery extends Query { } + diff --git a/src/java/org/apache/lucene/search/DocIdSet.java b/src/java/org/apache/lucene/search/DocIdSet.java new file mode 100644 index 00000000000..239c6c0d87f --- /dev/null +++ b/src/java/org/apache/lucene/search/DocIdSet.java @@ -0,0 +1,27 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * A DocIdSet contains a set of doc ids. Implementing classes must provide + * a {@link DocIdSetIterator} to access the set. 
+ */ +public abstract class DocIdSet { + public abstract DocIdSetIterator iterator(); +} diff --git a/src/java/org/apache/lucene/search/DocIdSetIterator.java b/src/java/org/apache/lucene/search/DocIdSetIterator.java new file mode 100644 index 00000000000..5f22ebf85f4 --- /dev/null +++ b/src/java/org/apache/lucene/search/DocIdSetIterator.java @@ -0,0 +1,49 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +/** + * This abstract class defines methods to iterate over a set of + * non-decreasing doc ids. + */ +public abstract class DocIdSetIterator { + /** Returns the current document number.

This is invalid until {@link + #next()} is called for the first time.*/ + public abstract int doc(); + + /** Moves to the next docId in the set. Returns true, iff + * there is such a docId. */ + public abstract boolean next() throws IOException; + + /** Skips entries to the first beyond the current whose document number is + * greater than or equal to target.

Returns true iff there is such + * an entry.

Behaves as if written:

+     *   boolean skipTo(int target) {
+     *     do {
+     *       if (!next())
+     *         return false;
+     *     } while (target > doc());
+     *     return true;
+     *   }
+     * 
+ * Some implementations are considerably more efficient than that. + */ + public abstract boolean skipTo(int target) throws IOException; +} diff --git a/src/java/org/apache/lucene/search/Filter.java b/src/java/org/apache/lucene/search/Filter.java index 008c15d9453..25cffcf8db5 100644 --- a/src/java/org/apache/lucene/search/Filter.java +++ b/src/java/org/apache/lucene/search/Filter.java @@ -20,11 +20,32 @@ package org.apache.lucene.search; import java.util.BitSet; import java.io.IOException; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.DocIdBitSet; -/** Abstract base class providing a mechanism to restrict searches to a subset - of an index. */ +/** Abstract base class providing a mechanism to use a subset of an index + * for restriction or permission of index search results. + *

+ * Note: In Lucene 3.0 {@link #bits(IndexReader)} will be removed + * and {@link #getDocIdSet(IndexReader)} will be defined as abstract. + * All implementing classes must therefore implement {@link #getDocIdSet(IndexReader)} + * in order to work with Lucene 3.0. + */ public abstract class Filter implements java.io.Serializable { - /** Returns a BitSet with true for documents which should be permitted in - search results, and false for those that should not. */ - public abstract BitSet bits(IndexReader reader) throws IOException; + /** + * @return A BitSet with true for documents which should be permitted in + * search results, and false for those that should not. + * @deprecated Use {@link #getDocIdSet(IndexReader)} instead. + */ + public BitSet bits(IndexReader reader) throws IOException { + return null; + } + + /** + * @return a DocIdSet that provides the documents which should be + * permitted or prohibited in search results. + * @see DocIdBitSet + */ + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { + return new DocIdBitSet(bits(reader)); + } } diff --git a/src/java/org/apache/lucene/search/FilteredQuery.java b/src/java/org/apache/lucene/search/FilteredQuery.java index 516f73a9f91..7729bef6044 100644 --- a/src/java/org/apache/lucene/search/FilteredQuery.java +++ b/src/java/org/apache/lucene/search/FilteredQuery.java @@ -21,7 +21,6 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; -import java.util.BitSet; import java.util.Set; @@ -47,7 +46,7 @@ extends Query { /** * Constructs a new query which applies a filter to the results of the original query. - * Filter.bits() will be called every time this query is used in a search. + * Filter.getDocIdSet() will be called every time this query is used in a search. * @param query Query to be filtered, cannot be null. * @param filter Filter to apply to query results, cannot be null. 
*/ @@ -86,13 +85,15 @@ extends Query { inner.addDetail(preBoost); } Filter f = FilteredQuery.this.filter; - BitSet matches = f.bits(ir); - if (matches.get(i)) + DocIdSetIterator docIdSetIterator = f.getDocIdSet(ir).iterator(); + if (docIdSetIterator.skipTo(i) && (docIdSetIterator.doc() == i)) { return inner; - Explanation result = new Explanation - (0.0f, "failure to match filter: " + f.toString()); - result.addDetail(inner); - return result; + } else { + Explanation result = new Explanation + (0.0f, "failure to match filter: " + f.toString()); + result.addDetail(inner); + return result; + } } // return this query @@ -100,50 +101,49 @@ extends Query { // return a filtering scorer public Scorer scorer (IndexReader indexReader) throws IOException { - final Scorer scorer = weight.scorer (indexReader); - final BitSet bitset = filter.bits (indexReader); - return new Scorer (similarity) { + final Scorer scorer = weight.scorer(indexReader); + final DocIdSetIterator docIdSetIterator = filter.getDocIdSet(indexReader).iterator(); - public boolean next() throws IOException { - do { - if (! scorer.next()) { + return new Scorer(similarity) { + + private boolean advanceToCommon() throws IOException { + while (scorer.doc() != docIdSetIterator.doc()) { + if (scorer.doc() < docIdSetIterator.doc()) { + if (!scorer.skipTo(docIdSetIterator.doc())) { + return false; + } + } else if (!docIdSetIterator.skipTo(scorer.doc())) { return false; } - } while (! bitset.get(scorer.doc())); - /* When skipTo() is allowed on scorer it should be used here - * in combination with bitset.nextSetBit(...) - * See the while loop in skipTo() below. - */ + } return true; } + + public boolean next() throws IOException { + return docIdSetIterator.next() && scorer.next() && advanceToCommon(); + } + public int doc() { return scorer.doc(); } public boolean skipTo(int i) throws IOException { - if (! scorer.skipTo(i)) { - return false; - } - while (! 
bitset.get(scorer.doc())) { - int nextFiltered = bitset.nextSetBit(scorer.doc() + 1); - if (nextFiltered == -1) { - return false; - } else if (! scorer.skipTo(nextFiltered)) { - return false; - } - } - return true; - } + return docIdSetIterator.skipTo(i) + && scorer.skipTo(docIdSetIterator.doc()) + && advanceToCommon(); + } public float score() throws IOException { return getBoost() * scorer.score(); } // add an explanation about whether the document was filtered public Explanation explain (int i) throws IOException { - Explanation exp = scorer.explain (i); - exp.setValue(getBoost() * exp.getValue()); + Explanation exp = scorer.explain(i); - if (bitset.get(i)) + if (docIdSetIterator.skipTo(i) && (docIdSetIterator.doc() == i)) { exp.setDescription ("allowed by filter: "+exp.getDescription()); - else + exp.setValue(getBoost() * exp.getValue()); + } else { exp.setDescription ("removed by filter: "+exp.getDescription()); + exp.setValue(0.0f); + } return exp; } }; diff --git a/src/java/org/apache/lucene/search/IndexSearcher.java b/src/java/org/apache/lucene/search/IndexSearcher.java index 031a8d55a77..cb9ef692355 100644 --- a/src/java/org/apache/lucene/search/IndexSearcher.java +++ b/src/java/org/apache/lucene/search/IndexSearcher.java @@ -128,22 +128,33 @@ public class IndexSearcher extends Searcher { // inherit javadoc public void search(Weight weight, Filter filter, final HitCollector results) throws IOException { - HitCollector collector = results; - if (filter != null) { - final BitSet bits = filter.bits(reader); - collector = new HitCollector() { - public final void collect(int doc, float score) { - if (bits.get(doc)) { // skip docs not in bits - results.collect(doc, score); - } - } - }; - } Scorer scorer = weight.scorer(reader); if (scorer == null) return; - scorer.score(collector); + + if (filter == null) { + scorer.score(results); + return; + } + + DocIdSetIterator docIdSetIterator = filter.getDocIdSet(reader).iterator(); // CHECKME: use ConjunctionScorer here? 
+ boolean more = docIdSetIterator.next(); + while (more) { + int filterDocId = docIdSetIterator.doc(); + if (! scorer.skipTo(filterDocId)) { + more = false; + } else { + int scorerDocId = scorer.doc(); + if (scorerDocId == filterDocId) { // permitted by filter + results.collect(scorerDocId, scorer.score()); + more = docIdSetIterator.skipTo(scorerDocId + 1); + } else { + more = docIdSetIterator.skipTo(scorerDocId); + } + } + } + } public Query rewrite(Query original) throws IOException { diff --git a/src/java/org/apache/lucene/search/PrefixFilter.java b/src/java/org/apache/lucene/search/PrefixFilter.java index a624163ce89..cac0ecd9eca 100755 --- a/src/java/org/apache/lucene/search/PrefixFilter.java +++ b/src/java/org/apache/lucene/search/PrefixFilter.java @@ -18,6 +18,7 @@ package org.apache.lucene.search; */ import org.apache.lucene.search.Filter; +import org.apache.lucene.util.OpenBitSet; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermEnum; @@ -39,6 +40,9 @@ public class PrefixFilter extends Filter { public Term getPrefix() { return prefix; } + /** + * @deprecated Use {@link #getDocIdSet(IndexReader)} instead. + */ public BitSet bits(IndexReader reader) throws IOException { final BitSet bitSet = new BitSet(reader.maxDoc()); new PrefixGenerator(prefix) { @@ -48,6 +52,16 @@ public class PrefixFilter extends Filter { }.generate(reader); return bitSet; } + + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { + final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc()); + new PrefixGenerator(prefix) { + public void handleDoc(int doc) { + bitSet.set(doc); + } + }.generate(reader); + return bitSet; + } /** Prints a user-readable version of this query. 
*/ public String toString () { @@ -105,3 +119,4 @@ abstract class PrefixGenerator implements IdGenerator { } + diff --git a/src/java/org/apache/lucene/search/QueryWrapperFilter.java b/src/java/org/apache/lucene/search/QueryWrapperFilter.java index 2db05572ad8..d36dac7769d 100644 --- a/src/java/org/apache/lucene/search/QueryWrapperFilter.java +++ b/src/java/org/apache/lucene/search/QueryWrapperFilter.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.BitSet; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.OpenBitSet; /** * Constrains search results to only match those which also match a provided @@ -44,6 +45,9 @@ public class QueryWrapperFilter extends Filter { this.query = query; } + /** + * @deprecated Use {@link #getDocIdSet(IndexReader)} instead. + */ public BitSet bits(IndexReader reader) throws IOException { final BitSet bits = new BitSet(reader.maxDoc()); @@ -54,6 +58,17 @@ public class QueryWrapperFilter extends Filter { }); return bits; } + + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { + final OpenBitSet bits = new OpenBitSet(reader.maxDoc()); + + new IndexSearcher(reader).search(query, new HitCollector() { + public final void collect(int doc, float score) { + bits.set(doc); // set bit for hit + } + }); + return bits; + } public String toString() { return "QueryWrapperFilter(" + query + ")"; diff --git a/src/java/org/apache/lucene/search/RangeFilter.java b/src/java/org/apache/lucene/search/RangeFilter.java index 75a65d77cbe..049f1fc8c21 100644 --- a/src/java/org/apache/lucene/search/RangeFilter.java +++ b/src/java/org/apache/lucene/search/RangeFilter.java @@ -21,6 +21,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermEnum; +import org.apache.lucene.util.OpenBitSet; import java.io.IOException; import java.util.BitSet; @@ -94,6 +95,7 @@ public class RangeFilter extends Filter { * 
Returns a BitSet with true for documents which should be * permitted in search results, and false for those that should * not. + * @deprecated Use {@link #getDocIdSet(IndexReader)} instead. */ public BitSet bits(IndexReader reader) throws IOException { BitSet bits = new BitSet(reader.maxDoc()); @@ -152,6 +154,68 @@ public class RangeFilter extends Filter { return bits; } + /** + * Returns a DocIdSet with documents that should be + * permitted in search results. + */ + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { + OpenBitSet bits = new OpenBitSet(reader.maxDoc()); + + TermEnum enumerator = + (null != lowerTerm + ? reader.terms(new Term(fieldName, lowerTerm)) + : reader.terms(new Term(fieldName,""))); + + try { + + if (enumerator.term() == null) { + return bits; + } + + boolean checkLower = false; + if (!includeLower) // make adjustments to set to exclusive + checkLower = true; + + TermDocs termDocs = reader.termDocs(); + try { + + do { + Term term = enumerator.term(); + if (term != null && term.field().equals(fieldName)) { + if (!checkLower || null==lowerTerm || term.text().compareTo(lowerTerm) > 0) { + checkLower = false; + if (upperTerm != null) { + int compare = upperTerm.compareTo(term.text()); + /* if beyond the upper term, or is exclusive and + * this is equal to the upper term, break out */ + if ((compare < 0) || + (!includeUpper && compare==0)) { + break; + } + } + /* we have a good term, find the docs */ + + termDocs.seek(enumerator.term()); + while (termDocs.next()) { + bits.set(termDocs.doc()); + } + } + } else { + break; + } + } + while (enumerator.next()); + + } finally { + termDocs.close(); + } + } finally { + enumerator.close(); + } + + return bits; + } + public String toString() { StringBuffer buffer = new StringBuffer(); buffer.append(fieldName); diff --git a/src/java/org/apache/lucene/search/RemoteCachingWrapperFilter.java b/src/java/org/apache/lucene/search/RemoteCachingWrapperFilter.java index 9ac6d40688b..a1dc1856bf3 
100644 --- a/src/java/org/apache/lucene/search/RemoteCachingWrapperFilter.java +++ b/src/java/org/apache/lucene/search/RemoteCachingWrapperFilter.java @@ -50,9 +50,21 @@ public class RemoteCachingWrapperFilter extends Filter { * searcher side of a remote connection. * @param reader the index reader for the Filter * @return the bitset + * @deprecated Use {@link #getDocIdSet(IndexReader)} instead. */ public BitSet bits(IndexReader reader) throws IOException { Filter cachedFilter = FilterManager.getInstance().getFilter(filter); return cachedFilter.bits(reader); } + + /** + * Uses the {@link FilterManager} to keep the cache for a filter on the + * searcher side of a remote connection. + * @param reader the index reader for the Filter + * @return the DocIdSet + */ + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { + Filter cachedFilter = FilterManager.getInstance().getFilter(filter); + return cachedFilter.getDocIdSet(reader); + } } diff --git a/src/java/org/apache/lucene/search/Scorer.java b/src/java/org/apache/lucene/search/Scorer.java index 2da2738633a..21cd5986746 100644 --- a/src/java/org/apache/lucene/search/Scorer.java +++ b/src/java/org/apache/lucene/search/Scorer.java @@ -33,7 +33,7 @@ import java.io.IOException; *

* @see BooleanQuery#setAllowDocsOutOfOrder */ -public abstract class Scorer { +public abstract class Scorer extends DocIdSetIterator { private Similarity similarity; /** Constructs a Scorer. @@ -76,65 +76,12 @@ public abstract class Scorer { return true; } - /** - * Advances to the document matching this Scorer with the lowest doc Id - * greater than the current value of {@link #doc()} (or to the matching - * document with the lowest doc Id if next has never been called on - * this Scorer). - * - *

- * When this method is used the {@link #explain(int)} method should not - * be used. - *

- * - * @return true iff there is another document matching the query. - * @see BooleanQuery#setAllowDocsOutOfOrder - */ - public abstract boolean next() throws IOException; - - /** Returns the current document number matching the query. - * Initially invalid, until {@link #next()} is called the first time. - */ - public abstract int doc(); - /** Returns the score of the current document matching the query. * Initially invalid, until {@link #next()} or {@link #skipTo(int)} * is called the first time. */ public abstract float score() throws IOException; - /** - * Skips to the document matching this Scorer with the lowest doc Id - * greater than or equal to a given target. - * - *

- * The behavior of this method is undefined if the target specified is - * less than or equal to the current value of {@link #doc()}. - *

- * Behaves as if written: - *

-   *   boolean skipTo(int target) {
-   *     do {
-   *       if (!next())
-   * 	     return false;
-   *     } while (target > doc());
-   *     return true;
-   *   }
-   * 
- * Most implementations are considerably more efficient than that. - *

- * - *

- * When this method is used the {@link #explain(int)} method should not - * be used. - *

- * - * @param target The target document number. - * @return true iff there is such a match. - * @see BooleanQuery#setAllowDocsOutOfOrder - */ - public abstract boolean skipTo(int target) throws IOException; - /** Returns an explanation of the score for a document. *
When this method is used, the {@link #next()}, {@link #skipTo(int)} and * {@link #score(HitCollector)} methods should not be used. diff --git a/src/java/org/apache/lucene/search/Searchable.java b/src/java/org/apache/lucene/search/Searchable.java index 307e23c53ca..30d1858b59c 100644 --- a/src/java/org/apache/lucene/search/Searchable.java +++ b/src/java/org/apache/lucene/search/Searchable.java @@ -48,7 +48,7 @@ public interface Searchable extends java.rmi.Remote { * non-high-scoring hits. * * @param weight to match documents - * @param filter if non-null, a bitset used to eliminate some documents + * @param filter if non-null, used to permit documents to be collected. * @param results to receive hits * @throws BooleanQuery.TooManyClauses */ diff --git a/src/java/org/apache/lucene/search/Searcher.java b/src/java/org/apache/lucene/search/Searcher.java index cfb4150c981..77e4a9a2dbf 100644 --- a/src/java/org/apache/lucene/search/Searcher.java +++ b/src/java/org/apache/lucene/search/Searcher.java @@ -109,7 +109,7 @@ public abstract class Searcher implements Searchable { * non-high-scoring hits. * * @param query to match documents - * @param filter if non-null, a bitset used to eliminate some documents + * @param filter if non-null, used to permit documents to be collected. * @param results to receive hits * @throws BooleanQuery.TooManyClauses */ diff --git a/src/java/org/apache/lucene/search/SpanFilter.java b/src/java/org/apache/lucene/search/SpanFilter.java index b79d551444e..e2a5946c166 100644 --- a/src/java/org/apache/lucene/search/SpanFilter.java +++ b/src/java/org/apache/lucene/search/SpanFilter.java @@ -30,7 +30,7 @@ import java.io.IOException; public abstract class SpanFilter extends Filter{ /** Returns a SpanFilterResult with true for documents which should be permitted in search results, and false for those that should not and Spans for where the true docs match. 
- * @param reader The {@link org.apache.lucene.index.IndexReader} to load position and bitset information from + * @param reader The {@link org.apache.lucene.index.IndexReader} to load position and DocIdSet information from * @return A {@link SpanFilterResult} * @throws java.io.IOException if there was an issue accessing the necessary information * */ diff --git a/src/java/org/apache/lucene/search/SpanFilterResult.java b/src/java/org/apache/lucene/search/SpanFilterResult.java index 0dfc794927b..5052e93a55d 100644 --- a/src/java/org/apache/lucene/search/SpanFilterResult.java +++ b/src/java/org/apache/lucene/search/SpanFilterResult.java @@ -28,19 +28,33 @@ import java.util.List; * **/ public class SpanFilterResult { + /** @deprecated */ private BitSet bits; + + private DocIdSet docIdSet; private List positions;//Spans spans; /** * * @param bits The bits for the Filter * @param positions A List of {@link org.apache.lucene.search.SpanFilterResult.PositionInfo} objects + * @deprecated Use {@link #SpanFilterResult(DocIdSet, List)} instead */ public SpanFilterResult(BitSet bits, List positions) { this.bits = bits; this.positions = positions; } - + + /** + * + * @param docIdSet The DocIdSet for the Filter + * @param positions A List of {@link org.apache.lucene.search.SpanFilterResult.PositionInfo} objects + */ + public SpanFilterResult(DocIdSet docIdSet, List positions) { + this.docIdSet = docIdSet; + this.positions = positions; + } + /** * The first entry in the array corresponds to the first "on" bit. 
* Entries are increasing by document order @@ -50,11 +64,17 @@ public class SpanFilterResult { return positions; } + /** + * @deprecated Use {@link #getDocIdSet()} + */ public BitSet getBits() { return bits; } - + /** Returns the docIdSet */ + public DocIdSet getDocIdSet() { + return docIdSet; + } public static class PositionInfo { private int doc; @@ -115,3 +135,4 @@ public class SpanFilterResult { } + diff --git a/src/java/org/apache/lucene/search/SpanQueryFilter.java b/src/java/org/apache/lucene/search/SpanQueryFilter.java index 17b44e52480..317c130d264 100644 --- a/src/java/org/apache/lucene/search/SpanQueryFilter.java +++ b/src/java/org/apache/lucene/search/SpanQueryFilter.java @@ -19,6 +19,7 @@ package org.apache.lucene.search; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.util.OpenBitSet; import java.io.IOException; import java.util.ArrayList; @@ -54,15 +55,14 @@ public class SpanQueryFilter extends SpanFilter { this.query = query; } - public BitSet bits(IndexReader reader) throws IOException { + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { SpanFilterResult result = bitSpans(reader); - return result.getBits(); + return result.getDocIdSet(); } - public SpanFilterResult bitSpans(IndexReader reader) throws IOException { - final BitSet bits = new BitSet(reader.maxDoc()); + final OpenBitSet bits = new OpenBitSet(reader.maxDoc()); Spans spans = query.getSpans(reader); List tmp = new ArrayList(20); int currentDoc = -1; diff --git a/src/java/org/apache/lucene/util/BitUtil.java b/src/java/org/apache/lucene/util/BitUtil.java new file mode 100644 index 00000000000..a9b283f1964 --- /dev/null +++ b/src/java/org/apache/lucene/util/BitUtil.java @@ -0,0 +1,799 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.util; // from org.apache.solr.util rev 555343 + +/** A variety of highly efficient bit twiddling routines. + * + * @version $Id$ + */ +public class BitUtil { + + /** Returns the number of bits set in the long */ + public static int pop(long x) { + /* Hacker's Delight 32 bit pop function: + * http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.cc + * + int pop(unsigned x) { + x = x - ((x >> 1) & 0x55555555); + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (x + (x >> 4)) & 0x0F0F0F0F; + x = x + (x >> 8); + x = x + (x >> 16); + return x & 0x0000003F; + } + ***/ + + // 64 bit java version of the C function from above + x = x - ((x >>> 1) & 0x5555555555555555L); + x = (x & 0x3333333333333333L) + ((x >>>2 ) & 0x3333333333333333L); + x = (x + (x >>> 4)) & 0x0F0F0F0F0F0F0F0FL; + x = x + (x >>> 8); + x = x + (x >>> 16); + x = x + (x >>> 32); + return ((int)x) & 0x7F; + } + + /*** Returns the number of set bits in an array of longs. 
*/ + public static long pop_array(long A[], int wordOffset, int numWords) { + /* + * Robert Harley and David Seal's bit counting algorithm, as documented + * in the revisions of Hacker's Delight + * http://www.hackersdelight.org/revisions.pdf + * http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.cc + * + * This function was adapted to Java, and extended to use 64 bit words. + * if only we had access to wider registers like SSE from java... + * + * This function can be transformed to compute the popcount of other functions + * on bitsets via something like this: + * sed 's/A\[\([^]]*\)\]/\(A[\1] \& B[\1]\)/g' + * + */ + int n = wordOffset+numWords; + long tot=0, tot8=0; + long ones=0, twos=0, fours=0; + + int i; + for (i = wordOffset; i <= n - 8; i+=8) { + /*** C macro from Hacker's Delight + #define CSA(h,l, a,b,c) \ + {unsigned u = a ^ b; unsigned v = c; \ + h = (a & b) | (u & v); l = u ^ v;} + ***/ + + long twosA,twosB,foursA,foursB,eights; + + // CSA(twosA, ones, ones, A[i], A[i+1]) + { + long b=A[i], c=A[i+1]; + long u=ones ^ b; + twosA=(ones & b)|( u & c); + ones=u^c; + } + // CSA(twosB, ones, ones, A[i+2], A[i+3]) + { + long b=A[i+2], c=A[i+3]; + long u=ones^b; + twosB =(ones&b)|(u&c); + ones=u^c; + } + //CSA(foursA, twos, twos, twosA, twosB) + { + long u=twos^twosA; + foursA=(twos&twosA)|(u&twosB); + twos=u^twosB; + } + //CSA(twosA, ones, ones, A[i+4], A[i+5]) + { + long b=A[i+4], c=A[i+5]; + long u=ones^b; + twosA=(ones&b)|(u&c); + ones=u^c; + } + // CSA(twosB, ones, ones, A[i+6], A[i+7]) + { + long b=A[i+6], c=A[i+7]; + long u=ones^b; + twosB=(ones&b)|(u&c); + ones=u^c; + } + //CSA(foursB, twos, twos, twosA, twosB) + { + long u=twos^twosA; + foursB=(twos&twosA)|(u&twosB); + twos=u^twosB; + } + + //CSA(eights, fours, fours, foursA, foursB) + { + long u=fours^foursA; + eights=(fours&foursA)|(u&foursB); + fours=u^foursB; + } + tot8 += pop(eights); + } + + // handle trailing words in a binary-search manner... 
+ // derived from the loop above by setting specific elements to 0. + // the original method in Hackers Delight used a simple for loop: + // for (i = i; i < n; i++) // Add in the last elements + // tot = tot + pop(A[i]); + + if (i<=n-4) { + long twosA, twosB, foursA, eights; + { + long b=A[i], c=A[i+1]; + long u=ones ^ b; + twosA=(ones & b)|( u & c); + ones=u^c; + } + { + long b=A[i+2], c=A[i+3]; + long u=ones^b; + twosB =(ones&b)|(u&c); + ones=u^c; + } + { + long u=twos^twosA; + foursA=(twos&twosA)|(u&twosB); + twos=u^twosB; + } + eights=fours&foursA; + fours=fours^foursA; + + tot8 += pop(eights); + i+=4; + } + + if (i<=n-2) { + long b=A[i], c=A[i+1]; + long u=ones ^ b; + long twosA=(ones & b)|( u & c); + ones=u^c; + + long foursA=twos&twosA; + twos=twos^twosA; + + long eights=fours&foursA; + fours=fours^foursA; + + tot8 += pop(eights); + i+=2; + } + + if (i>= 1 + return i + print ','.join([ str(ntz(i)) for i in range(256) ]) + ***/ + /** table of number of trailing zeros in a byte */ + public static final byte[] ntzTable = {8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0}; + + + /** Returns number of trailing zeros in the 64 bit long value. */ + public static int ntz(long val) { + // A full binary search to determine the low byte was slower than + // a linear search for nextSetBit(). This is most likely because + // the implementation of nextSetBit() shifts bits to the right, increasing + // the probability that the first non-zero byte is in the rhs. 
+ // + // This implementation does a single binary search at the top level only + // so that all other bit shifting can be done on ints instead of longs to + // remain friendly to 32 bit architectures. In addition, the case of a + // non-zero first byte is checked for first because it is the most common + // in dense bit arrays. + + int lower = (int)val; + int lowByte = lower & 0xff; + if (lowByte != 0) return ntzTable[lowByte]; + + if (lower!=0) { + lowByte = (lower>>>8) & 0xff; + if (lowByte != 0) return ntzTable[lowByte] + 8; + lowByte = (lower>>>16) & 0xff; + if (lowByte != 0) return ntzTable[lowByte] + 16; + // no need to mask off low byte for the last byte in the 32 bit word + // no need to check for zero on the last byte either. + return ntzTable[lower>>>24] + 24; + } else { + // grab upper 32 bits + int upper=(int)(val>>32); + lowByte = upper & 0xff; + if (lowByte != 0) return ntzTable[lowByte] + 32; + lowByte = (upper>>>8) & 0xff; + if (lowByte != 0) return ntzTable[lowByte] + 40; + lowByte = (upper>>>16) & 0xff; + if (lowByte != 0) return ntzTable[lowByte] + 48; + // no need to mask off low byte for the last byte in the 32 bit word + // no need to check for zero on the last byte either. + return ntzTable[upper>>>24] + 56; + } + } + + /** returns 0 based index of first set bit + * (only works for x!=0) + *
This is an alternate implementation of ntz() + */ + public static int ntz2(long x) { + int n = 0; + int y = (int)x; + if (y==0) {n+=32; y = (int)(x>>>32); } // the only 64 bit shift necessary + if ((y & 0x0000FFFF) == 0) { n+=16; y>>>=16; } + if ((y & 0x000000FF) == 0) { n+=8; y>>>=8; } + return (ntzTable[ y & 0xff ]) + n; + } + + /** returns 0 based index of first set bit + *
This is an alternate implementation of ntz() + */ + public static int ntz3(long x) { + // another implementation taken from Hackers Delight, extended to 64 bits + // and converted to Java. + // Many 32 bit ntz algorithms are at http://www.hackersdelight.org/HDcode/ntz.cc + int n = 1; + + // do the first step as a long, all others as ints. + int y = (int)x; + if (y==0) {n+=32; y = (int)(x>>>32); } + if ((y & 0x0000FFFF) == 0) { n+=16; y>>>=16; } + if ((y & 0x000000FF) == 0) { n+=8; y>>>=8; } + if ((y & 0x0000000F) == 0) { n+=4; y>>>=4; } + if ((y & 0x00000003) == 0) { n+=2; y>>>=2; } + return n - (y & 1); + } + + + /** returns true if v is a power of two or zero*/ + public static boolean isPowerOfTwo(int v) { + return ((v & (v-1)) == 0); + } + + /** returns true if v is a power of two or zero*/ + public static boolean isPowerOfTwo(long v) { + return ((v & (v-1)) == 0); + } + + /** returns the next highest power of two, or the current value if it's already a power of two or zero*/ + public static int nextHighestPowerOfTwo(int v) { + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; + } + + /** returns the next highest power of two, or the current value if it's already a power of two or zero*/ + public static long nextHighestPowerOfTwo(long v) { + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + v++; + return v; + } + +} diff --git a/src/java/org/apache/lucene/util/DocIdBitSet.java b/src/java/org/apache/lucene/util/DocIdBitSet.java new file mode 100644 index 00000000000..0bea485fb75 --- /dev/null +++ b/src/java/org/apache/lucene/util/DocIdBitSet.java @@ -0,0 +1,77 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.BitSet; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; + + +/** Simple DocIdSet and DocIdSetIterator backed by a BitSet */ +public class DocIdBitSet extends DocIdSet { + private BitSet bitSet; + + public DocIdBitSet(BitSet bitSet) { + this.bitSet = bitSet; + } + + public DocIdSetIterator iterator() { + return new DocIdBitSetIterator(bitSet); + } + + /** + * Returns the underlying BitSet. 
+ */ + public BitSet getBitSet() { + return this.bitSet; + } + + private static class DocIdBitSetIterator extends DocIdSetIterator { + private int docId; + private BitSet bitSet; + + DocIdBitSetIterator(BitSet bitSet) { + this.bitSet = bitSet; + this.docId = -1; + } + + public int doc() { + assert docId != -1; + return docId; + } + + public boolean next() { + // (docId + 1) on next line requires -1 initial value for docNr: + return checkNextDocId(bitSet.nextSetBit(docId + 1)); + } + + public boolean skipTo(int skipDocNr) { + return checkNextDocId( bitSet.nextSetBit(skipDocNr)); + } + + private boolean checkNextDocId(int d) { + if (d == -1) { // -1 returned by BitSet.nextSetBit() when exhausted + docId = Integer.MAX_VALUE; + return false; + } else { + docId = d; + return true; + } + } + } +} diff --git a/src/java/org/apache/lucene/util/OpenBitSet.java b/src/java/org/apache/lucene/util/OpenBitSet.java new file mode 100644 index 00000000000..97e144aa99d --- /dev/null +++ b/src/java/org/apache/lucene/util/OpenBitSet.java @@ -0,0 +1,773 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.util; + +import java.util.Arrays; +import java.io.Serializable; + +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; + +/** An "open" BitSet implementation that allows direct access to the array of words + * storing the bits. + *

+ * Unlike java.util.BitSet, the fact that bits are packed into an array of longs + * is part of the interface. This allows efficient implementation of other algorithms + * by someone other than the author. It also allows one to efficiently implement + * alternate serialization or interchange formats. + *

+ * OpenBitSet is faster than java.util.BitSet in most operations + * and *much* faster at calculating cardinality of sets and results of set operations. + * It can also handle sets of larger cardinality (up to 64 * 2**32-1) + *

+ * The goals of OpenBitSet are the fastest implementation possible, and + * maximum code reuse. Extra safety and encapsulation + * may always be built on top, but if that's built in, the cost can never be removed (and + * hence people re-implement their own version in order to get better performance). + * If you want a "safe", totally encapsulated (and slower and limited) BitSet + * class, use java.util.BitSet. + *

+ *

<h3>Performance Results</h3>

+ * + Test system: Pentium 4, Sun Java 1.5_06 -server -Xbatch -Xmx64M +
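<br/>BitSet size = 1,000,000
+<br/>Results are java.util.BitSet time divided by OpenBitSet time.
+<table border="1">
+ <tr>
+  <th></th> <th>cardinality</th> <th>intersect_count</th> <th>union</th> <th>nextSetBit</th> <th>get</th> <th>iterator</th>
+ </tr>
+ <tr>
+  <th>50% full</th> <td>3.36</td> <td>3.96</td> <td>1.44</td> <td>1.46</td> <td>1.99</td> <td>1.58</td>
+ </tr>
+ <tr>
+  <th>1% full</th> <td>3.31</td> <td>3.90</td> <td>&nbsp;</td> <td>1.04</td> <td>&nbsp;</td> <td>0.99</td>
+ </tr>
+</table>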
+
+Test system: AMD Opteron, 64 bit linux, Sun Java 1.5_06 -server -Xbatch -Xmx64M +
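<br/>BitSet size = 1,000,000
+<br/>Results are java.util.BitSet time divided by OpenBitSet time.
+<table border="1">
+ <tr>
+  <th></th> <th>cardinality</th> <th>intersect_count</th> <th>union</th> <th>nextSetBit</th> <th>get</th> <th>iterator</th>
+ </tr>
+ <tr>
+  <th>50% full</th> <td>2.50</td> <td>3.50</td> <td>1.00</td> <td>1.03</td> <td>1.12</td> <td>1.25</td>
+ </tr>
+ <tr>
+  <th>1% full</th> <td>2.51</td> <td>3.49</td> <td>&nbsp;</td> <td>1.00</td> <td>&nbsp;</td> <td>1.02</td>
+ </tr>
+</table>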
+ + * @version $Id$ + */ + +public class OpenBitSet extends DocIdSet implements Cloneable, Serializable { + protected long[] bits; + protected int wlen; // number of words (elements) used in the array + + /** Constructs an OpenBitSet large enough to hold numBits. + * + * @param numBits + */ + public OpenBitSet(long numBits) { + bits = new long[bits2words(numBits)]; + wlen = bits.length; + } + + public OpenBitSet() { + this(64); + } + + /** Constructs an OpenBitSet from an existing long[]. + *
+ * The first 64 bits are in long[0], + * with bit index 0 at the least significant bit, and bit index 63 at the most significant. + * Given a bit index, + * the word containing it is long[index/64], and it is at bit number index%64 within that word. + *

+ * numWords are the number of elements in the array that contain + * set bits (non-zero longs). + * numWords should be <= bits.length, and + * any existing words in the array at position >= numWords should be zero. + * + */ + public OpenBitSet(long[] bits, int numWords) { + this.bits = bits; + this.wlen = numWords; + } + + public DocIdSetIterator iterator() { + return new OpenBitSetIterator(bits, wlen); + } + + /** Returns the current capacity in bits (1 greater than the index of the last bit) */ + public long capacity() { return bits.length << 6; } + + /** + * Returns the current capacity of this set. Included for + * compatibility. This is *not* equal to {@link #cardinality} + */ + public long size() { + return capacity(); + } + + /** Returns true if there are no set bits */ + public boolean isEmpty() { return cardinality()==0; } + + /** Expert: returns the long[] storing the bits */ + public long[] getBits() { return bits; } + + /** Expert: sets a new long[] to use as the bit storage */ + public void setBits(long[] bits) { this.bits = bits; } + + /** Expert: gets the number of longs in the array that are in use */ + public int getNumWords() { return wlen; } + + /** Expert: sets the number of longs in the array that are in use */ + public void setNumWords(int nWords) { this.wlen=nWords; } + + + + /** Returns true or false for the specified bit index. */ + public boolean get(int index) { + int i = index >> 6; // div 64 + // signed shift will keep a negative index and force an + // array-index-out-of-bounds-exception, removing the need for an explicit check. + if (i>=bits.length) return false; + + int bit = index & 0x3f; // mod 64 + long bitmask = 1L << bit; + return (bits[i] & bitmask) != 0; + } + + + /** Returns true or false for the specified bit index. 
+ * The index should be less than the OpenBitSet size + */ + public boolean fastGet(int index) { + int i = index >> 6; // div 64 + // signed shift will keep a negative index and force an + // array-index-out-of-bounds-exception, removing the need for an explicit check. + int bit = index & 0x3f; // mod 64 + long bitmask = 1L << bit; + return (bits[i] & bitmask) != 0; + } + + + + /** Returns true or false for the specified bit index + * The index should be less than the OpenBitSet size + */ + public boolean get(long index) { + int i = (int)(index >> 6); // div 64 + if (i>=bits.length) return false; + int bit = (int)index & 0x3f; // mod 64 + long bitmask = 1L << bit; + return (bits[i] & bitmask) != 0; + } + + /** Returns true or false for the specified bit index. Allows specifying + * an index outside the current size. */ + public boolean fastGet(long index) { + int i = (int)(index >> 6); // div 64 + int bit = (int)index & 0x3f; // mod 64 + long bitmask = 1L << bit; + return (bits[i] & bitmask) != 0; + } + + /* + // alternate implementation of get() + public boolean get1(int index) { + int i = index >> 6; // div 64 + int bit = index & 0x3f; // mod 64 + return ((bits[i]>>>bit) & 0x01) != 0; + // this does a long shift and a bittest (on x86) vs + // a long shift, and a long AND, (the test for zero is prob a no-op) + // testing on a P4 indicates this is slower than (bits[i] & bitmask) != 0; + } + */ + + + /** returns 1 if the bit is set, 0 if not. + * The index should be less than the OpenBitSet size + */ + public int getBit(int index) { + int i = index >> 6; // div 64 + int bit = index & 0x3f; // mod 64 + return ((int)(bits[i]>>>bit)) & 0x01; + } + + + /* + public boolean get2(int index) { + int word = index >> 6; // div 64 + int bit = index & 0x0000003f; // mod 64 + return (bits[word] << bit) < 0; // hmmm, this would work if bit order were reversed + // we could right shift and check for parity bit, if it was available to us. 
+ } + */ + + /** sets a bit, expanding the set size if necessary */ + public void set(long index) { + int wordNum = expandingWordNum(index); + int bit = (int)index & 0x3f; + long bitmask = 1L << bit; + bits[wordNum] |= bitmask; + } + + + /** Sets the bit at the specified index. + * The index should be less than the OpenBitSet size. + */ + public void fastSet(int index) { + int wordNum = index >> 6; // div 64 + int bit = index & 0x3f; // mod 64 + long bitmask = 1L << bit; + bits[wordNum] |= bitmask; + } + + /** Sets the bit at the specified index. + * The index should be less than the OpenBitSet size. + */ + public void fastSet(long index) { + int wordNum = (int)(index >> 6); + int bit = (int)index & 0x3f; + long bitmask = 1L << bit; + bits[wordNum] |= bitmask; + } + + /** Sets a range of bits, expanding the set size if necessary + * + * @param startIndex lower index + * @param endIndex one-past the last bit to set + */ + public void set(long startIndex, long endIndex) { + if (endIndex <= startIndex) return; + + int startWord = (int)(startIndex>>6); + + // since endIndex is one past the end, this is index of the last + // word to be changed. + int endWord = expandingWordNum(endIndex-1); + + long startmask = -1L << startIndex; + long endmask = -1L >>> -endIndex; // 64-(endIndex&0x3f) is the same as -endIndex due to wrap + + if (startWord == endWord) { + bits[startWord] |= (startmask & endmask); + return; + } + + bits[startWord] |= startmask; + Arrays.fill(bits, startWord+1, endWord, -1L); + bits[endWord] |= endmask; + } + + + + protected int expandingWordNum(long index) { + int wordNum = (int)(index >> 6); + if (wordNum>=wlen) { + ensureCapacity(index+1); + wlen = wordNum+1; + } + return wordNum; + } + + + /** clears a bit. + * The index should be less than the OpenBitSet size. 
+ */ + public void fastClear(int index) { + int wordNum = index >> 6; + int bit = index & 0x03f; + long bitmask = 1L << bit; + bits[wordNum] &= ~bitmask; + // hmmm, it takes one more instruction to clear than it does to set... any + // way to work around this? If there were only 63 bits per word, we could + // use a right shift of 10111111...111 in binary to position the 0 in the + // correct place (using sign extension). + // Could also use Long.rotateRight() or rotateLeft() *if* they were converted + // by the JVM into a native instruction. + // bits[word] &= Long.rotateLeft(0xfffffffe,bit); + } + + /** clears a bit. + * The index should be less than the OpenBitSet size. + */ + public void fastClear(long index) { + int wordNum = (int)(index >> 6); // div 64 + int bit = (int)index & 0x3f; // mod 64 + long bitmask = 1L << bit; + bits[wordNum] &= ~bitmask; + } + + /** clears a bit, allowing access beyond the current set size without changing the size.*/ + public void clear(long index) { + int wordNum = (int)(index >> 6); // div 64 + if (wordNum>=wlen) return; + int bit = (int)index & 0x3f; // mod 64 + long bitmask = 1L << bit; + bits[wordNum] &= ~bitmask; + } + + /** Clears a range of bits. Clearing past the end does not change the size of the set. + * + * @param startIndex lower index + * @param endIndex one-past the last bit to clear + */ + public void clear(long startIndex, long endIndex) { + if (endIndex <= startIndex) return; + + int startWord = (int)(startIndex>>6); + if (startWord >= wlen) return; + + // since endIndex is one past the end, this is index of the last + // word to be changed. 
+ int endWord = (int)((endIndex-1)>>6); + + long startmask = -1L << startIndex; + long endmask = -1L >>> -endIndex; // 64-(endIndex&0x3f) is the same as -endIndex due to wrap + + // invert masks since we are clearing + startmask = ~startmask; + endmask = ~endmask; + + if (startWord == endWord) { + bits[startWord] &= (startmask | endmask); + return; + } + + bits[startWord] &= startmask; + + int middle = Math.min(wlen, endWord); + Arrays.fill(bits, startWord+1, middle, 0L); + if (endWord < wlen) { + bits[endWord] &= endmask; + } + } + + + + /** Sets a bit and returns the previous value. + * The index should be less than the OpenBitSet size. + */ + public boolean getAndSet(int index) { + int wordNum = index >> 6; // div 64 + int bit = index & 0x3f; // mod 64 + long bitmask = 1L << bit; + boolean val = (bits[wordNum] & bitmask) != 0; + bits[wordNum] |= bitmask; + return val; + } + + /** Sets a bit and returns the previous value. + * The index should be less than the OpenBitSet size. + */ + public boolean getAndSet(long index) { + int wordNum = (int)(index >> 6); // div 64 + int bit = (int)index & 0x3f; // mod 64 + long bitmask = 1L << bit; + boolean val = (bits[wordNum] & bitmask) != 0; + bits[wordNum] |= bitmask; + return val; + } + + /** flips a bit. + * The index should be less than the OpenBitSet size. + */ + public void fastFlip(int index) { + int wordNum = index >> 6; // div 64 + int bit = index & 0x3f; // mod 64 + long bitmask = 1L << bit; + bits[wordNum] ^= bitmask; + } + + /** flips a bit. + * The index should be less than the OpenBitSet size. 
+ */ + public void fastFlip(long index) { + int wordNum = (int)(index >> 6); // div 64 + int bit = (int)index & 0x3f; // mod 64 + long bitmask = 1L << bit; + bits[wordNum] ^= bitmask; + } + + /** flips a bit, expanding the set size if necessary */ + public void flip(long index) { + int wordNum = expandingWordNum(index); + int bit = (int)index & 0x3f; // mod 64 + long bitmask = 1L << bit; + bits[wordNum] ^= bitmask; + } + + /** flips a bit and returns the resulting bit value. + * The index should be less than the OpenBitSet size. + */ + public boolean flipAndGet(int index) { + int wordNum = index >> 6; // div 64 + int bit = index & 0x3f; // mod 64 + long bitmask = 1L << bit; + bits[wordNum] ^= bitmask; + return (bits[wordNum] & bitmask) != 0; + } + + /** flips a bit and returns the resulting bit value. + * The index should be less than the OpenBitSet size. + */ + public boolean flipAndGet(long index) { + int wordNum = (int)(index >> 6); // div 64 + int bit = (int)index & 0x3f; // mod 64 + long bitmask = 1L << bit; + bits[wordNum] ^= bitmask; + return (bits[wordNum] & bitmask) != 0; + } + + /** Flips a range of bits, expanding the set size if necessary + * + * @param startIndex lower index + * @param endIndex one-past the last bit to flip + */ + public void flip(long startIndex, long endIndex) { + if (endIndex <= startIndex) return; + int oldlen = wlen; + int startWord = (int)(startIndex>>6); + + // since endIndex is one past the end, this is index of the last + // word to be changed. + int endWord = expandingWordNum(endIndex-1); + + /*** Grrr, java shifting wraps around so -1L>>>64 == -1 + * for that reason, make sure not to use endmask if the bits to flip will + * be zero in the last word (redefine endWord to be the last changed...) 
+ long startmask = -1L << (startIndex & 0x3f); // example: 11111...111000 + long endmask = -1L >>> (64-(endIndex & 0x3f)); // example: 00111...111111 + ***/ + + long startmask = -1L << startIndex; + long endmask = -1L >>> -endIndex; // 64-(endIndex&0x3f) is the same as -endIndex due to wrap + + if (startWord == endWord) { + bits[startWord] ^= (startmask & endmask); + return; + } + + bits[startWord] ^= startmask; + + for (int i=startWord+1; i b.wlen) { + tot += BitUtil.pop_array(a.bits, b.wlen, a.wlen-b.wlen); + } + return tot; + } + + /** Returns the popcount or cardinality of "a and not b" + * or "intersection(a, not(b))". + * Neither set is modified. + */ + public static long andNotCount(OpenBitSet a, OpenBitSet b) { + long tot = BitUtil.pop_andnot(a.bits, b.bits, 0, Math.min(a.wlen, b.wlen)); + if (a.wlen > b.wlen) { + tot += BitUtil.pop_array(a.bits, b.wlen, a.wlen-b.wlen); + } + return tot; + } + + /** Returns the popcount or cardinality of the exclusive-or of the two sets. + * Neither set is modified. + */ + public static long xorCount(OpenBitSet a, OpenBitSet b) { + long tot = BitUtil.pop_xor(a.bits, b.bits, 0, Math.min(a.wlen, b.wlen)); + if (a.wlen < b.wlen) { + tot += BitUtil.pop_array(b.bits, a.wlen, b.wlen-a.wlen); + } else if (a.wlen > b.wlen) { + tot += BitUtil.pop_array(a.bits, b.wlen, a.wlen-b.wlen); + } + return tot; + } + + + /** Returns the index of the first set bit starting at the index specified. + * -1 is returned if there are no more set bits. + */ + public int nextSetBit(int index) { + int i = index>>6; + if (i>=wlen) return -1; + int subIndex = index & 0x3f; // index within the word + long word = bits[i] >> subIndex; // skip all the bits to the right of index + + if (word!=0) { + return (i<<6) + subIndex + BitUtil.ntz(word); + } + + while(++i < wlen) { + word = bits[i]; + if (word!=0) return (i<<6) + BitUtil.ntz(word); + } + + return -1; + } + + /** Returns the index of the first set bit starting at the index specified. 
+ * -1 is returned if there are no more set bits. + */ + public long nextSetBit(long index) { + int i = (int)(index>>>6); + if (i>=wlen) return -1; + int subIndex = (int)index & 0x3f; // index within the word + long word = bits[i] >>> subIndex; // skip all the bits to the right of index + + if (word!=0) { + return (((long)i)<<6) + (subIndex + BitUtil.ntz(word)); + } + + while(++i < wlen) { + word = bits[i]; + if (word!=0) return (((long)i)<<6) + BitUtil.ntz(word); + } + + return -1; + } + + + + + public Object clone() { + try { + OpenBitSet obs = (OpenBitSet)super.clone(); + obs.bits = (long[]) obs.bits.clone(); // hopefully an array clone is as fast(er) than arraycopy + return obs; + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + + /** this = this AND other */ + public void intersect(OpenBitSet other) { + int newLen= Math.min(this.wlen,other.wlen); + long[] thisArr = this.bits; + long[] otherArr = other.bits; + // testing against zero can be more efficient + int pos=newLen; + while(--pos>=0) { + thisArr[pos] &= otherArr[pos]; + } + if (this.wlen > newLen) { + // fill zeros from the new shorter length to the old length + Arrays.fill(bits,newLen,this.wlen,0); + } + this.wlen = newLen; + } + + /** this = this OR other */ + public void union(OpenBitSet other) { + int newLen = Math.max(wlen,other.wlen); + ensureCapacityWords(newLen); + + long[] thisArr = this.bits; + long[] otherArr = other.bits; + int pos=Math.min(wlen,other.wlen); + while(--pos>=0) { + thisArr[pos] |= otherArr[pos]; + } + if (this.wlen < newLen) { + System.arraycopy(otherArr, this.wlen, thisArr, this.wlen, newLen-this.wlen); + } + this.wlen = newLen; + } + + + /** Remove all elements set in other. 
this = this AND_NOT other */ + public void remove(OpenBitSet other) { + int idx = Math.min(wlen,other.wlen); + long[] thisArr = this.bits; + long[] otherArr = other.bits; + while(--idx>=0) { + thisArr[idx] &= ~otherArr[idx]; + } + } + + /** this = this XOR other */ + public void xor(OpenBitSet other) { + int newLen = Math.max(wlen,other.wlen); + ensureCapacityWords(newLen); + + long[] thisArr = this.bits; + long[] otherArr = other.bits; + int pos=Math.min(wlen,other.wlen); + while(--pos>=0) { + thisArr[pos] ^= otherArr[pos]; + } + if (this.wlen < newLen) { + System.arraycopy(otherArr, this.wlen, thisArr, this.wlen, newLen-this.wlen); + } + this.wlen = newLen; + } + + + // some BitSet compatability methods + + //** see {@link intersect} */ + public void and(OpenBitSet other) { + intersect(other); + } + + //** see {@link union} */ + public void or(OpenBitSet other) { + union(other); + } + + //** see {@link andNot} */ + public void andNot(OpenBitSet other) { + remove(other); + } + + /** returns true if the sets have any elements in common */ + public boolean intersects(OpenBitSet other) { + int pos = Math.min(this.wlen, other.wlen); + long[] thisArr = this.bits; + long[] otherArr = other.bits; + while (--pos>=0) { + if ((thisArr[pos] & otherArr[pos])!=0) return true; + } + return false; + } + + + + /** Expand the long[] with the size given as a number of words (64 bit longs). + * getNumWords() is unchanged by this call. + */ + public void ensureCapacityWords(int numWords) { + if (bits.length < numWords) { + long[] newBits = new long[numWords]; + System.arraycopy(bits,0,newBits,0,wlen); + bits = newBits; + } + } + + /** Ensure that the long[] is big enough to hold numBits, expanding it if necessary. + * getNumWords() is unchanged by this call. + */ + public void ensureCapacity(long numBits) { + ensureCapacityWords(bits2words(numBits)); + } + + /** Lowers numWords, the number of words in use, + * by checking for trailing zero words. 
+ */ + public void trimTrailingZeros() { + int idx = wlen-1; + while (idx>=0 && bits[idx]==0) idx--; + wlen = idx+1; + } + + /** returns the number of 64 bit words it would take to hold numBits */ + public static int bits2words(long numBits) { + return (int)(((numBits-1)>>>6)+1); + } + + + /** returns true if both sets have the same bits set */ + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof OpenBitSet)) return false; + OpenBitSet a; + OpenBitSet b = (OpenBitSet)o; + // make a the larger set. + if (b.wlen > this.wlen) { + a = b; b=this; + } else { + a=this; + } + + // check for any set bits out of the range of b + for (int i=a.wlen-1; i>=b.wlen; i--) { + if (a.bits[i]!=0) return false; + } + + for (int i=b.wlen-1; i>=0; i--) { + if (a.bits[i] != b.bits[i]) return false; + } + + return true; + } + + + public int hashCode() { + long h = 0x98761234; // something non-zero for length==0 + for (int i = bits.length; --i>=0;) { + h ^= bits[i]; + h = (h << 1) | (h >>> 31); // rotate left + } + return (int)((h>>32) ^ h); // fold leftmost bits into right + } + +} + + diff --git a/src/java/org/apache/lucene/util/OpenBitSetIterator.java b/src/java/org/apache/lucene/util/OpenBitSetIterator.java new file mode 100644 index 00000000000..d64a0fb7ea4 --- /dev/null +++ b/src/java/org/apache/lucene/util/OpenBitSetIterator.java @@ -0,0 +1,173 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util;
+
+import java.io.IOException;
+
+import org.apache.lucene.search.DocIdSetIterator;
+
+/** An iterator to iterate over set bits in an OpenBitSet.
+ * This is faster than nextSetBit() for iterating over the complete set of bits,
+ * especially when the density of the bits set is high.
+ * <p>
+ * The iterator walks the backing long[] one word at a time and, inside a
+ * word, one byte at a time. The set bits of the current byte are looked up
+ * in {@link #bitlist} and handed out one per call.
+ *
+ * @version $Id$
+ */
+public class OpenBitSetIterator extends DocIdSetIterator {
+
+  // The General Idea: instead of having an array per byte that has
+  // the offsets of the next set bit, that array could be
+  // packed inside a 32 bit integer (8 4 bit numbers).  That
+  // should be faster than accessing an array for each index, and
+  // the total array size is kept smaller (256*sizeof(int))=1K
+  //
+  // Layout: bitlist[b] describes the byte value b. Each 4 bit nibble holds
+  // the 1-based position (1..8) of one set bit of b, with the lowest set bit
+  // in the lowest nibble. A zero nibble means "no more set bits".
+  // Example: bitlist[3] == 0x21, because bits 1 and 2 (1-based) are set.
+  protected final static int[] bitlist={
+    0x0,0x1,0x2,0x21,0x3,0x31,0x32,0x321,0x4,0x41,0x42,0x421,0x43,0x431,0x432,0x4321,0x5,0x51,0x52,0x521,0x53,0x531,0x532,0x5321,0x54,0x541,0x542,0x5421,0x543,0x5431,0x5432,0x54321,0x6,0x61,0x62,0x621,0x63,0x631,0x632,0x6321,0x64,0x641,0x642,0x6421,0x643,0x6431,0x6432,0x64321,0x65,0x651,0x652,0x6521,0x653,0x6531,0x6532,0x65321,0x654,0x6541,0x6542,0x65421,0x6543,0x65431,0x65432,0x654321,0x7,0x71,0x72,0x721,0x73,0x731,0x732,0x7321,0x74,0x741,0x742,0x7421,0x743,0x7431,0x7432,0x74321,0x75,0x751,0x752,0x7521,0x753,0x7531,0x7532,0x75321,0x754,0x7541,0x7542,0x75421,0x7543,0x75431,0x75432,0x754321,0x76,0x761,0x762,0x7621,0x763,0x7631,0x7632,0x76321,0x764,0x7641,0x7642,0x76421,0x7643,0x76431,0x76432,0x764321,0x765,0x7651,0x7652,0x76521,0x7653,0x76531,0x76532,0x765321,0x7654,0x76541,0x76542,0x765421,0x76543,0x765431,0x765432,0x7654321,0x8,0x81,0x82,0x821,0x83,0x831,0x832,0x8321,0x84,0x841,0x842,0x8421,0x843,0x8431,0x8432,0x84321,0x85,0x851,0x852,0x8521,0x853,0x8531,0x8532,0x85321,0x854,0x8541,0x8542,0x85421,0x8543,0x85431,0x85432,0x854321,0x86,0x861,0x862,0x8621,0x863,0x8631,0x8632,0x86321,0x864,0x8641,0x8642,0x86421,0x8643,0x86431,0x86432,0x864321,0x865,0x8651,0x8652,0x86521,0x8653,0x86531,0x86532,0x865321,0x8654,0x86541,0x86542,0x865421,0x86543,0x865431,0x865432,0x8654321,0x87,0x871,0x872,0x8721,0x873,0x8731,0x8732,0x87321,0x874,0x8741,0x8742,0x87421,0x8743,0x87431,0x87432,0x874321,0x875,0x8751,0x8752,0x87521,0x8753,0x87531,0x87532,0x875321,0x8754,0x87541,0x87542,0x875421,0x87543,0x875431,0x875432,0x8754321,0x876,0x8761,0x8762,0x87621,0x8763,0x87631,0x87632,0x876321,0x8764,0x87641,0x87642,0x876421,0x87643,0x876431,0x876432,0x8764321,0x8765,0x87651,0x87652,0x876521,0x87653,0x876531,0x876532,0x8765321,0x87654,0x876541,0x876542,0x8765421,0x876543,0x8765431,0x8765432,0x87654321
+  };
+  /***** the python code that generated bitlist
+  def bits2int(val):
+    arr=0
+    for shift in range(8,0,-1):
+      if val & 0x80:
+        arr = (arr << 4) | shift
+      val = val << 1
+    return arr
+
+  def int_table():
+    tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ]
+    return ','.join(tbl)
+  ******/
+
+  // hmmm, what about an iterator that finds zeros though,
+  // or a reverse iterator... should they be separate classes
+  // for efficiency, or have a common root interface?  (or
+  // maybe both?  could ask for a SetBitsIterator, etc...)
+
+
+  // The words being iterated over; not copied by the constructors.
+  private final long[] arr;
+  // Number of leading words of arr that are iterated.
+  private final int words;
+  // Index into arr of the current word; -1 before the first word is loaded.
+  private int i=-1;
+  // Not-yet-consumed bits of the current word, shifted right as bytes are consumed.
+  private long word;
+  // Value added to a 1-based nibble from bitlist to get the bit index inside
+  // the current 64 bit word; -1 for a word that has not been shifted yet.
+  private int wordShift;
+  // Packed nibbles (see bitlist) of the set bits of the current byte that
+  // have not been returned yet; 0 when the current byte is used up.
+  private int indexArray;
+  // The bit index last produced by next()/skipTo(); -1 once they return false.
+  private int curDocId;
+
+  /** Creates an iterator over the words currently in use by the given set.
+   *
+   * @param obs the bit set to iterate over
+   */
+  public OpenBitSetIterator(OpenBitSet obs) {
+    this(obs.getBits(), obs.getNumWords());
+  }
+
+  /** Creates an iterator over the first numWords words of bits.
+   *
+   * @param bits the words holding the bits; the array is used directly
+   * @param numWords the number of words of bits to iterate over
+   */
+  public OpenBitSetIterator(long[] bits, int numWords) {
+    arr = bits;
+    words = numWords;
+  }
+
+  // 64 bit shifts
+  // Skips the zero low 32/16/8 bits of word (a binary search for the lowest
+  // non-zero byte), keeps wordShift in step, and loads indexArray from that
+  // byte. Both callers in this class only call it with word != 0.
+  private void shift() {
+    if ((int)word ==0) {wordShift +=32; word = word >>>32; }
+    if ((word & 0x0000FFFF) == 0) { wordShift +=16; word >>>=16; }
+    if ((word & 0x000000FF) == 0) { wordShift +=8; word >>>=8; }
+    indexArray = bitlist[(int)word & 0xff];
+  }
+
+  /***** alternate shift implementations
+  // 32 bit shifts, but a long shift needed at the end
+  private void shift2() {
+    int y = (int)word;
+    if (y==0) {wordShift +=32; y = (int)(word >>>32); }
+    if ((y & 0x0000FFFF) == 0) { wordShift +=16; y>>>=16; }
+    if ((y & 0x000000FF) == 0) { wordShift +=8; y>>>=8; }
+    indexArray = bitlist[y & 0xff];
+    word >>>= (wordShift +1);
+  }
+
+  private void shift3() {
+    int lower = (int)word;
+    int lowByte = lower & 0xff;
+    if (lowByte != 0) {
+      indexArray=bitlist[lowByte];
+      return;
+    }
+    shift();
+  }
+  ******/
+
+  /** Moves to the next set bit.
+   *
+   * @return true if there is a next set bit, whose index is then available
+   *         from {@link #doc()}; false when the words are exhausted, in
+   *         which case doc() returns -1
+   */
+  public boolean next() {
+    if (indexArray==0) {
+      // The current byte has no unreturned bits left.
+      if (word!=0) {
+        // Drop the byte that was just consumed.
+        word >>>= 8;
+        wordShift += 8;
+      }
+
+      // Nothing left in this word: advance to the next non-zero word.
+      while (word==0) {
+        if (++i >= words) {
+          curDocId = -1;
+          return false;
+        }
+        word = arr[i];
+        wordShift =-1;  // loop invariant code motion should move this
+      }
+
+      // after the first time, should I go with a linear search, or
+      // stick with the binary search in shift?
+      shift();
+    }
+
+    // The lowest nibble is the 1-based position of the next set bit.
+    int bitIndex = (indexArray & 0x0f) + wordShift;
+    indexArray >>>= 4;
+    // should i<<6 be cached as a separate variable?
+    // it would only save one cycle in the best circumstances.
+    curDocId = (i<<6) + bitIndex;
+    return true;
+  }
+
+  /** Moves to the first set bit whose index is greater than or equal to target.
+   * The position is computed from target alone; the current position of the
+   * iterator is not taken into account.
+   *
+   * @param target the smallest bit index that may be returned
+   * @return true if such a bit exists, whose index is then available from
+   *         {@link #doc()}; false otherwise, in which case doc() returns -1
+   */
+  public boolean skipTo(int target) {
+    indexArray=0;
+    i = target >> 6;
+    if (i>=words) {
+      word =0; // setup so next() will also return false
+      curDocId = -1;
+      return false;
+    }
+    // Discard the bits of this word that lie below target.
+    wordShift = target & 0x3f;
+    word = arr[i] >>> wordShift;
+    if (word !=0) {
+      wordShift--; // compensate for the 1 based positions stored in bitlist
+    } else {
+      // Nothing at or above target in this word: find the next non-zero word.
+      while (word ==0) {
+        if (++i >= words) {
+          curDocId = -1;
+          return false;
+        }
+        word = arr[i];
+      }
+      wordShift =-1;
+    }
+
+    shift();
+
+    int bitIndex = (indexArray & 0x0f) + wordShift;
+    indexArray >>>= 4;
+    // should i<<6 be cached as a separate variable?
+    // it would only save one cycle in the best circumstances.
+    curDocId = (i<<6) + bitIndex;
+    return true;
+  }
+
+  /** Returns the bit index found by the last call to next() or skipTo(),
+   * or -1 if that call returned false.
+   */
+  public int doc() {
+    return this.curDocId;
+  }
+
+}
diff --git a/src/java/org/apache/lucene/util/SortedVIntList.java b/src/java/org/apache/lucene/util/SortedVIntList.java
new file mode 100644
index 00000000000..54dcc0ee41a
--- /dev/null
+++ b/src/java/org/apache/lucene/util/SortedVIntList.java
@@ -0,0 +1,218 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.BitSet;
+
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+
+/**
+ * Store and iterate sorted integers in compressed form in RAM.
+ * <p>
+ * Each integer is stored as the difference to its predecessor (the first one
+ * as the difference to 0), written as a variable length sequence of bytes
+ * carrying 7 bits each. The code for compressing the differences between
+ * ascending integers was borrowed from
+ * {@link org.apache.lucene.store.IndexInput} and
+ * {@link org.apache.lucene.store.IndexOutput}.
+ */
+public class SortedVIntList extends DocIdSet {
+  /** When a BitSet has fewer than 1 in BITS2VINTLIST_SIZE bits set,
+   * a SortedVIntList representing the index numbers of the set bits
+   * will be smaller than that BitSet.
+   */
+  final static int BITS2VINTLIST_SIZE = 8;
+
+  /** Mask selecting the 7 payload bits of an encoded byte. */
+  private static final int VB1 = 0x7F;
+  /** Number of payload bits carried by each encoded byte. */
+  private static final int BIT_SHIFT = 7;
+  /** Largest number of bytes a single encoded difference can take. */
+  private static final int MAX_BYTES_PER_INT = (31 / BIT_SHIFT) + 1;
+
+  /** Number of integers stored. */
+  private int size;
+  /** The encoded differences. */
+  private byte[] bytes;
+  /** Index in bytes just past the last encoded byte. */
+  private int lastBytePos;
+
+  /**
+   * Create a SortedVIntList from all elements of an array of integers.
+   *
+   * @param sortedInts A sorted array of non negative integers.
+   * @throws IllegalArgumentException if the input is not sorted or its
+   *         first element is negative.
+   */
+  public SortedVIntList(int[] sortedInts) {
+    this(sortedInts, sortedInts.length);
+  }
+
+  /**
+   * Create a SortedVIntList from an array of integers.
+   *
+   * @param sortedInts An array of sorted non negative integers.
+   * @param inputSize The number of integers to be used from the array.
+   * @throws IllegalArgumentException if the used part of the input is not
+   *         sorted or its first element is negative.
+   */
+  public SortedVIntList(int[] sortedInts, int inputSize) {
+    SortedVIntListBuilder builder = new SortedVIntListBuilder();
+    for (int i = 0; i < inputSize; i++) {
+      builder.addInt(sortedInts[i]);
+    }
+    builder.done();
+  }
+
+  /**
+   * Create a SortedVIntList from a BitSet.
+   *
+   * @param bits A bit set representing a set of integers.
+   */
+  public SortedVIntList(BitSet bits) {
+    SortedVIntListBuilder builder = new SortedVIntListBuilder();
+    int nextInt = bits.nextSetBit(0);
+    while (nextInt != -1) {
+      builder.addInt(nextInt);
+      nextInt = bits.nextSetBit(nextInt + 1);
+    }
+    builder.done();
+  }
+
+  /**
+   * Create a SortedVIntList from an OpenBitSet.
+   *
+   * @param bits A bit set representing a set of integers.
+   */
+  public SortedVIntList(OpenBitSet bits) {
+    SortedVIntListBuilder builder = new SortedVIntListBuilder();
+    int nextInt = bits.nextSetBit(0);
+    while (nextInt != -1) {
+      builder.addInt(nextInt);
+      nextInt = bits.nextSetBit(nextInt + 1);
+    }
+    builder.done();
+  }
+
+  /**
+   * Create a SortedVIntList.
+   *
+   * @param docIdSetIterator An iterator providing document numbers as a set of integers.
+   *          This DocIdSetIterator is iterated completely when this constructor
+   *          is called and it must provide the integers in non
+   *          decreasing order.
+   * @throws IOException if the given iterator throws it.
+   * @throws IllegalArgumentException if the integers are not in non
+   *         decreasing order or the first one is negative.
+   */
+  public SortedVIntList(DocIdSetIterator docIdSetIterator) throws IOException {
+    SortedVIntListBuilder builder = new SortedVIntListBuilder();
+    while (docIdSetIterator.next()) {
+      builder.addInt(docIdSetIterator.doc());
+    }
+    builder.done();
+  }
+
+
+  /** Appends integers to the enclosing list; used by the constructors only. */
+  private class SortedVIntListBuilder {
+    /** The integer added last; differences are taken against it. */
+    private int lastInt = 0;
+
+    SortedVIntListBuilder() {
+      initBytes();
+      lastInt = 0;
+    }
+
+    /** Encodes nextInt as its difference to the previously added integer. */
+    void addInt(int nextInt) {
+      int diff = nextInt - lastInt;
+      if (diff < 0) {
+        throw new IllegalArgumentException(
+            "Input not sorted or first element negative.");
+      }
+
+      if ((lastBytePos + MAX_BYTES_PER_INT) > bytes.length) {
+        // biggest possible int does not fit
+        resizeBytes((bytes.length * 2) + MAX_BYTES_PER_INT);
+      }
+
+      // See org.apache.lucene.store.IndexOutput.writeVInt()
+      while ((diff & ~VB1) != 0) { // The high bit of the next byte needs to be set.
+        bytes[lastBytePos++] = (byte) ((diff & VB1) | ~VB1);
+        diff >>>= BIT_SHIFT;
+      }
+      bytes[lastBytePos++] = (byte) diff; // Last byte, high bit not set.
+      size++;
+      lastInt = nextInt;
+    }
+
+    /** Trims the byte array to the number of bytes actually used. */
+    void done() {
+      resizeBytes(lastBytePos);
+    }
+  }
+
+
+  /** Resets this list to empty with a small initial buffer. */
+  private void initBytes() {
+    size = 0;
+    bytes = new byte[128]; // initial byte size
+    lastBytePos = 0;
+  }
+
+  /** Replaces the buffer by one of newSize bytes, keeping the used bytes. */
+  private void resizeBytes(int newSize) {
+    if (newSize != bytes.length) {
+      byte[] newBytes = new byte[newSize];
+      System.arraycopy(bytes, 0, newBytes, 0, lastBytePos);
+      bytes = newBytes;
+    }
+  }
+
+  /**
+   * @return    The total number of sorted integers.
+   */
+  public int size() {
+    return size;
+  }
+
+  /**
+   * @return The size of the byte array storing the compressed sorted integers.
+   */
+  public int getByteSize() {
+    return bytes.length;
+  }
+
+  /**
+   * @return    An iterator over the sorted integers. Its doc() returns 0
+   *            until next() or skipTo() has been called, and keeps the last
+   *            integer read once the list is exhausted.
+   */
+  public DocIdSetIterator iterator() {
+    return new DocIdSetIterator() {
+      int bytePos = 0;
+      int lastInt = 0;
+
+      /** Decodes the next difference and adds it to lastInt. */
+      private void advance() {
+        // See org.apache.lucene.store.IndexInput.readVInt()
+        byte b = bytes[bytePos++];
+        lastInt += b & VB1;
+        // A set high bit means another byte of this difference follows.
+        for (int s = BIT_SHIFT; (b & ~VB1) != 0; s += BIT_SHIFT) {
+          b = bytes[bytePos++];
+          lastInt += (b & VB1) << s;
+        }
+      }
+
+      public int doc() {return lastInt;}
+
+      public boolean next() {
+        if (bytePos >= lastBytePos) {
+          return false;
+        } else {
+          advance();
+          return true;
+        }
+      }
+
+      public boolean skipTo(int docNr) {
+        // Linear scan; always moves at least one integer forward if any is left.
+        while (bytePos < lastBytePos) {
+          advance();
+          if (lastInt >= docNr) { // No skipping to docNr available.
+            return true;
+          }
+        }
+        return false;
+      }
+    };
+  }
+}
+
diff --git a/src/test/org/apache/lucene/search/CachingWrapperFilterHelper.java b/src/test/org/apache/lucene/search/CachingWrapperFilterHelper.java index 4783b2f4e99..7019e79f155 100644 --- a/src/test/org/apache/lucene/search/CachingWrapperFilterHelper.java +++ b/src/test/org/apache/lucene/search/CachingWrapperFilterHelper.java @@ -43,13 +43,13 @@ public class CachingWrapperFilterHelper extends CachingWrapperFilter { this.shouldHaveCache = shouldHaveCache; } - public BitSet bits(IndexReader reader) throws IOException { + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { if (cache == null) { cache = new WeakHashMap(); } synchronized (cache) { // check cache - BitSet cached = (BitSet) cache.get(reader); + DocIdSet cached = (DocIdSet) cache.get(reader); if (shouldHaveCache) { TestCase.assertNotNull("Cache should have data ", cached); } else { @@ -60,7 +60,7 @@ public class CachingWrapperFilterHelper extends CachingWrapperFilter { } } - final BitSet bits = filter.bits(reader); + final DocIdSet bits = filter.getDocIdSet(reader); synchronized (cache) { // update cache cache.put(reader, bits); diff --git
a/src/test/org/apache/lucene/search/MockFilter.java b/src/test/org/apache/lucene/search/MockFilter.java index 4f36fce01cc..0ec7258e310 100644 --- a/src/test/org/apache/lucene/search/MockFilter.java +++ b/src/test/org/apache/lucene/search/MockFilter.java @@ -18,14 +18,15 @@ package org.apache.lucene.search; */ import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.DocIdBitSet; import java.util.BitSet; public class MockFilter extends Filter { private boolean wasCalled; - public BitSet bits(IndexReader reader) { + public DocIdSet getDocIdSet(IndexReader reader) { wasCalled = true; - return new BitSet(); + return new DocIdBitSet(new BitSet()); } public void clear() { diff --git a/src/test/org/apache/lucene/search/RemoteCachingWrapperFilterHelper.java b/src/test/org/apache/lucene/search/RemoteCachingWrapperFilterHelper.java index 859bac0a66f..257a51c1e8a 100644 --- a/src/test/org/apache/lucene/search/RemoteCachingWrapperFilterHelper.java +++ b/src/test/org/apache/lucene/search/RemoteCachingWrapperFilterHelper.java @@ -42,7 +42,7 @@ public class RemoteCachingWrapperFilterHelper extends RemoteCachingWrapperFilter this.shouldHaveCache = shouldHaveCache; } - public BitSet bits(IndexReader reader) throws IOException { + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { Filter cachedFilter = FilterManager.getInstance().getFilter(filter); TestCase.assertNotNull("Filter should not be null", cachedFilter); @@ -55,6 +55,6 @@ public class RemoteCachingWrapperFilterHelper extends RemoteCachingWrapperFilter if (filter instanceof CachingWrapperFilterHelper) { ((CachingWrapperFilterHelper)cachedFilter).setShouldHaveCache(shouldHaveCache); } - return cachedFilter.bits(reader); + return cachedFilter.getDocIdSet(reader); } } diff --git a/src/test/org/apache/lucene/search/SingleDocTestFilter.java b/src/test/org/apache/lucene/search/SingleDocTestFilter.java index c1d8b1ba093..b429f7025be 100644 --- 
a/src/test/org/apache/lucene/search/SingleDocTestFilter.java +++ b/src/test/org/apache/lucene/search/SingleDocTestFilter.java @@ -18,6 +18,7 @@ package org.apache.lucene.search; */ import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.DocIdBitSet; import java.util.BitSet; import java.io.IOException; @@ -29,9 +30,9 @@ public class SingleDocTestFilter extends Filter { this.doc = doc; } - public BitSet bits(IndexReader reader) throws IOException { + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { BitSet bits = new BitSet(reader.maxDoc()); bits.set(doc); - return bits; + return new DocIdBitSet(bits); } } diff --git a/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java b/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java index ae552d76f2a..9a4c7d03b99 100644 --- a/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java +++ b/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java @@ -36,12 +36,12 @@ public class TestCachingWrapperFilter extends LuceneTestCase { CachingWrapperFilter cacher = new CachingWrapperFilter(filter); // first time, nested filter is called - cacher.bits(reader); + cacher.getDocIdSet(reader); assertTrue("first time", filter.wasCalled()); // second time, nested filter should not be called filter.clear(); - cacher.bits(reader); + cacher.getDocIdSet(reader); assertFalse("second time", filter.wasCalled()); reader.close(); diff --git a/src/test/org/apache/lucene/search/TestExplanations.java b/src/test/org/apache/lucene/search/TestExplanations.java index ed06916ca16..4840325b845 100644 --- a/src/test/org/apache/lucene/search/TestExplanations.java +++ b/src/test/org/apache/lucene/search/TestExplanations.java @@ -33,6 +33,7 @@ import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.DocIdBitSet; import java.util.Random; import java.util.BitSet; @@ 
-122,12 +123,12 @@ public class TestExplanations extends LuceneTestCase { public ItemizedFilter(int[] docs) { this.docs = docs; } - public BitSet bits(IndexReader r) { + public DocIdSet getDocIdSet(IndexReader r) { BitSet b = new BitSet(r.maxDoc()); for (int i = 0; i < docs.length; i++) { b.set(docs[i]); } - return b; + return new DocIdBitSet(b); } } diff --git a/src/test/org/apache/lucene/search/TestFilteredQuery.java b/src/test/org/apache/lucene/search/TestFilteredQuery.java index b5b1ae8af90..c4be79511fb 100644 --- a/src/test/org/apache/lucene/search/TestFilteredQuery.java +++ b/src/test/org/apache/lucene/search/TestFilteredQuery.java @@ -26,6 +26,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.DocIdBitSet; import java.util.BitSet; @@ -82,11 +83,11 @@ extends LuceneTestCase { // must be static for serialization tests private static Filter newStaticFilterB() { return new Filter() { - public BitSet bits (IndexReader reader) { + public DocIdSet getDocIdSet (IndexReader reader) { BitSet bitset = new BitSet(5); bitset.set (1); bitset.set (3); - return bitset; + return new DocIdBitSet(bitset); } }; } @@ -150,10 +151,10 @@ extends LuceneTestCase { // must be static for serialization tests private static Filter newStaticFilterA() { return new Filter() { - public BitSet bits (IndexReader reader) { + public DocIdSet getDocIdSet (IndexReader reader) { BitSet bitset = new BitSet(5); bitset.set(0, 5); - return bitset; + return new DocIdBitSet(bitset); } }; } @@ -200,3 +201,4 @@ extends LuceneTestCase { } + diff --git a/src/test/org/apache/lucene/search/TestRemoteCachingWrapperFilter.java b/src/test/org/apache/lucene/search/TestRemoteCachingWrapperFilter.java index 63bf0af9a3e..63779c9b50b 100644 --- a/src/test/org/apache/lucene/search/TestRemoteCachingWrapperFilter.java +++ 
b/src/test/org/apache/lucene/search/TestRemoteCachingWrapperFilter.java @@ -91,7 +91,7 @@ public class TestRemoteCachingWrapperFilter extends LuceneTestCase { public void testTermRemoteFilter() throws Exception { - CachingWrapperFilterHelper cwfh = new CachingWrapperFilterHelper(new QueryFilter(new TermQuery(new Term("type", "a")))); + CachingWrapperFilterHelper cwfh = new CachingWrapperFilterHelper(new QueryWrapperFilter(new TermQuery(new Term("type", "a")))); // This is what we are fixing - if one uses a CachingWrapperFilter(Helper) it will never // cache the filter on the remote site @@ -112,16 +112,16 @@ public class TestRemoteCachingWrapperFilter extends LuceneTestCase { // assert that we get the same cached Filter, even if we create a new instance of RemoteCachingWrapperFilter(Helper) // this should pass because the Filter parameters are the same, and the cache uses Filter's hashCode() as cache keys, // and Filters' hashCode() builds on Filter parameters, not the Filter instance itself - rcwfh = new RemoteCachingWrapperFilterHelper(new QueryFilter(new TermQuery(new Term("type", "a"))), false); + rcwfh = new RemoteCachingWrapperFilterHelper(new QueryWrapperFilter(new TermQuery(new Term("type", "a"))), false); rcwfh.shouldHaveCache(false); search(new TermQuery(new Term("test", "test")), rcwfh, 0, "A"); - rcwfh = new RemoteCachingWrapperFilterHelper(new QueryFilter(new TermQuery(new Term("type", "a"))), false); + rcwfh = new RemoteCachingWrapperFilterHelper(new QueryWrapperFilter(new TermQuery(new Term("type", "a"))), false); rcwfh.shouldHaveCache(true); search(new TermQuery(new Term("test", "test")), rcwfh, 0, "A"); // assert that we get a non-cached version of the Filter because this is a new Query (type:b) - rcwfh = new RemoteCachingWrapperFilterHelper(new QueryFilter(new TermQuery(new Term("type", "b"))), false); + rcwfh = new RemoteCachingWrapperFilterHelper(new QueryWrapperFilter(new TermQuery(new Term("type", "b"))), false); rcwfh.shouldHaveCache(false); 
search(new TermQuery(new Term("type", "b")), rcwfh, 0, "B"); } diff --git a/src/test/org/apache/lucene/search/TestRemoteSearchable.java b/src/test/org/apache/lucene/search/TestRemoteSearchable.java index 63b463c3304..d1e0f2b1948 100644 --- a/src/test/org/apache/lucene/search/TestRemoteSearchable.java +++ b/src/test/org/apache/lucene/search/TestRemoteSearchable.java @@ -116,11 +116,11 @@ public class TestRemoteSearchable extends LuceneTestCase { Searcher searcher = new MultiSearcher(searchables); Hits hits = searcher.search( new TermQuery(new Term("test", "text")), - new QueryFilter(new TermQuery(new Term("test", "test")))); + new QueryWrapperFilter(new TermQuery(new Term("test", "test")))); assertEquals(1, hits.length()); Hits nohits = searcher.search( new TermQuery(new Term("test", "text")), - new QueryFilter(new TermQuery(new Term("test", "non-existent-term")))); + new QueryWrapperFilter(new TermQuery(new Term("test", "non-existent-term")))); assertEquals(0, nohits.length()); } @@ -129,7 +129,7 @@ public class TestRemoteSearchable extends LuceneTestCase { Searchable[] searchables = { getRemote() }; Searcher searcher = new MultiSearcher(searchables); Hits hits = searcher.search( - new ConstantScoreQuery(new QueryFilter( + new ConstantScoreQuery(new QueryWrapperFilter( new TermQuery(new Term("test", "test"))))); assertEquals(1, hits.length()); } diff --git a/src/test/org/apache/lucene/search/TestScorerPerf.java b/src/test/org/apache/lucene/search/TestScorerPerf.java index ad180da8e75..6f5dcdcb259 100755 --- a/src/test/org/apache/lucene/search/TestScorerPerf.java +++ b/src/test/org/apache/lucene/search/TestScorerPerf.java @@ -1,5 +1,6 @@ package org.apache.lucene.search; +import org.apache.lucene.util.DocIdBitSet; import org.apache.lucene.util.LuceneTestCase; import java.util.Random; @@ -95,16 +96,6 @@ public class TestScorerPerf extends LuceneTestCase { return sets; } - public static class BitSetFilter extends Filter { - public BitSet set; - public 
BitSetFilter(BitSet set) { - this.set = set; - } - public BitSet bits(IndexReader reader) throws IOException { - return set; - } - } - public static class CountingHitCollector extends HitCollector { int count=0; int sum=0; @@ -137,8 +128,12 @@ public class TestScorerPerf extends LuceneTestCase { BitSet addClause(BooleanQuery bq, BitSet result) { - BitSet rnd = sets[r.nextInt(sets.length)]; - Query q = new ConstantScoreQuery(new BitSetFilter(rnd)); + final BitSet rnd = sets[r.nextInt(sets.length)]; + Query q = new ConstantScoreQuery(new Filter() { + public DocIdSet getDocIdSet(IndexReader reader) { + return new DocIdBitSet(rnd); + }; + }); bq.add(q, BooleanClause.Occur.MUST); if (validate) { if (result==null) result = (BitSet)rnd.clone(); diff --git a/src/test/org/apache/lucene/search/TestSort.java b/src/test/org/apache/lucene/search/TestSort.java index bb8ac51a570..8e021cc7cde 100644 --- a/src/test/org/apache/lucene/search/TestSort.java +++ b/src/test/org/apache/lucene/search/TestSort.java @@ -28,6 +28,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.DocIdBitSet; import java.io.IOException; import java.io.Serializable; @@ -571,10 +572,10 @@ implements Serializable { // a filter that only allows through the first hit Filter filt = new Filter() { - public BitSet bits(IndexReader reader) throws IOException { + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { BitSet bs = new BitSet(reader.maxDoc()); bs.set(docs1.scoreDocs[0].doc); - return bs; + return new DocIdBitSet(bs); } }; diff --git a/src/test/org/apache/lucene/search/TestSpanQueryFilter.java b/src/test/org/apache/lucene/search/TestSpanQueryFilter.java index 17311bab170..7d8ce1e01a0 100644 --- a/src/test/org/apache/lucene/search/TestSpanQueryFilter.java +++ b/src/test/org/apache/lucene/search/TestSpanQueryFilter.java @@ -56,20 +56,36 @@ 
public class TestSpanQueryFilter extends LuceneTestCase { SpanTermQuery query = new SpanTermQuery(new Term("field", English.intToEnglish(10).trim())); SpanQueryFilter filter = new SpanQueryFilter(query); SpanFilterResult result = filter.bitSpans(reader); - BitSet bits = result.getBits(); - assertTrue("bits is null and it shouldn't be", bits != null); - assertTrue("tenth bit is not on", bits.get(10)); + DocIdSet docIdSet = result.getDocIdSet(); + assertTrue("docIdSet is null and it shouldn't be", docIdSet != null); + assertContainsDocId("docIdSet doesn't contain docId 10", docIdSet, 10); List spans = result.getPositions(); assertTrue("spans is null and it shouldn't be", spans != null); - assertTrue("spans Size: " + spans.size() + " is not: " + bits.cardinality(), spans.size() == bits.cardinality()); + int size = getDocIdSetSize(docIdSet); + assertTrue("spans Size: " + spans.size() + " is not: " + size, spans.size() == size); for (Iterator iterator = spans.iterator(); iterator.hasNext();) { SpanFilterResult.PositionInfo info = (SpanFilterResult.PositionInfo) iterator.next(); assertTrue("info is null and it shouldn't be", info != null); //The doc should indicate the bit is on - assertTrue("Bit is not on and it should be", bits.get(info.getDoc())); + assertContainsDocId("docIdSet doesn't contain docId " + info.getDoc(), docIdSet, info.getDoc()); //There should be two positions in each assertTrue("info.getPositions() Size: " + info.getPositions().size() + " is not: " + 2, info.getPositions().size() == 2); } reader.close(); } + + int getDocIdSetSize(DocIdSet docIdSet) throws Exception { + int size = 0; + DocIdSetIterator it = docIdSet.iterator(); + while (it.next()) { + size++; + } + return size; + } + + public void assertContainsDocId(String msg, DocIdSet docIdSet, int docId) throws Exception { + DocIdSetIterator it = docIdSet.iterator(); + assertTrue(msg, it.skipTo(docId)); + assertTrue(msg, it.doc() == docId); + } } diff --git 
a/src/test/org/apache/lucene/util/TestOpenBitSet.java b/src/test/org/apache/lucene/util/TestOpenBitSet.java new file mode 100644 index 00000000000..33fa7a12b46 --- /dev/null +++ b/src/test/org/apache/lucene/util/TestOpenBitSet.java @@ -0,0 +1,203 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.util; + +import junit.framework.TestCase; + +import java.util.Random; +import java.util.BitSet; + +/** + * @version $Id$ + */ +public class TestOpenBitSet extends TestCase { + static Random rand = new Random(); + + void doGet(BitSet a, OpenBitSet b) { + int max = a.size(); + for (int i=0; i=0); + } + + // test interleaving different BitSetIterator.next() + void doIterate(BitSet a, OpenBitSet b) { + int aa=-1,bb=-1; + OpenBitSetIterator iterator = new OpenBitSetIterator(b); + do { + aa = a.nextSetBit(aa+1); + if (rand.nextBoolean()) + iterator.next(); + else + iterator.skipTo(bb+1); + bb = iterator.doc(); + assertEquals(aa,bb); + } while (aa>=0); + } + + + void doRandomSets(int maxSize, int iter) { + BitSet a0=null; + OpenBitSet b0=null; + + for (int i=0; i0) { + int nOper = rand.nextInt(sz); + for (int j=0; j>1)+1); + BitSet aa = (BitSet)a.clone(); aa.flip(fromIndex,toIndex); + OpenBitSet bb = (OpenBitSet)b.clone(); bb.flip(fromIndex,toIndex); + + doIterate(aa,bb); // a problem here is from flip or doIterate + + fromIndex = rand.nextInt(sz+80); + toIndex = fromIndex + rand.nextInt((sz>>1)+1); + aa = (BitSet)a.clone(); aa.clear(fromIndex,toIndex); + bb = (OpenBitSet)b.clone(); bb.clear(fromIndex,toIndex); + + doNextSetBit(aa,bb); // a problem here is from clear() or nextSetBit + + fromIndex = rand.nextInt(sz+80); + toIndex = fromIndex + rand.nextInt((sz>>1)+1); + aa = (BitSet)a.clone(); aa.set(fromIndex,toIndex); + bb = (OpenBitSet)b.clone(); bb.set(fromIndex,toIndex); + + doNextSetBit(aa,bb); // a problem here is from set() or nextSetBit + + + if (a0 != null) { + assertEquals( a.equals(a0), b.equals(b0)); + + assertEquals(a.cardinality(), b.cardinality()); + + BitSet a_and = (BitSet)a.clone(); a_and.and(a0); + BitSet a_or = (BitSet)a.clone(); a_or.or(a0); + BitSet a_xor = (BitSet)a.clone(); a_xor.xor(a0); + BitSet a_andn = (BitSet)a.clone(); a_andn.andNot(a0); + + OpenBitSet b_and = (OpenBitSet)b.clone(); assertEquals(b,b_and); 
b_and.and(b0); + OpenBitSet b_or = (OpenBitSet)b.clone(); b_or.or(b0); + OpenBitSet b_xor = (OpenBitSet)b.clone(); b_xor.xor(b0); + OpenBitSet b_andn = (OpenBitSet)b.clone(); b_andn.andNot(b0); + + doIterate(a_and,b_and); + doIterate(a_or,b_or); + doIterate(a_xor,b_xor); + doIterate(a_andn,b_andn); + + assertEquals(a_and.cardinality(), b_and.cardinality()); + assertEquals(a_or.cardinality(), b_or.cardinality()); + assertEquals(a_xor.cardinality(), b_xor.cardinality()); + assertEquals(a_andn.cardinality(), b_andn.cardinality()); + + // test non-mutating popcounts + assertEquals(b_and.cardinality(), OpenBitSet.intersectionCount(b,b0)); + assertEquals(b_or.cardinality(), OpenBitSet.unionCount(b,b0)); + assertEquals(b_xor.cardinality(), OpenBitSet.xorCount(b,b0)); + assertEquals(b_andn.cardinality(), OpenBitSet.andNotCount(b,b0)); + } + + a0=a; + b0=b; + } + } + + // large enough to flush obvious bugs, small enough to run in <.5 sec as part of a + // larger testsuite. + public void testSmall() { + doRandomSets(1200,1000); + } + + public void testBig() { + // uncomment to run a bigger test (~2 minutes). 
+ // doRandomSets(2000,200000); + } + + public void testEquals() { + OpenBitSet b1 = new OpenBitSet(1111); + OpenBitSet b2 = new OpenBitSet(2222); + assertTrue(b1.equals(b2)); + assertTrue(b2.equals(b1)); + b1.set(10); + assertFalse(b1.equals(b2)); + assertFalse(b2.equals(b1)); + b2.set(10); + assertTrue(b1.equals(b2)); + assertTrue(b2.equals(b1)); + b2.set(2221); + assertFalse(b1.equals(b2)); + assertFalse(b2.equals(b1)); + b1.set(2221); + assertTrue(b1.equals(b2)); + assertTrue(b2.equals(b1)); + + // try different type of object + assertFalse(b1.equals(new Object())); + } + +} + + + diff --git a/src/test/org/apache/lucene/util/TestSortedVIntList.java b/src/test/org/apache/lucene/util/TestSortedVIntList.java new file mode 100644 index 00000000000..6ef92f3886e --- /dev/null +++ b/src/test/org/apache/lucene/util/TestSortedVIntList.java @@ -0,0 +1,198 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.BitSet; + +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + +import org.apache.lucene.search.DocIdSetIterator; + +public class TestSortedVIntList extends TestCase { + /** Main for running test case by itself. */ + public static void main(String args[]) { + TestRunner.run(new TestSuite(TestSortedVIntList.class)); + } + + void tstIterator ( + SortedVIntList vintList, + int[] ints) throws IOException { + for (int i = 0; i < ints.length; i++) { + if ((i > 0) && (ints[i-1] == ints[i])) { + return; // DocNrSkipper should not skip to same document. + } + } + DocIdSetIterator m = vintList.iterator(); + for (int i = 0; i < ints.length; i++) { + assertTrue("No end of Matcher at: " + i, m.next()); + assertEquals(ints[i], m.doc()); + } + assertTrue("End of Matcher", (! m.next())); + } + + void tstVIntList( + SortedVIntList vintList, + int[] ints, + int expectedByteSize) throws IOException { + assertEquals("Size", ints.length, vintList.size()); + assertEquals("Byte size", expectedByteSize, vintList.getByteSize()); + tstIterator(vintList, ints); + } + + public void tstViaBitSet(int [] ints, int expectedByteSize) throws IOException { + final int MAX_INT_FOR_BITSET = 1024 * 1024; + BitSet bs = new BitSet(); + for (int i = 0; i < ints.length; i++) { + if (ints[i] > MAX_INT_FOR_BITSET) { + return; // BitSet takes too much memory + } + if ((i > 0) && (ints[i-1] == ints[i])) { + return; // BitSet cannot store duplicate. 
+ } + bs.set(ints[i]); + } + SortedVIntList svil = new SortedVIntList(bs); + tstVIntList(svil, ints, expectedByteSize); + tstVIntList(new SortedVIntList(svil.iterator()), ints, expectedByteSize); + } + + private static final int VB1 = 0x7F; + private static final int BIT_SHIFT = 7; + private static final int VB2 = (VB1 << BIT_SHIFT) | VB1; + private static final int VB3 = (VB2 << BIT_SHIFT) | VB1; + private static final int VB4 = (VB3 << BIT_SHIFT) | VB1; + + private int vIntByteSize(int i) { + assert i >= 0; + if (i <= VB1) return 1; + if (i <= VB2) return 2; + if (i <= VB3) return 3; + if (i <= VB4) return 4; + return 5; + } + + private int vIntListByteSize(int [] ints) { + int byteSize = 0; + int last = 0; + for (int i = 0; i < ints.length; i++) { + byteSize += vIntByteSize(ints[i] - last); + last = ints[i]; + } + return byteSize; + } + + public void tstInts(int [] ints) { + int expectedByteSize = vIntListByteSize(ints); + try { + tstVIntList(new SortedVIntList(ints), ints, expectedByteSize); + tstViaBitSet(ints, expectedByteSize); + } catch (IOException ioe) { + throw new Error(ioe); + } + } + + public void tstIllegalArgExc(int [] ints) { + try { + new SortedVIntList(ints); + } + catch (IllegalArgumentException e) { + return; + } + fail("Expected IllegalArgumentException"); + } + + private int[] fibArray(int a, int b, int size) { + final int[] fib = new int[size]; + fib[0] = a; + fib[1] = b; + for (int i = 2; i < size; i++) { + fib[i] = fib[i-1] + fib[i-2]; + } + return fib; + } + + private int[] reverseDiffs(int []ints) { // reverse the order of the successive differences + final int[] res = new int[ints.length]; + for (int i = 0; i < ints.length; i++) { + res[i] = ints[ints.length - 1] + (ints[0] - ints[ints.length - 1 - i]); + } + return res; + } + + public void test01() { + tstInts(new int[] {}); + } + public void test02() { + tstInts(new int[] {0}); + } + public void test03() { + tstInts(new int[] {0,Integer.MAX_VALUE}); + } + public void test04a() { + 
tstInts(new int[] {0, VB2 - 1}); + } + public void test04b() { + tstInts(new int[] {0, VB2}); + } + public void test04c() { + tstInts(new int[] {0, VB2 + 1}); + } + public void test05() { + tstInts(fibArray(0,1,7)); // includes duplicate value 1 + } + public void test05b() { + tstInts(reverseDiffs(fibArray(0,1,7))); + } + public void test06() { + tstInts(fibArray(1,2,45)); // no duplicates, size 46 exceeds max int. + } + public void test06b() { + tstInts(reverseDiffs(fibArray(1,2,45))); + } + public void test07a() { + tstInts(new int[] {0, VB3}); + } + public void test07b() { + tstInts(new int[] {1, VB3 + 2}); + } + public void test07c() { + tstInts(new int[] {2, VB3 + 4}); + } + public void test08a() { + tstInts(new int[] {0, VB4 + 1}); + } + public void test08b() { + tstInts(new int[] {1, VB4 + 1}); + } + public void test08c() { + tstInts(new int[] {2, VB4 + 1}); + } + + public void test10() { + tstIllegalArgExc(new int[] {-1}); + } + public void test11() { + tstIllegalArgExc(new int[] {1,0}); + } + public void test12() { + tstIllegalArgExc(new int[] {0,1,1,2,3,5,8,0}); + } +}