From 8f96faf3d33e8b21e1324cfcaef6fb4732e6804e Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 30 Sep 2014 11:52:23 +0000 Subject: [PATCH] LUCENE-5938: Add a new sparse fixed bit set and remove ConstantScoreAutoRewrite. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1628402 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 7 + .../search/ConstantScoreAutoRewrite.java | 220 ----------- .../apache/lucene/search/MultiTermQuery.java | 46 +-- .../search/MultiTermQueryWrapperFilter.java | 33 +- .../lucene/search/NumericRangeQuery.java | 2 +- .../org/apache/lucene/search/PrefixQuery.java | 2 +- .../apache/lucene/search/ScoringRewrite.java | 2 +- .../apache/lucene/search/TermRangeQuery.java | 2 +- .../apache/lucene/search/WildcardQuery.java | 2 +- .../apache/lucene/util/DocIdSetBuilder.java | 94 +++++ .../apache/lucene/util/SparseFixedBitSet.java | 341 ++++++++++++++++++ .../apache/lucene/index/TestReaderClosed.java | 11 +- .../lucene/search/TestAutomatonQuery.java | 3 - .../search/TestAutomatonQueryUnicode.java | 3 - .../search/TestMultiTermConstantScore.java | 32 +- .../search/TestMultiTermQueryRewrites.java | 7 - .../apache/lucene/search/TestWildcard.java | 6 - .../lucene/util/TestDocIdSetBuilder.java | 92 +++++ .../lucene/util/TestSparseFixedBitSet.java | 81 +++++ .../apache/lucene/queries/TermsFilter.java | 40 +- .../lucene/queries/TermsFilterTest.java | 4 +- .../queryparser/classic/QueryParserBase.java | 4 +- .../CommonQueryParserConfiguration.java | 2 +- .../standard/StandardQueryParser.java | 2 +- .../config/StandardQueryConfigHandler.java | 2 +- .../MultiTermRewriteMethodProcessor.java | 2 +- .../flexible/standard/TestQPHelper.java | 10 +- .../queryparser/util/QueryParserTestBase.java | 4 +- .../sandbox/queries/DuplicateFilter.java | 16 +- .../queries/SlowCollatedTermRangeQuery.java | 2 +- .../lucene/util/BaseDocIdSetTestCase.java | 6 +- .../solr/parser/SolrQueryParserBase.java | 4 +- .../org/apache/solr/schema/FieldType.java | 2 
+- .../solr/update/AddBlockUpdateTest.java | 6 +- 34 files changed, 718 insertions(+), 374 deletions(-) delete mode 100644 lucene/core/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java create mode 100644 lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java create mode 100644 lucene/core/src/java/org/apache/lucene/util/SparseFixedBitSet.java create mode 100644 lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java create mode 100644 lucene/core/src/test/org/apache/lucene/util/TestSparseFixedBitSet.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7f61a1a9b32..ea10bbbc35c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -136,6 +136,9 @@ API Changes * LUCENE-5569: *AtomicReader/AtomicReaderContext have been renamed to *LeafReader/LeafReaderContext. (Ryan Ernst) +* LUCENE-5938: Removed MultiTermQuery.ConstantScoreAutoRewrite as + MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE is usually better. (Adrien Grand) + Bug Fixes * LUCENE-5650: Enforce read-only access to any path outside the temporary @@ -178,6 +181,10 @@ Optimizations * LUCENE-5963: Reduce memory allocations in AnalyzingSuggester. (Markus Heiden via Mike McCandless) +* LUCENE-5938: MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE is now faster on + queries that match few documents by using a sparse bit set implementation. + (Adrien Grand) + Build * LUCENE-5909: Smoke tester now has better command line parsing and diff --git a/lucene/core/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java b/lucene/core/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java deleted file mode 100644 index d26abe16c21..00000000000 --- a/lucene/core/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java +++ /dev/null @@ -1,220 +0,0 @@ -package org.apache.lucene.search; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; -import org.apache.lucene.index.TermState; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.ByteBlockPool; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefHash; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; - -class ConstantScoreAutoRewrite extends TermCollectingRewrite { - - // Defaults derived from rough tests with a 20.0 million - // doc Wikipedia index. With more than 350 terms in the - // query, the filter method is fastest: - public static int DEFAULT_TERM_COUNT_CUTOFF = 350; - - // If the query will hit more than 1 in 1000 of the docs - // in the index (0.1%), the filter method is fastest: - public static double DEFAULT_DOC_COUNT_PERCENT = 0.1; - - private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF; - private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT; - - /** If the number of terms in this query is equal to or - * larger than this setting then {@link - * MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} is used. 
*/ - public void setTermCountCutoff(int count) { - termCountCutoff = count; - } - - /** @see #setTermCountCutoff */ - public int getTermCountCutoff() { - return termCountCutoff; - } - - /** If the number of documents to be visited in the - * postings exceeds this specified percentage of the - * maxDoc() for the index, then {@link - * MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} is used. - * @param percent 0.0 to 100.0 */ - public void setDocCountPercent(double percent) { - docCountPercent = percent; - } - - /** @see #setDocCountPercent */ - public double getDocCountPercent() { - return docCountPercent; - } - - @Override - protected BooleanQuery getTopLevelQuery() { - return new BooleanQuery(true); - } - - @Override - protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, TermContext states) { - topLevel.add(new TermQuery(term, states), BooleanClause.Occur.SHOULD); - } - - @Override - public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { - - // Get the enum and start visiting terms. If we - // exhaust the enum before hitting either of the - // cutoffs, we use ConstantBooleanQueryRewrite; else, - // ConstantFilterRewrite: - final int docCountCutoff = (int) ((docCountPercent / 100.) 
* reader.maxDoc()); - final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff); - - final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit); - collectTerms(reader, query, col); - final int size = col.pendingTerms.size(); - if (col.hasCutOff) { - return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query); - } else { - final BooleanQuery bq = getTopLevelQuery(); - if (size > 0) { - final BytesRefHash pendingTerms = col.pendingTerms; - final int sort[] = pendingTerms.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); - for(int i = 0; i < size; i++) { - final int pos = sort[i]; - // docFreq is not used for constant score here, we pass 1 - // to explicitely set a fake value, so it's not calculated - addClause(bq, new Term(query.field, pendingTerms.get(pos, new BytesRef())), 1, 1.0f, col.array.termState[pos]); - } - } - // Strip scores - final Query result = new ConstantScoreQuery(bq); - result.setBoost(query.getBoost()); - return result; - } - } - - static final class CutOffTermCollector extends TermCollector { - CutOffTermCollector(int docCountCutoff, int termCountLimit) { - this.docCountCutoff = docCountCutoff; - this.termCountLimit = termCountLimit; - } - - @Override - public void setNextEnum(TermsEnum termsEnum) { - this.termsEnum = termsEnum; - } - - @Override - public boolean collect(BytesRef bytes) throws IOException { - int pos = pendingTerms.add(bytes); - docVisitCount += termsEnum.docFreq(); - if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { - hasCutOff = true; - return false; - } - - final TermState termState = termsEnum.termState(); - assert termState != null; - if (pos < 0) { - pos = (-pos)-1; - array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); - } else { - array.termState[pos] = new TermContext(topReaderContext, termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); - 
} - return true; - } - - int docVisitCount = 0; - boolean hasCutOff = false; - TermsEnum termsEnum; - - final int docCountCutoff, termCountLimit; - final TermStateByteStart array = new TermStateByteStart(16); - final BytesRefHash pendingTerms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array); - } - - @Override - public int hashCode() { - final int prime = 1279; - return (int) (prime * termCountCutoff + Double.doubleToLongBits(docCountPercent)); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - - ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj; - if (other.termCountCutoff != termCountCutoff) { - return false; - } - - if (Double.doubleToLongBits(other.docCountPercent) != Double.doubleToLongBits(docCountPercent)) { - return false; - } - - return true; - } - - /** Special implementation of BytesStartArray that keeps parallel arrays for {@link TermContext} */ - static final class TermStateByteStart extends DirectBytesStartArray { - TermContext[] termState; - - public TermStateByteStart(int initSize) { - super(initSize); - } - - @Override - public int[] init() { - final int[] ord = super.init(); - termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - assert termState.length >= ord.length; - return ord; - } - - @Override - public int[] grow() { - final int[] ord = super.grow(); - if (termState.length < ord.length) { - TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - System.arraycopy(termState, 0, tmpTermState, 0, termState.length); - termState = tmpTermState; - } - assert termState.length >= ord.length; - return ord; - } - - @Override - public int[] clear() { - termState = null; - return super.clear(); - } - - } -} diff --git 
a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java index 7fb8da6b40b..1d148338c37 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java @@ -49,8 +49,8 @@ import org.apache.lucene.util.AttributeSource; * prevents this. * *

The recommended rewrite method is {@link - * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}: it doesn't spend CPU - * computing unhelpful scores, and it tries to pick the most + * #CONSTANT_SCORE_FILTER_REWRITE}: it doesn't spend CPU + * computing unhelpful scores, and is the most * performant rewrite method given the query. If you * need scoring (like {@link FuzzyQuery}, use * {@link TopTermsScoringBooleanQueryRewrite} which uses @@ -58,12 +58,12 @@ import org.apache.lucene.util.AttributeSource; * and not hit this limitation. * * Note that org.apache.lucene.queryparser.classic.QueryParser produces - * MultiTermQueries using {@link - * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} by default. + * MultiTermQueries using {@link #CONSTANT_SCORE_FILTER_REWRITE} + * by default. */ public abstract class MultiTermQuery extends Query { protected final String field; - protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; + protected RewriteMethod rewriteMethod = CONSTANT_SCORE_FILTER_REWRITE; /** Abstract class that defines how the query is rewritten. */ public static abstract class RewriteMethod { @@ -104,7 +104,7 @@ public abstract class MultiTermQuery extends Query { * query. Note that typically such scores are * meaningless to the user, and require non-trivial CPU * to compute, so it's almost always better to use {@link - * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead. + * #CONSTANT_SCORE_FILTER_REWRITE} instead. * *

NOTE: This rewrite method will hit {@link * BooleanQuery.TooManyClauses} if the number of terms @@ -208,40 +208,6 @@ public abstract class MultiTermQuery extends Query { topLevel.add(q, BooleanClause.Occur.SHOULD); } } - - /** A rewrite method that tries to pick the best - * constant-score rewrite method based on term and - * document counts from the query. If both the number of - * terms and documents is small enough, then {@link - * #CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE} is used. - * Otherwise, {@link #CONSTANT_SCORE_FILTER_REWRITE} is - * used. - */ - public static class ConstantScoreAutoRewrite extends org.apache.lucene.search.ConstantScoreAutoRewrite {} - - /** Read-only default instance of {@link - * ConstantScoreAutoRewrite}, with {@link - * ConstantScoreAutoRewrite#setTermCountCutoff} set to - * {@link - * ConstantScoreAutoRewrite#DEFAULT_TERM_COUNT_CUTOFF} - * and {@link - * ConstantScoreAutoRewrite#setDocCountPercent} set to - * {@link - * ConstantScoreAutoRewrite#DEFAULT_DOC_COUNT_PERCENT}. - * Note that you cannot alter the configuration of this - * instance; you'll need to create a private instance - * instead. 
*/ - public final static RewriteMethod CONSTANT_SCORE_AUTO_REWRITE_DEFAULT = new ConstantScoreAutoRewrite() { - @Override - public void setTermCountCutoff(int count) { - throw new UnsupportedOperationException("Please create a private instance"); - } - - @Override - public void setDocCountPercent(double percent) { - throw new UnsupportedOperationException("Please create a private instance"); - } - }; /** * Constructs a query matching terms that cannot be represented with a single diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java index 403d529e2b8..dfad53ed9b3 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java @@ -19,14 +19,14 @@ package org.apache.lucene.search; import java.io.IOException; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.Fields; import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.DocIdSetBuilder; /** * A wrapper for {@link MultiTermQuery}, that exposes its @@ -43,7 +43,7 @@ import org.apache.lucene.util.Bits; * this is why it is not abstract. 
*/ public class MultiTermQueryWrapperFilter extends Filter { - + protected final Q query; /** @@ -52,7 +52,7 @@ public class MultiTermQueryWrapperFilter extends Filte protected MultiTermQueryWrapperFilter(Q query) { this.query = query; } - + @Override public String toString() { // query.toString should be ok for the filter, too, if the query boost is 1.0f @@ -77,7 +77,7 @@ public class MultiTermQueryWrapperFilter extends Filte /** Returns the field name for this query */ public final String getField() { return query.getField(); } - + /** * Returns a DocIdSet with documents that should be permitted in search * results. @@ -99,24 +99,13 @@ public class MultiTermQueryWrapperFilter extends Filte final TermsEnum termsEnum = query.getTermsEnum(terms); assert termsEnum != null; - if (termsEnum.next() != null) { - // fill into a FixedBitSet - final FixedBitSet bitSet = new FixedBitSet(context.reader().maxDoc()); - DocsEnum docsEnum = null; - do { - // System.out.println(" iter termCount=" + termCount + " term=" + - // enumerator.term().toBytesString()); - docsEnum = termsEnum.docs(acceptDocs, docsEnum, DocsEnum.FLAG_NONE); - int docid; - while ((docid = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - bitSet.set(docid); - } - } while (termsEnum.next() != null); - // System.out.println(" done termCount=" + termCount); - return bitSet; - } else { - return null; + DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc()); + DocsEnum docs = null; + while (termsEnum.next() != null) { + docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE); + builder.or(docs); } + return builder.build(); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/NumericRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/NumericRangeQuery.java index 36784b08121..6736516748f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/NumericRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/NumericRangeQuery.java @@ -72,7 +72,7 @@ 
import org.apache.lucene.index.Term; // for javadocs * details. * *

This query defaults to {@linkplain - * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}. + * MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE}. * With precision steps of ≤4, this query can be run with * one of the BooleanQuery rewrite methods without changing * BooleanQuery's default max clause count. diff --git a/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java b/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java index e1aaa0ca1b1..84bdca2afb0 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java @@ -29,7 +29,7 @@ import org.apache.lucene.util.ToStringUtils; * is built by QueryParser for input like app*. * *

This query uses the {@link - * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} + * MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} * rewrite method. */ public class PrefixQuery extends MultiTermQuery { private Term prefix; diff --git a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java index 954b2bbb596..47d9740dbdd 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java +++ b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java @@ -45,7 +45,7 @@ public abstract class ScoringRewrite extends TermCollectingRewr * query. Note that typically such scores are * meaningless to the user, and require non-trivial CPU * to compute, so it's almost always better to use {@link - * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead. + * MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} instead. * *

NOTE: This rewrite method will hit {@link * BooleanQuery.TooManyClauses} if the number of terms diff --git a/lucene/core/src/java/org/apache/lucene/search/TermRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermRangeQuery.java index 2f744ffd38f..794d5cd5beb 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermRangeQuery.java @@ -35,7 +35,7 @@ import org.apache.lucene.util.ToStringUtils; * for numerical ranges; use {@link NumericRangeQuery} instead. * *

This query uses the {@link - * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} + * MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} * rewrite method. * @since 2.9 */ diff --git a/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java b/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java index 91b473e751f..eae96855c5e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java @@ -35,7 +35,7 @@ import org.apache.lucene.util.automaton.Automaton; * a Wildcard term should not start with the wildcard * * *

This query uses the {@link - * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} + * MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} * rewrite method. * * @see AutomatonQuery diff --git a/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java new file mode 100644 index 00000000000..28d478c9f7f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java @@ -0,0 +1,94 @@ +package org.apache.lucene.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; + +/** + * A builder of {@link DocIdSet}s that supports random access. + * @lucene.internal + */ +public final class DocIdSetBuilder { + + private final int maxDoc; + private final int threshold; + private SparseFixedBitSet sparseSet; + private FixedBitSet denseSet; + + // we cache an upper bound of the cost of this builder so that we don't have + // to re-compute approximateCardinality on the sparse set every time + private long costUpperBound; + + /** Sole constructor. 
*/ + public DocIdSetBuilder(int maxDoc) { + this.maxDoc = maxDoc; + threshold = maxDoc >>> 14; + } + + /** + * Add the content of the provided {@link DocIdSetIterator} to this builder. + */ + public void or(DocIdSetIterator it) throws IOException { + if (denseSet != null) { + // already upgraded + denseSet.or(it); + return; + } + + final long itCost = it.cost(); + costUpperBound += itCost; + if (costUpperBound >= threshold) { + costUpperBound = (sparseSet == null ? 0 : sparseSet.approximateCardinality()) + itCost; + + if (costUpperBound >= threshold) { + // upgrade + denseSet = new FixedBitSet(maxDoc); + denseSet.or(it); + if (sparseSet != null) { + denseSet.or(sparseSet.iterator()); + } + return; + } + } + + // we are still sparse + if (sparseSet == null) { + sparseSet = new SparseFixedBitSet(maxDoc); + } + sparseSet.or(it); + } + + /** + * Build a {@link DocIdSet} that contains all doc ids that have been added. + * This method may return null if no documents were added to this + * builder. + * NOTE: this is a destructive operation, the builder should not be used + * anymore after this method has been called. + */ + public DocIdSet build() { + final DocIdSet result = denseSet != null ? denseSet : sparseSet; + denseSet = null; + sparseSet = null; + costUpperBound = 0; + return result; + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/util/SparseFixedBitSet.java b/lucene/core/src/java/org/apache/lucene/util/SparseFixedBitSet.java new file mode 100644 index 00000000000..cf7f5940cdb --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/SparseFixedBitSet.java @@ -0,0 +1,341 @@ +package org.apache.lucene.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; + +/** + * A bit set that only stores longs that have at least one bit which is set. + * The way it works is that the space of bits is divided into blocks of + * 4096 bits, which is 64 longs. Then for each block, we have:

+ * + * @lucene.internal + */ +public class SparseFixedBitSet extends DocIdSet implements Bits { + + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(SparseFixedBitSet.class); + private static final long SINGLE_ELEMENT_ARRAY_BYTES_USED = RamUsageEstimator.sizeOf(new long[1]); + + private static int blockCount(int length) { + int blockCount = length >>> 12; + if ((blockCount << 12) < length) { + ++blockCount; + } + assert (blockCount << 12) >= length; + return blockCount; + } + + final long[] indices; + final long[][] bits; + final int length; + int nonZeroLongCount; + long ramBytesUsed; + + /** Create a {@link SparseFixedBitSet} that can contain bits between + * 0 included and length excluded. */ + public SparseFixedBitSet(int length) { + if (length < 1) { + throw new IllegalArgumentException("length needs to be >= 1"); + } + this.length = length; + final int blockCount = blockCount(length); + indices = new long[blockCount]; + bits = new long[blockCount][]; + ramBytesUsed = BASE_RAM_BYTES_USED + + RamUsageEstimator.shallowSizeOf(indices) + + RamUsageEstimator.shallowSizeOf(bits); + } + + @Override + public boolean isCacheable() { + return true; + } + + @Override + public int length() { + return length; + } + + @Override + public Bits bits() throws IOException { + return this; + } + + private boolean consistent(int index) { + assert index >= 0 && index < length : "index=" + index + ",length=" + length; + return true; + } + + /** + * Compute the cardinality of this set. + * NOTE: this operation runs in linear time. + */ + public int cardinality() { + int cardinality = 0; + for (long[] bitArray : bits) { + if (bitArray != null) { + for (long bits : bitArray) { + cardinality += Long.bitCount(bits); + } + } + } + return cardinality; + } + + /** + * Return an approximation of the cardinality of this set, assuming that bits + * are uniformly distributed. This operation runs in constant time. 
+ */ + public int approximateCardinality() { + // this is basically the linear counting algorithm + final int totalLongs = (length + 63) >>> 6; // total number of longs in the space + assert totalLongs >= nonZeroLongCount; + final int zeroLongs = totalLongs - nonZeroLongCount; // number of longs that are zeros + // No need to guard against division by zero, it will return +Infinity and things will work as expected + final long estimate = Math.round(totalLongs * Math.log((double) totalLongs / zeroLongs)); + return (int) Math.min(length, estimate); + } + + @Override + public boolean get(int i) { + assert consistent(i); + final int i4096 = i >>> 12; + final long index = indices[i4096]; + final int i64 = i >>> 6; + // first check the index, if the i64-th bit is not set, then i is not set + // note: this relies on the fact that shifts are mod 64 in java + if ((index & (1L << i64)) == 0) { + return false; + } + + // if it is set, then we count the number of bits that are set on the right + // of i64, and that gives us the index of the long that stores the bits we + // are interested in + final long bits = this.bits[i4096][Long.bitCount(index & ((1L << i64) - 1))]; + return (bits & (1L << i)) != 0; + } + + private static int oversize(int s) { + int newSize = s + (s >>> 1); + if (newSize > 50) { + newSize = 64; + } + return newSize; + } + + /** + * Set the bit at index i. + */ + public void set(int i) { + assert consistent(i); + final int i4096 = i >>> 12; + final long index = indices[i4096]; + final int i64 = i >>> 6; + if (index == 0) { + // if the index is 0, it means that we just found a block of 4096 bits + // that has no bit that is set yet. 
So let's initialize a new block: + indices[i4096] = 1L << i64; // shifts are mod 64 in java + assert bits[i4096] == null; + bits[i4096] = new long[] { 1L << i }; // shifts are mod 64 in java + ++nonZeroLongCount; + ramBytesUsed += SINGLE_ELEMENT_ARRAY_BYTES_USED; + } else if ((index & (1L << i64)) == 0) { + // in that case we found a block of 4096 bits that has some values, but + // the sub-block of 64 bits that we are interested in has no value yet, + // so we need to insert a new long + insertLong(i4096, i64, i, index); + } else { + // in that case the sub 64-bits block we are interested in already exists, + // we just need to set a bit in an existing long: the number of ones on + // the right of i64 gives us the index of the long we need to update + bits[i4096][Long.bitCount(index & ((1L << i64) - 1))] |= 1L << i; // shifts are mod 64 in java + } + } + + private void insertLong(int i4096, int i64, int i, long index) { + indices[i4096] |= 1L << i64; // shifts are mod 64 in java + // we count the number of bits that are set on the right of i64 + // this gives us the index at which to perform the insertion + final int o = Long.bitCount(index & ((1L << i64) - 1)); + final long[] bitArray = bits[i4096]; + if (bitArray[bitArray.length - 1] == 0) { + // since we only store non-zero longs, if the last value is 0, it means + // that we already have extra space, make use of it + System.arraycopy(bitArray, o, bitArray, o + 1, bitArray.length - o - 1); + bitArray[o] = 1L << i; + } else { + // we don't have extra space so we need to resize to insert the new long + final int newSize = oversize(bitArray.length + 1); + final long[] newBitArray = new long[newSize]; + System.arraycopy(bitArray, 0, newBitArray, 0, o); + newBitArray[o] = 1L << i; + System.arraycopy(bitArray, o, newBitArray, o + 1, bitArray.length - o); + bits[i4096] = newBitArray; + ramBytesUsed += (newSize - bitArray.length) * RamUsageEstimator.NUM_BYTES_LONG; + } + ++nonZeroLongCount; + } + + /** + * Add the 
documents contained in the provided {@link DocIdSetIterator} to + * this bit set. + */ + public void or(DocIdSetIterator it) throws IOException { + for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { + set(doc); + } + } + + @Override + public long ramBytesUsed() { + return ramBytesUsed; + } + + @Override + public DocIdSetIterator iterator() throws IOException { + return new Iterator(); + } + + class Iterator extends DocIdSetIterator { + + private int doc = -1; + private int cost = -1; + + @Override + public int docID() { + return doc; + } + + /** Return the first document that occurs on or after the provided block index. */ + private int firstDoc(int i4096) { + long index = 0; + while (i4096 < indices.length) { + index = indices[i4096]; + if (index != 0) { + final int i64 = Long.numberOfTrailingZeros(index); + return doc = (i4096 << 12) | (i64 << 6) | Long.numberOfTrailingZeros(bits[i4096][0]); + } + i4096 += 1; + } + return doc = NO_MORE_DOCS; + } + + @Override + public int nextDoc() throws IOException { + if (doc == NO_MORE_DOCS || ++doc >= length) { + return doc = NO_MORE_DOCS; + } + return currentOrNextDoc(); + } + + private int currentOrNextDoc() { + final int i4096 = doc >>> 12; + final long index = indices[i4096]; + if (index == 0) { + // if the index is zero, it means that there is no value in the + // current block, so return the first document of the next block + return firstDoc(i4096 + 1); + } else { + // now we are on a block that contains at least one document + assert Long.bitCount(index) <= bits[i4096].length; + int i64 = doc >>> 6; + long indexBits = index >>> i64; // shifts are mod 64 in java + if (indexBits == 0) { + // if neither the i64-th bit or any other bit on its left is set then + // it means that there are no more documents in this block, go to the + // next one + return firstDoc(i4096 + 1); + } + + // We know we still have some 64-bits blocks that have bits set, let's + // advance to the next one by 
skipping trailing zeros of the index + int i1 = doc & 0x3F; + int trailingZeros = Long.numberOfTrailingZeros(indexBits); + if (trailingZeros != 0) { + // no bits in the current long, go to the next one + i64 += trailingZeros; + i1 = 0; + } + + // So now we are on a sub 64-bits block that has values + assert (index & (1L << i64)) != 0; + // we count the number of ones on the right of i64 to figure out the + // index of the long that contains the bits we are interested in + int longIndex = Long.bitCount(index & ((1L << i64) - 1)); // shifts are mod 64 in java + final long[] longArray = bits[i4096]; + assert longArray[longIndex] != 0; + long bits = SparseFixedBitSet.this.bits[i4096][longIndex] >>> i1; // shifts are mod 64 in java + if (bits != 0L) { + // hurray, we found some non-zero bits, this gives us the next document: + i1 += Long.numberOfTrailingZeros(bits); + return doc = (i4096 << 12) | ((i64 & 0x3F) << 6) | i1; + } + + // otherwise it means that although we were on a sub-64 block that contains + // documents, all documents of this sub-block have already been consumed + // so two cases: + indexBits = index >>> i64 >>> 1; // we don't shift by (i64+1) otherwise we might shift by a multiple of 64 which is a no-op + if (indexBits == 0) { + // Case 1: this was the last long of the block of 4096 bits, then go + // to the next block + return firstDoc(i4096 + 1); + } + // Case 2: go to the next sub 64-bits block in the current block of 4096 bits + // by skipping trailing zeros of the index + trailingZeros = Long.numberOfTrailingZeros(indexBits); + i64 += 1 + trailingZeros; + bits = longArray[longIndex + 1]; + assert bits != 0; + i1 = Long.numberOfTrailingZeros(bits); + return doc = (i4096 << 12) | ((i64 & 0x3F) << 6) | i1; + } + } + + @Override + public int advance(int target) throws IOException { + if (target >= length) { + return doc = NO_MORE_DOCS; + } else { + doc = target; + } + + return currentOrNextDoc(); + } + + @Override + public long cost() { + // although
constant-time, approximateCardinality is a bit expensive so + // we cache it to avoid performance traps eg. when sorting iterators by + // cost + if (cost < 0) { + cost = approximateCardinality(); + } + assert cost >= 0; + return cost; + } + + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/index/TestReaderClosed.java b/lucene/core/src/test/org/apache/lucene/index/TestReaderClosed.java index bbfa396603f..1e603738b7e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestReaderClosed.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestReaderClosed.java @@ -17,6 +17,8 @@ package org.apache.lucene.index; * limitations under the License. */ +import java.util.concurrent.ExecutionException; + import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; @@ -81,7 +83,14 @@ public class TestReaderClosed extends LuceneTestCase { reader.close(); // close original child reader try { searcher.search(query, 5); - } catch (AlreadyClosedException ace) { + } catch (Exception e) { + AlreadyClosedException ace = null; + for (Throwable t = e; t != null; t = t.getCause()) { + if (t instanceof AlreadyClosedException) { + ace = (AlreadyClosedException) t; + } + } + assertNotNull("Query failed, but not due to an AlreadyClosedException", ace); assertEquals( "this IndexReader cannot be used anymore as one of its child readers was closed", ace.getMessage() diff --git a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java index 02c4b3ace5e..253dc9a7992 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java @@ -98,9 +98,6 @@ public class TestAutomatonQuery extends LuceneTestCase { query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); assertEquals(expected, 
automatonQueryNrHits(query)); - - query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); - assertEquals(expected, automatonQueryNrHits(query)); } /** diff --git a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java index 0606ad3a095..6553ea3fc16 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java @@ -110,9 +110,6 @@ public class TestAutomatonQueryUnicode extends LuceneTestCase { query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); assertEquals(expected, automatonQueryNrHits(query)); - - query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); - assertEquals(expected, automatonQueryNrHits(query)); } /** diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java b/lucene/core/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java index 785ac970576..a5a7f14d11b 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java @@ -160,7 +160,7 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter { result[i].score, SCORE_COMP_THRESH); } - result = search.search(csrq("data", "1", "6", T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, 1000).scoreDocs; + result = search.search(csrq("data", "1", "6", T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, 1000).scoreDocs; numHits = result.length; assertEquals("wrong number of results", 6, numHits); for (int i = 0; i < numHits; i++) { @@ -204,7 +204,7 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter { bq = new BooleanQuery(); bq.add(dummyTerm, BooleanClause.Occur.SHOULD); // hits one doc - bq.add(csrq("data", "#", "#", T, T, 
MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), BooleanClause.Occur.SHOULD); // hits no docs + bq.add(csrq("data", "#", "#", T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), BooleanClause.Occur.SHOULD); // hits no docs result = search.search(bq, null, 1000).scoreDocs; numHits = result.length; assertEquals("wrong number of results", 1, numHits); @@ -346,37 +346,37 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter { result = search.search(csrq("id", minIP, maxIP, T, T), null, numDocs).scoreDocs; assertEquals("find all", numDocs, result.length); - result = search.search(csrq("id", minIP, maxIP, T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs; + result = search.search(csrq("id", minIP, maxIP, T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("find all", numDocs, result.length); result = search.search(csrq("id", minIP, maxIP, T, F), null, numDocs).scoreDocs; assertEquals("all but last", numDocs - 1, result.length); - result = search.search(csrq("id", minIP, maxIP, T, F, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs; + result = search.search(csrq("id", minIP, maxIP, T, F, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("all but last", numDocs - 1, result.length); result = search.search(csrq("id", minIP, maxIP, F, T), null, numDocs).scoreDocs; assertEquals("all but first", numDocs - 1, result.length); - result = search.search(csrq("id", minIP, maxIP, F, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs; + result = search.search(csrq("id", minIP, maxIP, F, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("all but first", numDocs - 1, result.length); result = search.search(csrq("id", minIP, maxIP, F, F), null, numDocs).scoreDocs; assertEquals("all but ends", numDocs - 2, result.length); - result = search.search(csrq("id", minIP, 
maxIP, F, F, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs; + result = search.search(csrq("id", minIP, maxIP, F, F, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("all but ends", numDocs - 2, result.length); result = search.search(csrq("id", medIP, maxIP, T, T), null, numDocs).scoreDocs; assertEquals("med and up", 1 + maxId - medId, result.length); - result = search.search(csrq("id", medIP, maxIP, T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs; + result = search.search(csrq("id", medIP, maxIP, T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("med and up", 1 + maxId - medId, result.length); result = search.search(csrq("id", minIP, medIP, T, T), null, numDocs).scoreDocs; assertEquals("up to med", 1 + medId - minId, result.length); - result = search.search(csrq("id", minIP, medIP, T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs; + result = search.search(csrq("id", minIP, medIP, T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("up to med", 1 + medId - minId, result.length); // unbounded id @@ -404,49 +404,49 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter { result = search.search(csrq("id", minIP, minIP, F, F), null, numDocs).scoreDocs; assertEquals("min,min,F,F", 0, result.length); - result = search.search(csrq("id", minIP, minIP, F, F, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs; + result = search.search(csrq("id", minIP, minIP, F, F, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("min,min,F,F", 0, result.length); result = search.search(csrq("id", medIP, medIP, F, F), null, numDocs).scoreDocs; assertEquals("med,med,F,F", 0, result.length); - result = search.search(csrq("id", medIP, medIP, F, F, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), 
null, numDocs).scoreDocs; + result = search.search(csrq("id", medIP, medIP, F, F, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("med,med,F,F", 0, result.length); result = search.search(csrq("id", maxIP, maxIP, F, F), null, numDocs).scoreDocs; assertEquals("max,max,F,F", 0, result.length); - result = search.search(csrq("id", maxIP, maxIP, F, F, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs; + result = search.search(csrq("id", maxIP, maxIP, F, F, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("max,max,F,F", 0, result.length); result = search.search(csrq("id", minIP, minIP, T, T), null, numDocs).scoreDocs; assertEquals("min,min,T,T", 1, result.length); - result = search.search(csrq("id", minIP, minIP, T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs; + result = search.search(csrq("id", minIP, minIP, T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("min,min,T,T", 1, result.length); result = search.search(csrq("id", null, minIP, F, T), null, numDocs).scoreDocs; assertEquals("nul,min,F,T", 1, result.length); - result = search.search(csrq("id", null, minIP, F, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs; + result = search.search(csrq("id", null, minIP, F, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("nul,min,F,T", 1, result.length); result = search.search(csrq("id", maxIP, maxIP, T, T), null, numDocs).scoreDocs; assertEquals("max,max,T,T", 1, result.length); - result = search.search(csrq("id", maxIP, maxIP, T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs; + result = search.search(csrq("id", maxIP, maxIP, T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("max,max,T,T", 1, result.length); result = search.search(csrq("id", maxIP, 
null, T, F), null, numDocs).scoreDocs; assertEquals("max,nul,T,T", 1, result.length); - result = search.search(csrq("id", maxIP, null, T, F, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs; + result = search.search(csrq("id", maxIP, null, T, F, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("max,nul,T,T", 1, result.length); result = search.search(csrq("id", medIP, medIP, T, T), null, numDocs).scoreDocs; assertEquals("med,med,T,T", 1, result.length); - result = search.search(csrq("id", medIP, medIP, T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs; + result = search.search(csrq("id", medIP, medIP, T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs; assertEquals("med,med,T,T", 1, result.length); } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java b/lucene/core/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java index dc2754e5f27..62c8ff0cc36 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java @@ -138,12 +138,6 @@ public class TestMultiTermQueryRewrites extends LuceneTestCase { // use a large PQ here to only test duplicate terms and dont mix up when all scores are equal checkDuplicateTerms(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(1024)); checkDuplicateTerms(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(1024)); - - // Test auto rewrite (but only boolean mode), so we set the limits to large values to always get a BQ - final MultiTermQuery.ConstantScoreAutoRewrite rewrite = new MultiTermQuery.ConstantScoreAutoRewrite(); - rewrite.setTermCountCutoff(Integer.MAX_VALUE); - rewrite.setDocCountPercent(100.); - checkDuplicateTerms(rewrite); } private void checkBooleanQueryBoosts(BooleanQuery bq) { @@ -235,7 +229,6 @@ public class 
TestMultiTermQueryRewrites extends LuceneTestCase { checkMaxClauseLimitation(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); checkNoMaxClauseLimitation(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); - checkNoMaxClauseLimitation(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); checkNoMaxClauseLimitation(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(1024)); checkNoMaxClauseLimitation(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(1024)); } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java b/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java index ac60c880c67..dbbaad634b0 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java @@ -87,12 +87,6 @@ public class TestWildcard assertTrue(q instanceof ConstantScoreQuery); assertEquals(q.getBoost(), wq.getBoost(), 0.1); - wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); - wq.setBoost(0.3F); - q = searcher.rewrite(wq); - assertTrue(q instanceof ConstantScoreQuery); - assertEquals(q.getBoost(), wq.getBoost(), 0.1); - wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); wq.setBoost(0.4F); q = searcher.rewrite(wq); diff --git a/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java b/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java new file mode 100644 index 00000000000..d180cea9e6c --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java @@ -0,0 +1,92 @@ +package org.apache.lucene.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; + +public class TestDocIdSetBuilder extends LuceneTestCase { + + public void testEmpty() throws IOException { + assertEquals(null, new DocIdSetBuilder(1 + random().nextInt(1000)).build()); + } + + private void assertEquals(DocIdSet d1, DocIdSet d2) throws IOException { + if (d1 == null) { + if (d2 != null) { + assertEquals(DocIdSetIterator.NO_MORE_DOCS, d2.iterator().nextDoc()); + } + } else if (d2 == null) { + assertEquals(DocIdSetIterator.NO_MORE_DOCS, d1.iterator().nextDoc()); + } else { + DocIdSetIterator i1 = d1.iterator(); + DocIdSetIterator i2 = d2.iterator(); + for (int doc = i1.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = i1.nextDoc()) { + assertEquals(doc, i2.nextDoc()); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, i2.nextDoc()); + } + } + + public void testSparse() throws IOException { + final int maxDoc = 1000000 + random().nextInt(1000000); + DocIdSetBuilder builder = new DocIdSetBuilder(maxDoc); + final int numIterators = 1 + random().nextInt(10); + final FixedBitSet ref = new FixedBitSet(maxDoc); + for (int i = 0; i < numIterators; ++i) { + final int baseInc = 200000 + random().nextInt(10000); + WAH8DocIdSet.Builder b = new WAH8DocIdSet.Builder(); + for (int doc = random().nextInt(100); doc < maxDoc; doc += baseInc + 
random().nextInt(10000)) { + b.add(doc); + ref.set(doc); + } + builder.or(b.build().iterator()); + } + DocIdSet result = builder.build(); + assertTrue(result instanceof SparseFixedBitSet); + assertEquals(ref, result); + } + + public void testDense() throws IOException { + final int maxDoc = 1000000 + random().nextInt(1000000); + DocIdSetBuilder builder = new DocIdSetBuilder(maxDoc); + final int numIterators = 1 + random().nextInt(10); + final FixedBitSet ref = new FixedBitSet(maxDoc); + if (random().nextBoolean()) { + // try upgrades + final int doc = random().nextInt(maxDoc); + ref.set(doc); + builder.or(new WAH8DocIdSet.Builder().add(doc).build().iterator()); + } + for (int i = 0; i < numIterators; ++i) { + final int baseInc = 2 + random().nextInt(10000); + WAH8DocIdSet.Builder b = new WAH8DocIdSet.Builder(); + for (int doc = random().nextInt(10000); doc < maxDoc; doc += baseInc + random().nextInt(2000)) { + b.add(doc); + ref.set(doc); + } + builder.or(b.build().iterator()); + } + DocIdSet result = builder.build(); + assertTrue(result instanceof FixedBitSet); + assertEquals(ref, result); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/util/TestSparseFixedBitSet.java b/lucene/core/src/test/org/apache/lucene/util/TestSparseFixedBitSet.java new file mode 100644 index 00000000000..e6d06fa972b --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/TestSparseFixedBitSet.java @@ -0,0 +1,81 @@ +package org.apache.lucene.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collections; +import java.util.List; + +public class TestSparseFixedBitSet extends BaseDocIdSetTestCase { + + @Override + public SparseFixedBitSet copyOf(BitSet bs, int length) throws IOException { + final SparseFixedBitSet set = new SparseFixedBitSet(length); + // SparseFixedBitSet can be sensitive to the order of insertion so + // randomize insertion a bit + List buffer = new ArrayList<>(); + for (int doc = bs.nextSetBit(0); doc != -1; doc = bs.nextSetBit(doc + 1)) { + buffer.add(doc); + if (buffer.size() >= 100000) { + Collections.shuffle(buffer); + for (int i : buffer) { + set.set(i); + } + buffer.clear(); + } + } + Collections.shuffle(buffer); + for (int i : buffer) { + set.set(i); + } + return set; + } + + @Override + public void assertEquals(int numBits, BitSet ds1, SparseFixedBitSet ds2) throws IOException { + for (int i = 0; i < numBits; ++i) { + assertEquals(ds1.get(i), ds2.get(i)); + } + assertEquals(ds1.cardinality(), ds2.cardinality()); + super.assertEquals(numBits, ds1, ds2); + } + + public void testApproximateCardinality() { + final SparseFixedBitSet set = new SparseFixedBitSet(10000); + final int first = random().nextInt(1000); + final int interval = 200 + random().nextInt(1000); + for (int i = first; i < set.length(); i += interval) { + set.set(i); + } + assertEquals(set.cardinality(), set.approximateCardinality(), 20); + } + + public void testApproximateCardinalityOnDenseSet() { + // this tests that things work as expected in 
approximateCardinality when + // all longs are different than 0, in which case we divide by zero + final int numDocs = 70;//TestUtil.nextInt(random(), 1, 10000); + final SparseFixedBitSet set = new SparseFixedBitSet(numDocs); + for (int i = 0; i < set.length(); ++i) { + set.set(i); + } + assertEquals(numDocs, set.approximateCardinality()); + } + +} diff --git a/lucene/queries/src/java/org/apache/lucene/queries/TermsFilter.java b/lucene/queries/src/java/org/apache/lucene/queries/TermsFilter.java index 615a1de86b3..9a4208668b2 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/TermsFilter.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/TermsFilter.java @@ -17,15 +17,6 @@ package org.apache.lucene.queries; * limitations under the License. */ -import org.apache.lucene.index.*; -import org.apache.lucene.search.DocIdSet; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Filter; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.FixedBitSet; - import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -33,6 +24,20 @@ import java.util.Collections; import java.util.Iterator; import java.util.List; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.Filter; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.DocIdSetBuilder; + /** * Constructs a filter for docs matching any of the terms added to this class. 
* Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in @@ -178,11 +183,11 @@ public final class TermsFilter extends Filter { @Override public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) throws IOException { final LeafReader reader = context.reader(); - FixedBitSet result = null; // lazy init if needed - no need to create a big bitset ahead of time + DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc()); final Fields fields = reader.fields(); final BytesRef spare = new BytesRef(this.termsBytes); if (fields == null) { - return result; + return builder.build(); } Terms terms = null; TermsEnum termsEnum = null; @@ -195,21 +200,12 @@ public final class TermsFilter extends Filter { spare.length = offsets[i+1] - offsets[i]; if (termsEnum.seekExact(spare)) { docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE); // no freq since we don't need them - if (result == null) { - if (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - result = new FixedBitSet(reader.maxDoc()); - // lazy init but don't do it in the hot loop since we could read many docs - result.set(docs.docID()); - } - } - while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - result.set(docs.docID()); - } + builder.or(docs); } } } } - return result; + return builder.build(); } @Override diff --git a/lucene/queries/src/test/org/apache/lucene/queries/TermsFilterTest.java b/lucene/queries/src/test/org/apache/lucene/queries/TermsFilterTest.java index 8d1be0c5495..553cda26570 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/TermsFilterTest.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/TermsFilterTest.java @@ -28,19 +28,19 @@ import java.util.Set; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; import 
org.apache.lucene.index.MultiReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.SlowCompositeReaderWrapper; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.Filter; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java index d84bbc7a886..5116c6ce00d 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java @@ -61,7 +61,7 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer Operator operator = OR_OPERATOR; boolean lowercaseExpandedTerms = true; - MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; + MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE; boolean allowLeadingWildcard = false; protected String field; @@ -274,7 +274,7 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer } /** - * By default QueryParser uses {@link org.apache.lucene.search.MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} + * By default QueryParser uses {@link org.apache.lucene.search.MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} * when creating a {@link PrefixQuery}, {@link WildcardQuery} or {@link TermRangeQuery}. 
This implementation is generally preferable because it * a) Runs faster b) Does not have the scarcity of terms unduly influence score * c) avoids any {@link TooManyClauses} exception. diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/CommonQueryParserConfiguration.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/CommonQueryParserConfiguration.java index 7c305f38bcb..3902349c83b 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/CommonQueryParserConfiguration.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/CommonQueryParserConfiguration.java @@ -73,7 +73,7 @@ public interface CommonQueryParserConfiguration { /** * By default, it uses - * {@link MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} when creating a + * {@link MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} when creating a * prefix, wildcard and range queries. This implementation is generally * preferable because it a) Runs faster b) Does not have the scarcity of terms * unduly influence score c) avoids any {@link TooManyListenersException} diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/StandardQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/StandardQueryParser.java index 2abb6a7291e..a2b3b541ec3 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/StandardQueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/StandardQueryParser.java @@ -265,7 +265,7 @@ public class StandardQueryParser extends QueryParserHelper implements CommonQuer /** * By default, it uses - * {@link MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} when creating a + * {@link MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} when creating a * prefix, wildcard and range queries. 
This implementation is generally * preferable because it a) Runs faster b) Does not have the scarcity of terms * unduly influence score c) avoids any {@link TooManyListenersException} diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/config/StandardQueryConfigHandler.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/config/StandardQueryConfigHandler.java index a7fd34a4ef1..81ef7c13e0d 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/config/StandardQueryConfigHandler.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/config/StandardQueryConfigHandler.java @@ -208,7 +208,7 @@ public class StandardQueryConfigHandler extends QueryConfigHandler { set(ConfigurationKeys.FIELD_BOOST_MAP, new LinkedHashMap()); set(ConfigurationKeys.FUZZY_CONFIG, new FuzzyConfig()); set(ConfigurationKeys.LOCALE, Locale.getDefault()); - set(ConfigurationKeys.MULTI_TERM_REWRITE_METHOD, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); + set(ConfigurationKeys.MULTI_TERM_REWRITE_METHOD, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); set(ConfigurationKeys.FIELD_DATE_RESOLUTION_MAP, new HashMap()); } diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/processors/MultiTermRewriteMethodProcessor.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/processors/MultiTermRewriteMethodProcessor.java index f14b3bb5125..5894c6a75ba 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/processors/MultiTermRewriteMethodProcessor.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/processors/MultiTermRewriteMethodProcessor.java @@ -30,7 +30,7 @@ import org.apache.lucene.search.MultiTermQuery; /** * This processor instates the default * {@link org.apache.lucene.search.MultiTermQuery.RewriteMethod}, - * {@link 
MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}, for multi-term + * {@link MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE}, for multi-term * query nodes. */ public class MultiTermRewriteMethodProcessor extends QueryNodeProcessorImpl { diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java index 70bafffb949..0c89d2b0a1f 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java @@ -321,15 +321,15 @@ public class TestQPHelper extends LuceneTestCase { StandardQueryParser qp = new StandardQueryParser(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)); Query q = qp.parse("foo*bar", "field"); assertTrue(q instanceof WildcardQuery); - assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT, ((MultiTermQuery) q).getRewriteMethod()); + assertEquals(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE, ((MultiTermQuery) q).getRewriteMethod()); q = qp.parse("foo*", "field"); assertTrue(q instanceof PrefixQuery); - assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT, ((MultiTermQuery) q).getRewriteMethod()); + assertEquals(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE, ((MultiTermQuery) q).getRewriteMethod()); q = qp.parse("[a TO z]", "field"); assertTrue(q instanceof TermRangeQuery); - assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT, ((MultiTermQuery) q).getRewriteMethod()); + assertEquals(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE, ((MultiTermQuery) q).getRewriteMethod()); } public void testCJK() throws Exception { @@ -659,7 +659,7 @@ public class TestQPHelper extends LuceneTestCase { public void testRange() throws Exception { assertQueryEquals("[ a TO z]", null, "[a TO z]"); - assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT, 
((TermRangeQuery)getQuery("[ a TO z]", null)).getRewriteMethod()); + assertEquals(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE, ((TermRangeQuery)getQuery("[ a TO z]", null)).getRewriteMethod()); StandardQueryParser qp = new StandardQueryParser(); @@ -1152,7 +1152,7 @@ public class TestQPHelper extends LuceneTestCase { assertTrue(qp.parse("/[A-Z][123]/^0.5", df) instanceof RegexpQuery); assertEquals(q, qp.parse("/[A-Z][123]/^0.5", df)); assertEquals(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE, ((RegexpQuery)qp.parse("/[A-Z][123]/^0.5", df)).getRewriteMethod()); - qp.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); + qp.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); Query escaped = new RegexpQuery(new Term("field", "[a-z]\\/[123]")); assertEquals(escaped, qp.parse("/[a-z]\\/[123]/", df)); diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java index b0c4e5f2e12..1181942846c 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java @@ -571,7 +571,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase { assertQueryEquals("[ a TO z}", null, "[a TO z}"); assertQueryEquals("{ a TO z]", null, "{a TO z]"); - assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT, ((TermRangeQuery)getQuery("[ a TO z]")).getRewriteMethod()); + assertEquals(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE, ((TermRangeQuery)getQuery("[ a TO z]")).getRewriteMethod()); CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); @@ -987,7 +987,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase { assertTrue(getQuery("/[A-Z][123]/^0.5",qp) instanceof RegexpQuery); 
assertEquals(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE, ((RegexpQuery)getQuery("/[A-Z][123]/^0.5",qp)).getRewriteMethod()); assertEquals(q, getQuery("/[A-Z][123]/^0.5",qp)); - qp.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); + qp.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); Query escaped = new RegexpQuery(new Term("field", "[a-z]\\/[123]")); assertEquals(escaped, getQuery("/[a-z]\\/[123]/",qp)); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/DuplicateFilter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/DuplicateFilter.java index 07c8c3d17ba..d0871e5a3e5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/DuplicateFilter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/DuplicateFilter.java @@ -16,15 +16,21 @@ package org.apache.lucene.sandbox.queries; * limitations under the License. */ -import org.apache.lucene.index.*; +import java.io.IOException; + +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SlowCompositeReaderWrapper; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Filter; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; - -import java.io.IOException; +import org.apache.lucene.util.SparseFixedBitSet; /** * Filter to remove duplicate values from search results. 
@@ -87,8 +93,8 @@ public class DuplicateFilter extends Filter { } } - private FixedBitSet correctBits(LeafReader reader, Bits acceptDocs) throws IOException { - FixedBitSet bits = new FixedBitSet(reader.maxDoc()); //assume all are INvalid + private SparseFixedBitSet correctBits(LeafReader reader, Bits acceptDocs) throws IOException { + SparseFixedBitSet bits = new SparseFixedBitSet(reader.maxDoc()); //assume all are INvalid Terms terms = reader.fields().terms(fieldName); if (terms == null) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowCollatedTermRangeQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowCollatedTermRangeQuery.java index 407d38065f1..6d65bd0d894 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowCollatedTermRangeQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowCollatedTermRangeQuery.java @@ -36,7 +36,7 @@ import org.apache.lucene.util.ToStringUtils; * for numerical ranges; use {@link NumericRangeQuery} instead. * *

This query uses the {@link - * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} + * MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} * rewrite method. * @deprecated Index collation keys with CollationKeyAnalyzer or ICUCollationKeyAnalyzer instead. * This class will be removed in Lucene 5.0 diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/BaseDocIdSetTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/util/BaseDocIdSetTestCase.java index 404b518fb26..02f881f1021 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/BaseDocIdSetTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/BaseDocIdSetTestCase.java @@ -57,7 +57,7 @@ public abstract class BaseDocIdSetTestCase extends LuceneTes /** Test length=0. */ public void testNoBit() throws IOException { final BitSet bs = new BitSet(1); - final T copy = copyOf(bs, 0); + final T copy = copyOf(bs, TestUtil.nextInt(random(), 1, 10000)); assertEquals(0, bs, copy); } @@ -67,7 +67,7 @@ public abstract class BaseDocIdSetTestCase extends LuceneTes if (random().nextBoolean()) { bs.set(0); } - final T copy = copyOf(bs, 1); + final T copy = copyOf(bs, TestUtil.nextInt(random(), 1, 10000)); assertEquals(1, bs, copy); } @@ -80,7 +80,7 @@ public abstract class BaseDocIdSetTestCase extends LuceneTes if (random().nextBoolean()) { bs.set(1); } - final T copy = copyOf(bs, 2); + final T copy = copyOf(bs, TestUtil.nextInt(random(), 1, 10000)); assertEquals(2, bs, copy); } diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index 1ef4d0acd9c..964ce0bd134 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -78,7 +78,7 @@ public abstract class SolrQueryParserBase extends QueryBuilder { /** The default operator that parser uses to combine query terms */ Operator operator = OR_OPERATOR; 
- MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; + MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE; boolean allowLeadingWildcard = true; String defaultField; @@ -294,7 +294,7 @@ public abstract class SolrQueryParserBase extends QueryBuilder { /** - * By default QueryParser uses {@link org.apache.lucene.search.MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} + * By default QueryParser uses {@link org.apache.lucene.search.MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} * when creating a PrefixQuery, WildcardQuery or RangeQuery. This implementation is generally preferable because it * a) Runs faster b) Does not have the scarcity of terms unduly influence score * c) avoids any "TooManyBooleanClauses" exception. diff --git a/solr/core/src/java/org/apache/solr/schema/FieldType.java b/solr/core/src/java/org/apache/solr/schema/FieldType.java index 8fa39cf377a..13fb240e341 100644 --- a/solr/core/src/java/org/apache/solr/schema/FieldType.java +++ b/solr/core/src/java/org/apache/solr/schema/FieldType.java @@ -746,7 +746,7 @@ public abstract class FieldType extends FieldProperties { if (!field.indexed() && field.hasDocValues()) { return field.multiValued() ? 
new DocTermOrdsRewriteMethod() : new DocValuesRewriteMethod(); } else { - return MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; + return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE; } } diff --git a/solr/core/src/test/org/apache/solr/update/AddBlockUpdateTest.java b/solr/core/src/test/org/apache/solr/update/AddBlockUpdateTest.java index e9aae01b2f6..cf983b5501b 100644 --- a/solr/core/src/test/org/apache/solr/update/AddBlockUpdateTest.java +++ b/solr/core/src/test/org/apache/solr/update/AddBlockUpdateTest.java @@ -5,6 +5,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeFilter; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.join.FixedBitSetCachingWrapperFilter; import org.apache.lucene.search.join.ScoreMode; import org.apache.lucene.search.join.ToParentBlockJoinQuery; import org.apache.solr.SolrTestCaseJ4; @@ -34,6 +35,7 @@ import javax.xml.stream.XMLStreamReader; + import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; @@ -566,8 +568,8 @@ public class AddBlockUpdateTest extends SolrTestCaseJ4 { protected ToParentBlockJoinQuery join(final String childTerm) { return new ToParentBlockJoinQuery( - new TermQuery(new Term(child, childTerm)), new TermRangeFilter(parent, - null, null, false, false), ScoreMode.None); + new TermQuery(new Term(child, childTerm)), new FixedBitSetCachingWrapperFilter(new TermRangeFilter(parent, + null, null, false, false)), ScoreMode.None); } private Collection> callables(List blocks) {