LUCENE-5938: Add a new sparse fixed bit set and remove ConstantScoreAutoRewrite.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1628402 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2014-09-30 11:52:23 +00:00
parent 672fc08ec3
commit 8f96faf3d3
34 changed files with 718 additions and 374 deletions

View File

@ -136,6 +136,9 @@ API Changes
* LUCENE-5569: *AtomicReader/AtomicReaderContext have been renamed to *LeafReader/LeafReaderContext.
(Ryan Ernst)
* LUCENE-5938: Removed MultiTermQuery.ConstantScoreAutoRewrite as
MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE is usually better. (Adrien Grand)
Bug Fixes
* LUCENE-5650: Enforce read-only access to any path outside the temporary
@ -178,6 +181,10 @@ Optimizations
* LUCENE-5963: Reduce memory allocations in
AnalyzingSuggester. (Markus Heiden via Mike McCandless)
* LUCENE-5938: MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE is now faster on
queries that match few documents by using a sparse bit set implementation.
(Adrien Grand)
Build
* LUCENE-5909: Smoke tester now has better command line parsing and

View File

@ -1,220 +0,0 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
// Defaults derived from rough tests with a 20.0 million
// doc Wikipedia index. With more than 350 terms in the
// query, the filter method is fastest:
public static int DEFAULT_TERM_COUNT_CUTOFF = 350;
// If the query will hit more than 1 in 1000 of the docs
// in the index (0.1%), the filter method is fastest:
public static double DEFAULT_DOC_COUNT_PERCENT = 0.1;
private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF;
private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT;
/** If the number of terms in this query is equal to or
* larger than this setting then {@link
* MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} is used. */
public void setTermCountCutoff(int count) {
termCountCutoff = count;
}
/** @see #setTermCountCutoff */
public int getTermCountCutoff() {
return termCountCutoff;
}
/** If the number of documents to be visited in the
* postings exceeds this specified percentage of the
* maxDoc() for the index, then {@link
* MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} is used.
* @param percent 0.0 to 100.0 */
public void setDocCountPercent(double percent) {
docCountPercent = percent;
}
/** @see #setDocCountPercent */
public double getDocCountPercent() {
return docCountPercent;
}
@Override
protected BooleanQuery getTopLevelQuery() {
return new BooleanQuery(true);
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, TermContext states) {
topLevel.add(new TermQuery(term, states), BooleanClause.Occur.SHOULD);
}
@Override
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
// Get the enum and start visiting terms. If we
// exhaust the enum before hitting either of the
// cutoffs, we use ConstantBooleanQueryRewrite; else,
// ConstantFilterRewrite:
final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
collectTerms(reader, query, col);
final int size = col.pendingTerms.size();
if (col.hasCutOff) {
return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
} else {
final BooleanQuery bq = getTopLevelQuery();
if (size > 0) {
final BytesRefHash pendingTerms = col.pendingTerms;
final int sort[] = pendingTerms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
for(int i = 0; i < size; i++) {
final int pos = sort[i];
// docFreq is not used for constant score here, we pass 1
// to explicitly set a fake value, so it's not calculated
addClause(bq, new Term(query.field, pendingTerms.get(pos, new BytesRef())), 1, 1.0f, col.array.termState[pos]);
}
}
// Strip scores
final Query result = new ConstantScoreQuery(bq);
result.setBoost(query.getBoost());
return result;
}
}
static final class CutOffTermCollector extends TermCollector {
CutOffTermCollector(int docCountCutoff, int termCountLimit) {
this.docCountCutoff = docCountCutoff;
this.termCountLimit = termCountLimit;
}
@Override
public void setNextEnum(TermsEnum termsEnum) {
this.termsEnum = termsEnum;
}
@Override
public boolean collect(BytesRef bytes) throws IOException {
int pos = pendingTerms.add(bytes);
docVisitCount += termsEnum.docFreq();
if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
hasCutOff = true;
return false;
}
final TermState termState = termsEnum.termState();
assert termState != null;
if (pos < 0) {
pos = (-pos)-1;
array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
} else {
array.termState[pos] = new TermContext(topReaderContext, termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
}
return true;
}
int docVisitCount = 0;
boolean hasCutOff = false;
TermsEnum termsEnum;
final int docCountCutoff, termCountLimit;
final TermStateByteStart array = new TermStateByteStart(16);
final BytesRefHash pendingTerms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array);
}
@Override
public int hashCode() {
final int prime = 1279;
return (int) (prime * termCountCutoff + Double.doubleToLongBits(docCountPercent));
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj;
if (other.termCountCutoff != termCountCutoff) {
return false;
}
if (Double.doubleToLongBits(other.docCountPercent) != Double.doubleToLongBits(docCountPercent)) {
return false;
}
return true;
}
/** Special implementation of BytesStartArray that keeps parallel arrays for {@link TermContext} */
static final class TermStateByteStart extends DirectBytesStartArray {
TermContext[] termState;
public TermStateByteStart(int initSize) {
super(initSize);
}
@Override
public int[] init() {
final int[] ord = super.init();
termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert termState.length >= ord.length;
return ord;
}
@Override
public int[] grow() {
final int[] ord = super.grow();
if (termState.length < ord.length) {
TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
termState = tmpTermState;
}
assert termState.length >= ord.length;
return ord;
}
@Override
public int[] clear() {
termState = null;
return super.clear();
}
}
}

View File

@ -49,8 +49,8 @@ import org.apache.lucene.util.AttributeSource;
* prevents this.
*
* <p>The recommended rewrite method is {@link
* #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}: it doesn't spend CPU
* computing unhelpful scores, and it tries to pick the most
* #CONSTANT_SCORE_FILTER_REWRITE}: it doesn't spend CPU
* computing unhelpful scores, and is the most
* performant rewrite method given the query. If you
* need scoring (like {@link FuzzyQuery}), use
* {@link TopTermsScoringBooleanQueryRewrite} which uses
@ -58,12 +58,12 @@ import org.apache.lucene.util.AttributeSource;
* and not hit this limitation.
*
* Note that org.apache.lucene.queryparser.classic.QueryParser produces
* MultiTermQueries using {@link
* #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} by default.
* MultiTermQueries using {@link #CONSTANT_SCORE_FILTER_REWRITE}
* by default.
*/
public abstract class MultiTermQuery extends Query {
protected final String field;
protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
protected RewriteMethod rewriteMethod = CONSTANT_SCORE_FILTER_REWRITE;
/** Abstract class that defines how the query is rewritten. */
public static abstract class RewriteMethod {
@ -104,7 +104,7 @@ public abstract class MultiTermQuery extends Query {
* query. Note that typically such scores are
* meaningless to the user, and require non-trivial CPU
* to compute, so it's almost always better to use {@link
* #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead.
* #CONSTANT_SCORE_FILTER_REWRITE} instead.
*
* <p><b>NOTE</b>: This rewrite method will hit {@link
* BooleanQuery.TooManyClauses} if the number of terms
@ -208,40 +208,6 @@ public abstract class MultiTermQuery extends Query {
topLevel.add(q, BooleanClause.Occur.SHOULD);
}
}
/** A rewrite method that tries to pick the best
* constant-score rewrite method based on term and
* document counts from the query. If both the number of
* terms and documents is small enough, then {@link
* #CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE} is used.
* Otherwise, {@link #CONSTANT_SCORE_FILTER_REWRITE} is
* used.
*/
public static class ConstantScoreAutoRewrite extends org.apache.lucene.search.ConstantScoreAutoRewrite {}
/** Read-only default instance of {@link
* ConstantScoreAutoRewrite}, with {@link
* ConstantScoreAutoRewrite#setTermCountCutoff} set to
* {@link
* ConstantScoreAutoRewrite#DEFAULT_TERM_COUNT_CUTOFF}
* and {@link
* ConstantScoreAutoRewrite#setDocCountPercent} set to
* {@link
* ConstantScoreAutoRewrite#DEFAULT_DOC_COUNT_PERCENT}.
* Note that you cannot alter the configuration of this
* instance; you'll need to create a private instance
* instead. */
public final static RewriteMethod CONSTANT_SCORE_AUTO_REWRITE_DEFAULT = new ConstantScoreAutoRewrite() {
@Override
public void setTermCountCutoff(int count) {
throw new UnsupportedOperationException("Please create a private instance");
}
@Override
public void setDocCountPercent(double percent) {
throw new UnsupportedOperationException("Please create a private instance");
}
};
/**
* Constructs a query matching terms that cannot be represented with a single

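With ConstantScoreAutoRewrite removed, the filter rewrite is simply the default, so callers only touch setRewriteMethod to opt out of it. A minimal usage sketch, not taken from this commit (the field, prefix, and searcher are illustrative):

  PrefixQuery query = new PrefixQuery(new Term("body", "luc"));
  // redundant with the new default; shown only to make the choice explicit
  query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
  TopDocs hits = searcher.search(query, 10); // all hits share a constant score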
View File

@ -19,14 +19,14 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.DocIdSetBuilder;
/**
* A wrapper for {@link MultiTermQuery}, that exposes its
@ -43,7 +43,7 @@ import org.apache.lucene.util.Bits;
* this is why it is not abstract.
*/
public class MultiTermQueryWrapperFilter<Q extends MultiTermQuery> extends Filter {
protected final Q query;
/**
@ -52,7 +52,7 @@ public class MultiTermQueryWrapperFilter<Q extends MultiTermQuery> extends Filte
protected MultiTermQueryWrapperFilter(Q query) {
this.query = query;
}
@Override
public String toString() {
// query.toString should be ok for the filter, too, if the query boost is 1.0f
@ -77,7 +77,7 @@ public class MultiTermQueryWrapperFilter<Q extends MultiTermQuery> extends Filte
/** Returns the field name for this query */
public final String getField() { return query.getField(); }
/**
* Returns a DocIdSet with documents that should be permitted in search
* results.
@ -99,24 +99,13 @@ public class MultiTermQueryWrapperFilter<Q extends MultiTermQuery> extends Filte
final TermsEnum termsEnum = query.getTermsEnum(terms);
assert termsEnum != null;
if (termsEnum.next() != null) {
// fill into a FixedBitSet
final FixedBitSet bitSet = new FixedBitSet(context.reader().maxDoc());
DocsEnum docsEnum = null;
do {
// System.out.println(" iter termCount=" + termCount + " term=" +
// enumerator.term().toBytesString());
docsEnum = termsEnum.docs(acceptDocs, docsEnum, DocsEnum.FLAG_NONE);
int docid;
while ((docid = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
bitSet.set(docid);
}
} while (termsEnum.next() != null);
// System.out.println(" done termCount=" + termCount);
return bitSet;
} else {
return null;
DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc());
DocsEnum docs = null;
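// union the postings of every term matched by the MultiTermQuery; the
// builder decides internally between a sparse and a dense representation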
while (termsEnum.next() != null) {
docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE);
builder.or(docs);
}
return builder.build();
}
}

View File

@ -72,7 +72,7 @@ import org.apache.lucene.index.Term; // for javadocs
* details.
*
* <p>This query defaults to {@linkplain
* MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}.
* MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE}.
* With precision steps of &le;4, this query can be run with
* one of the BooleanQuery rewrite methods without changing
* BooleanQuery's default max clause count.

View File

@ -29,7 +29,7 @@ import org.apache.lucene.util.ToStringUtils;
* is built by QueryParser for input like <code>app*</code>.
*
* <p>This query uses the {@link
* MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
* MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE}
* rewrite method. */
public class PrefixQuery extends MultiTermQuery {
private Term prefix;

View File

@ -45,7 +45,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
* query. Note that typically such scores are
* meaningless to the user, and require non-trivial CPU
* to compute, so it's almost always better to use {@link
* MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead.
* MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} instead.
*
* <p><b>NOTE</b>: This rewrite method will hit {@link
* BooleanQuery.TooManyClauses} if the number of terms

View File

@ -35,7 +35,7 @@ import org.apache.lucene.util.ToStringUtils;
* for numerical ranges; use {@link NumericRangeQuery} instead.
*
* <p>This query uses the {@link
* MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
* MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE}
* rewrite method.
* @since 2.9
*/

View File

@ -35,7 +35,7 @@ import org.apache.lucene.util.automaton.Automaton;
* a Wildcard term should not start with the wildcard <code>*</code>
*
* <p>This query uses the {@link
* MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
* MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE}
* rewrite method.
*
* @see AutomatonQuery

View File

@ -0,0 +1,94 @@
package org.apache.lucene.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
/**
* A builder of {@link DocIdSet}s that supports random access.
* @lucene.internal
*/
public final class DocIdSetBuilder {
private final int maxDoc;
private final int threshold;
private SparseFixedBitSet sparseSet;
private FixedBitSet denseSet;
// we cache an upper bound of the cost of this builder so that we don't have
// to re-compute approximateCardinality on the sparse set every time
private long costUpperBound;
/** Sole constructor. */
public DocIdSetBuilder(int maxDoc) {
this.maxDoc = maxDoc;
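// heuristic: keep the sparse representation while the cardinality upper
// bound stays below maxDoc / 16384, then upgrade to a dense FixedBitSet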
threshold = maxDoc >>> 14;
}
/**
* Add the content of the provided {@link DocIdSetIterator} to this builder.
*/
public void or(DocIdSetIterator it) throws IOException {
if (denseSet != null) {
// already upgraded
denseSet.or(it);
return;
}
final long itCost = it.cost();
costUpperBound += itCost;
if (costUpperBound >= threshold) {
costUpperBound = (sparseSet == null ? 0 : sparseSet.approximateCardinality()) + itCost;
if (costUpperBound >= threshold) {
// upgrade
denseSet = new FixedBitSet(maxDoc);
denseSet.or(it);
if (sparseSet != null) {
denseSet.or(sparseSet.iterator());
}
return;
}
}
// we are still sparse
if (sparseSet == null) {
sparseSet = new SparseFixedBitSet(maxDoc);
}
sparseSet.or(it);
}
/**
* Build a {@link DocIdSet} that contains all doc ids that have been added.
* This method may return <tt>null</tt> if no documents were added to this
* builder.
* NOTE: this is a destructive operation; the builder should not be used
* anymore after this method has been called.
*/
public DocIdSet build() {
final DocIdSet result = denseSet != null ? denseSet : sparseSet;
denseSet = null;
sparseSet = null;
costUpperBound = 0;
return result;
}
}
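A short consumption sketch for the builder above, mirroring how MultiTermQueryWrapperFilter uses it elsewhere in this commit (termsEnum, acceptDocs, and reader are assumed to be in scope):

  DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
  DocsEnum docs = null;
  while (termsEnum.next() != null) {
    // FLAG_NONE: only doc ids are needed, not frequencies
    docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE);
    builder.or(docs);
  }
  DocIdSet set = builder.build(); // null when nothing was added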

View File

@ -0,0 +1,341 @@
package org.apache.lucene.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
/**
* A bit set that only stores longs that have at least one bit which is set.
* The way it works is that the space of bits is divided into blocks of
* 4096 bits, which is 64 longs. Then for each block, we have:<ul>
* <li>a long[] which stores the non-zero longs for that block</li>
* <li>a long so that bit <tt>i</tt> being set means that the <code>i-th</code>
* long of the block is non-zero, and its offset in the array of longs is
* the number of one bits on the right of the <code>i-th</code> bit.</li></ul>
*
* @lucene.internal
*/
public class SparseFixedBitSet extends DocIdSet implements Bits {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(SparseFixedBitSet.class);
private static final long SINGLE_ELEMENT_ARRAY_BYTES_USED = RamUsageEstimator.sizeOf(new long[1]);
private static int blockCount(int length) {
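// number of 4096-bit blocks needed to cover <code>length</code> bits
// (a ceiling division of length by 4096)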
int blockCount = length >>> 12;
if ((blockCount << 12) < length) {
++blockCount;
}
assert (blockCount << 12) >= length;
return blockCount;
}
final long[] indices;
final long[][] bits;
final int length;
int nonZeroLongCount;
long ramBytesUsed;
/** Create a {@link SparseFixedBitSet} that can contain bits between
* <code>0</code> included and <code>length</code> excluded. */
public SparseFixedBitSet(int length) {
if (length < 1) {
throw new IllegalArgumentException("length needs to be >= 1");
}
this.length = length;
final int blockCount = blockCount(length);
indices = new long[blockCount];
bits = new long[blockCount][];
ramBytesUsed = BASE_RAM_BYTES_USED
+ RamUsageEstimator.shallowSizeOf(indices)
+ RamUsageEstimator.shallowSizeOf(bits);
}
@Override
public boolean isCacheable() {
return true;
}
@Override
public int length() {
return length;
}
@Override
public Bits bits() throws IOException {
return this;
}
private boolean consistent(int index) {
assert index >= 0 && index < length : "index=" + index + ",length=" + length;
return true;
}
/**
* Compute the cardinality of this set.
* NOTE: this operation runs in linear time.
*/
public int cardinality() {
int cardinality = 0;
for (long[] bitArray : bits) {
if (bitArray != null) {
for (long bits : bitArray) {
cardinality += Long.bitCount(bits);
}
}
}
return cardinality;
}
/**
* Return an approximation of the cardinality of this set, assuming that bits
* are uniformly distributed. This operation runs in constant time.
*/
public int approximateCardinality() {
// this is basically the linear counting algorithm
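// estimate = totalLongs * ln(totalLongs / zeroLongs), treating each long
// as one bucket of the linear counting sketch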
final int totalLongs = (length + 63) >>> 6; // total number of longs in the space
assert totalLongs >= nonZeroLongCount;
final int zeroLongs = totalLongs - nonZeroLongCount; // number of longs that are zeros
// No need to guard against division by zero, it will return +Infinity and things will work as expected
final long estimate = Math.round(totalLongs * Math.log((double) totalLongs / zeroLongs));
return (int) Math.min(length, estimate);
}
@Override
public boolean get(int i) {
assert consistent(i);
final int i4096 = i >>> 12;
final long index = indices[i4096];
final int i64 = i >>> 6;
// first check the index, if the i64-th bit is not set, then i is not set
// note: this relies on the fact that shifts are mod 64 in java
if ((index & (1L << i64)) == 0) {
return false;
}
// if it is set, then we count the number of bits that are set on the right
// of i64, and that gives us the index of the long that stores the bits we
// are interested in
final long bits = this.bits[i4096][Long.bitCount(index & ((1L << i64) - 1))];
return (bits & (1L << i)) != 0;
}
private static int oversize(int s) {
int newSize = s + (s >>> 1);
if (newSize > 50) {
newSize = 64;
}
return newSize;
}
/**
* Set the bit at index <tt>i</tt>.
*/
public void set(int i) {
assert consistent(i);
final int i4096 = i >>> 12;
final long index = indices[i4096];
final int i64 = i >>> 6;
if (index == 0) {
// if the index is 0, it means that we just found a block of 4096 bits
// that has no bit that is set yet. So let's initialize a new block:
indices[i4096] = 1L << i64; // shifts are mod 64 in java
assert bits[i4096] == null;
bits[i4096] = new long[] { 1L << i }; // shifts are mod 64 in java
++nonZeroLongCount;
ramBytesUsed += SINGLE_ELEMENT_ARRAY_BYTES_USED;
} else if ((index & (1L << i64)) == 0) {
// in that case we found a block of 4096 bits that has some values, but
// the sub-block of 64 bits that we are interested in has no value yet,
// so we need to insert a new long
insertLong(i4096, i64, i, index);
} else {
// in that case the sub 64-bit block we are interested in already exists,
// we just need to set a bit in an existing long: the number of ones on
// the right of i64 gives us the index of the long we need to update
bits[i4096][Long.bitCount(index & ((1L << i64) - 1))] |= 1L << i; // shifts are mod 64 in java
}
}
private void insertLong(int i4096, int i64, int i, long index) {
indices[i4096] |= 1L << i64; // shifts are mod 64 in java
// we count the number of bits that are set on the right of i64
// this gives us the index at which to perform the insertion
final int o = Long.bitCount(index & ((1L << i64) - 1));
final long[] bitArray = bits[i4096];
if (bitArray[bitArray.length - 1] == 0) {
// since we only store non-zero longs, if the last value is 0, it means
// that we already have extra space, make use of it
System.arraycopy(bitArray, o, bitArray, o + 1, bitArray.length - o - 1);
bitArray[o] = 1L << i;
} else {
// we don't have extra space so we need to resize to insert the new long
final int newSize = oversize(bitArray.length + 1);
final long[] newBitArray = new long[newSize];
System.arraycopy(bitArray, 0, newBitArray, 0, o);
newBitArray[o] = 1L << i;
System.arraycopy(bitArray, o, newBitArray, o + 1, bitArray.length - o);
bits[i4096] = newBitArray;
ramBytesUsed += (newSize - bitArray.length) * RamUsageEstimator.NUM_BYTES_LONG;
}
++nonZeroLongCount;
}
/**
* Add the documents contained in the provided {@link DocIdSetIterator} to
* this bit set.
*/
public void or(DocIdSetIterator it) throws IOException {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
set(doc);
}
}
@Override
public long ramBytesUsed() {
return ramBytesUsed;
}
@Override
public DocIdSetIterator iterator() throws IOException {
return new Iterator();
}
class Iterator extends DocIdSetIterator {
private int doc = -1;
private int cost = -1;
@Override
public int docID() {
return doc;
}
/** Return the first document that occurs on or after the provided block index. */
private int firstDoc(int i4096) {
long index = 0;
while (i4096 < indices.length) {
index = indices[i4096];
if (index != 0) {
final int i64 = Long.numberOfTrailingZeros(index);
return doc = (i4096 << 12) | (i64 << 6) | Long.numberOfTrailingZeros(bits[i4096][0]);
}
i4096 += 1;
}
return doc = NO_MORE_DOCS;
}
@Override
public int nextDoc() throws IOException {
if (doc == NO_MORE_DOCS || ++doc >= length) {
return doc = NO_MORE_DOCS;
}
return currentOrNextDoc();
}
private int currentOrNextDoc() {
final int i4096 = doc >>> 12;
final long index = indices[i4096];
if (index == 0) {
// if the index is zero, it means that there is no value in the
// current block, so return the first document of the next block
return firstDoc(i4096 + 1);
} else {
// now we are on a block that contains at least one document
assert Long.bitCount(index) <= bits[i4096].length;
int i64 = doc >>> 6;
long indexBits = index >>> i64; // shifts are mod 64 in java
if (indexBits == 0) {
// if neither the i64-th bit nor any other bit on its left is set then
// it means that there are no more documents in this block, go to the
// next one
return firstDoc(i4096 + 1);
}
// We know we still have some 64-bit blocks that have bits set, let's
// advance to the next one by skipping trailing zeros of the index
int i1 = doc & 0x3F;
int trailingZeros = Long.numberOfTrailingZeros(indexBits);
if (trailingZeros != 0) {
// no bits in the current long, go to the next one
i64 += trailingZeros;
i1 = 0;
}
// So now we are on a sub 64-bit block that has values
assert (index & (1L << i64)) != 0;
// we count the number of ones on the left of i64 to figure out the
// index of the long that contains the bits we are interested in
int longIndex = Long.bitCount(index & ((1L << i64) - 1)); // shifts are mod 64 in java
final long[] longArray = bits[i4096];
assert longArray[longIndex] != 0;
long bits = SparseFixedBitSet.this.bits[i4096][longIndex] >>> i1; // shifts are mod 64 in java
if (bits != 0L) {
// hurray, we found some non-zero bits, this gives us the next document:
i1 += Long.numberOfTrailingZeros(bits);
return doc = (i4096 << 12) | ((i64 & 0x3F) << 6) | i1;
}
// otherwise it means that although we were on a sub-64 block that contains
// documents, all documents of this sub-block have already been consumed
// so two cases:
indexBits = index >>> i64 >>> 1; // we don't shift by (i64+1) otherwise we might shift by a multiple of 64 which is a no-op
if (indexBits == 0) {
// Case 1: this was the last long of the block of 4096 bits, then go
// to the next block
return firstDoc(i4096 + 1);
}
// Case 2: go to the next sub 64-bit block in the current block of 4096 bits
// by skipping trailing zeros of the index
trailingZeros = Long.numberOfTrailingZeros(indexBits);
i64 += 1 + trailingZeros;
bits = longArray[longIndex + 1];
assert bits != 0;
i1 = Long.numberOfTrailingZeros(bits);
return doc = (i4096 << 12) | ((i64 & 0x3F) << 6) | i1;
}
}
@Override
public int advance(int target) throws IOException {
if (target >= length) {
return doc = NO_MORE_DOCS;
} else {
doc = target;
}
return currentOrNextDoc();
}
@Override
public long cost() {
// although constant-time, approximateCardinality is a bit expensive so
// we cache it to avoid performance traps, e.g. when sorting iterators by
// cost
if (cost < 0) {
cost = approximateCardinality();
}
assert cost >= 0;
return cost;
}
}
}
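A small sketch of the contract implemented above (the indices are illustrative; 70000 falls in a different 4096-bit block than 3):

  SparseFixedBitSet set = new SparseFixedBitSet(1 << 20);
  set.set(3);
  set.set(70000);
  assert set.get(3) && set.get(70000) && !set.get(4);
  DocIdSetIterator it = set.iterator(); // may throw IOException
  for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
    // visits 3, then 70000, in increasing order
  }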

View File

@ -17,6 +17,8 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import java.util.concurrent.ExecutionException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
@ -81,7 +83,14 @@ public class TestReaderClosed extends LuceneTestCase {
reader.close(); // close original child reader
try {
searcher.search(query, 5);
} catch (AlreadyClosedException ace) {
} catch (Exception e) {
AlreadyClosedException ace = null;
for (Throwable t = e; t != null; t = t.getCause()) {
if (t instanceof AlreadyClosedException) {
ace = (AlreadyClosedException) t;
}
}
assertNotNull("Query failed, but not due to an AlreadyClosedException", ace);
assertEquals(
"this IndexReader cannot be used anymore as one of its child readers was closed",
ace.getMessage()

View File

@ -98,9 +98,6 @@ public class TestAutomatonQuery extends LuceneTestCase {
query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
assertEquals(expected, automatonQueryNrHits(query));
query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
assertEquals(expected, automatonQueryNrHits(query));
}
/**

View File

@ -110,9 +110,6 @@ public class TestAutomatonQueryUnicode extends LuceneTestCase {
query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
assertEquals(expected, automatonQueryNrHits(query));
query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
assertEquals(expected, automatonQueryNrHits(query));
}
/**

View File

@ -160,7 +160,7 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter {
result[i].score, SCORE_COMP_THRESH);
}
result = search.search(csrq("data", "1", "6", T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, 1000).scoreDocs;
result = search.search(csrq("data", "1", "6", T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, 1000).scoreDocs;
numHits = result.length;
assertEquals("wrong number of results", 6, numHits);
for (int i = 0; i < numHits; i++) {
@ -204,7 +204,7 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter {
bq = new BooleanQuery();
bq.add(dummyTerm, BooleanClause.Occur.SHOULD); // hits one doc
bq.add(csrq("data", "#", "#", T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), BooleanClause.Occur.SHOULD); // hits no docs
bq.add(csrq("data", "#", "#", T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), BooleanClause.Occur.SHOULD); // hits no docs
result = search.search(bq, null, 1000).scoreDocs;
numHits = result.length;
assertEquals("wrong number of results", 1, numHits);
@ -346,37 +346,37 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter {
result = search.search(csrq("id", minIP, maxIP, T, T), null, numDocs).scoreDocs;
assertEquals("find all", numDocs, result.length);
result = search.search(csrq("id", minIP, maxIP, T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", minIP, maxIP, T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("find all", numDocs, result.length);
result = search.search(csrq("id", minIP, maxIP, T, F), null, numDocs).scoreDocs;
assertEquals("all but last", numDocs - 1, result.length);
result = search.search(csrq("id", minIP, maxIP, T, F, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", minIP, maxIP, T, F, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("all but last", numDocs - 1, result.length);
result = search.search(csrq("id", minIP, maxIP, F, T), null, numDocs).scoreDocs;
assertEquals("all but first", numDocs - 1, result.length);
result = search.search(csrq("id", minIP, maxIP, F, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", minIP, maxIP, F, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("all but first", numDocs - 1, result.length);
result = search.search(csrq("id", minIP, maxIP, F, F), null, numDocs).scoreDocs;
assertEquals("all but ends", numDocs - 2, result.length);
result = search.search(csrq("id", minIP, maxIP, F, F, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", minIP, maxIP, F, F, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("all but ends", numDocs - 2, result.length);
result = search.search(csrq("id", medIP, maxIP, T, T), null, numDocs).scoreDocs;
assertEquals("med and up", 1 + maxId - medId, result.length);
result = search.search(csrq("id", medIP, maxIP, T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", medIP, maxIP, T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("med and up", 1 + maxId - medId, result.length);
result = search.search(csrq("id", minIP, medIP, T, T), null, numDocs).scoreDocs;
assertEquals("up to med", 1 + medId - minId, result.length);
result = search.search(csrq("id", minIP, medIP, T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", minIP, medIP, T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("up to med", 1 + medId - minId, result.length);
// unbounded id
@ -404,49 +404,49 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter {
result = search.search(csrq("id", minIP, minIP, F, F), null, numDocs).scoreDocs;
assertEquals("min,min,F,F", 0, result.length);
result = search.search(csrq("id", minIP, minIP, F, F, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", minIP, minIP, F, F, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("min,min,F,F", 0, result.length);
result = search.search(csrq("id", medIP, medIP, F, F), null, numDocs).scoreDocs;
assertEquals("med,med,F,F", 0, result.length);
result = search.search(csrq("id", medIP, medIP, F, F, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", medIP, medIP, F, F, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("med,med,F,F", 0, result.length);
result = search.search(csrq("id", maxIP, maxIP, F, F), null, numDocs).scoreDocs;
assertEquals("max,max,F,F", 0, result.length);
result = search.search(csrq("id", maxIP, maxIP, F, F, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", maxIP, maxIP, F, F, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("max,max,F,F", 0, result.length);
result = search.search(csrq("id", minIP, minIP, T, T), null, numDocs).scoreDocs;
assertEquals("min,min,T,T", 1, result.length);
result = search.search(csrq("id", minIP, minIP, T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", minIP, minIP, T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("min,min,T,T", 1, result.length);
result = search.search(csrq("id", null, minIP, F, T), null, numDocs).scoreDocs;
assertEquals("nul,min,F,T", 1, result.length);
result = search.search(csrq("id", null, minIP, F, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", null, minIP, F, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("nul,min,F,T", 1, result.length);
result = search.search(csrq("id", maxIP, maxIP, T, T), null, numDocs).scoreDocs;
assertEquals("max,max,T,T", 1, result.length);
result = search.search(csrq("id", maxIP, maxIP, T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", maxIP, maxIP, T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("max,max,T,T", 1, result.length);
result = search.search(csrq("id", maxIP, null, T, F), null, numDocs).scoreDocs;
assertEquals("max,nul,T,T", 1, result.length);
result = search.search(csrq("id", maxIP, null, T, F, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", maxIP, null, T, F, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("max,nul,T,T", 1, result.length);
result = search.search(csrq("id", medIP, medIP, T, T), null, numDocs).scoreDocs;
assertEquals("med,med,T,T", 1, result.length);
result = search.search(csrq("id", medIP, medIP, T, T, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT), null, numDocs).scoreDocs;
result = search.search(csrq("id", medIP, medIP, T, T, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE), null, numDocs).scoreDocs;
assertEquals("med,med,T,T", 1, result.length);
}

View File

@ -138,12 +138,6 @@ public class TestMultiTermQueryRewrites extends LuceneTestCase {
// use a large PQ here to only test duplicate terms and don't mix up when all scores are equal
checkDuplicateTerms(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(1024));
checkDuplicateTerms(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(1024));
// Test auto rewrite (but only boolean mode), so we set the limits to large values to always get a BQ
final MultiTermQuery.ConstantScoreAutoRewrite rewrite = new MultiTermQuery.ConstantScoreAutoRewrite();
rewrite.setTermCountCutoff(Integer.MAX_VALUE);
rewrite.setDocCountPercent(100.);
checkDuplicateTerms(rewrite);
}
private void checkBooleanQueryBoosts(BooleanQuery bq) {
@ -235,7 +229,6 @@ public class TestMultiTermQueryRewrites extends LuceneTestCase {
checkMaxClauseLimitation(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
checkNoMaxClauseLimitation(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
checkNoMaxClauseLimitation(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
checkNoMaxClauseLimitation(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(1024));
checkNoMaxClauseLimitation(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(1024));
}

View File

@ -87,12 +87,6 @@ public class TestWildcard
assertTrue(q instanceof ConstantScoreQuery);
assertEquals(q.getBoost(), wq.getBoost(), 0.1);
wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
wq.setBoost(0.3F);
q = searcher.rewrite(wq);
assertTrue(q instanceof ConstantScoreQuery);
assertEquals(q.getBoost(), wq.getBoost(), 0.1);
wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
wq.setBoost(0.4F);
q = searcher.rewrite(wq);

View File

@ -0,0 +1,92 @@
package org.apache.lucene.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
public class TestDocIdSetBuilder extends LuceneTestCase {
public void testEmpty() throws IOException {
assertEquals(null, new DocIdSetBuilder(1 + random().nextInt(1000)).build());
}
private void assertEquals(DocIdSet d1, DocIdSet d2) throws IOException {
if (d1 == null) {
if (d2 != null) {
assertEquals(DocIdSetIterator.NO_MORE_DOCS, d2.iterator().nextDoc());
}
} else if (d2 == null) {
assertEquals(DocIdSetIterator.NO_MORE_DOCS, d1.iterator().nextDoc());
} else {
DocIdSetIterator i1 = d1.iterator();
DocIdSetIterator i2 = d2.iterator();
for (int doc = i1.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = i1.nextDoc()) {
assertEquals(doc, i2.nextDoc());
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, i2.nextDoc());
}
}
public void testSparse() throws IOException {
final int maxDoc = 1000000 + random().nextInt(1000000);
DocIdSetBuilder builder = new DocIdSetBuilder(maxDoc);
final int numIterators = 1 + random().nextInt(10);
final FixedBitSet ref = new FixedBitSet(maxDoc);
for (int i = 0; i < numIterators; ++i) {
final int baseInc = 200000 + random().nextInt(10000);
WAH8DocIdSet.Builder b = new WAH8DocIdSet.Builder();
for (int doc = random().nextInt(100); doc < maxDoc; doc += baseInc + random().nextInt(10000)) {
b.add(doc);
ref.set(doc);
}
builder.or(b.build().iterator());
}
DocIdSet result = builder.build();
assertTrue(result instanceof SparseFixedBitSet);
assertEquals(ref, result);
}
public void testDense() throws IOException {
final int maxDoc = 1000000 + random().nextInt(1000000);
DocIdSetBuilder builder = new DocIdSetBuilder(maxDoc);
final int numIterators = 1 + random().nextInt(10);
final FixedBitSet ref = new FixedBitSet(maxDoc);
if (random().nextBoolean()) {
// try upgrades
final int doc = random().nextInt(maxDoc);
ref.set(doc);
builder.or(new WAH8DocIdSet.Builder().add(doc).build().iterator());
}
for (int i = 0; i < numIterators; ++i) {
final int baseInc = 2 + random().nextInt(10000);
WAH8DocIdSet.Builder b = new WAH8DocIdSet.Builder();
for (int doc = random().nextInt(10000); doc < maxDoc; doc += baseInc + random().nextInt(2000)) {
b.add(doc);
ref.set(doc);
}
builder.or(b.build().iterator());
}
DocIdSet result = builder.build();
assertTrue(result instanceof FixedBitSet);
assertEquals(ref, result);
}
}

View File

@ -0,0 +1,81 @@
package org.apache.lucene.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collections;
import java.util.List;
public class TestSparseFixedBitSet extends BaseDocIdSetTestCase<SparseFixedBitSet> {
@Override
public SparseFixedBitSet copyOf(BitSet bs, int length) throws IOException {
final SparseFixedBitSet set = new SparseFixedBitSet(length);
// SparseFixedBitSet can be sensitive to the order of insertion so
// randomize insertion a bit
List<Integer> buffer = new ArrayList<>();
for (int doc = bs.nextSetBit(0); doc != -1; doc = bs.nextSetBit(doc + 1)) {
buffer.add(doc);
if (buffer.size() >= 100000) {
Collections.shuffle(buffer);
for (int i : buffer) {
set.set(i);
}
buffer.clear();
}
}
Collections.shuffle(buffer);
for (int i : buffer) {
set.set(i);
}
return set;
}
@Override
public void assertEquals(int numBits, BitSet ds1, SparseFixedBitSet ds2) throws IOException {
for (int i = 0; i < numBits; ++i) {
assertEquals(ds1.get(i), ds2.get(i));
}
assertEquals(ds1.cardinality(), ds2.cardinality());
super.assertEquals(numBits, ds1, ds2);
}
public void testApproximateCardinality() {
final SparseFixedBitSet set = new SparseFixedBitSet(10000);
final int first = random().nextInt(1000);
final int interval = 200 + random().nextInt(1000);
for (int i = first; i < set.length(); i += interval) {
set.set(i);
}
assertEquals(set.cardinality(), set.approximateCardinality(), 20);
}
public void testApproximateCardinalityOnDenseSet() {
// this tests that things work as expected in approximateCardinality when
// all longs are non-zero, in which case we divide by zero
final int numDocs = 70;//TestUtil.nextInt(random(), 1, 10000);
final SparseFixedBitSet set = new SparseFixedBitSet(numDocs);
for (int i = 0; i < set.length(); ++i) {
set.set(i);
}
assertEquals(numDocs, set.approximateCardinality());
}
}

View File

@ -17,15 +17,6 @@ package org.apache.lucene.queries;
* limitations under the License.
*/
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
@ -33,6 +24,20 @@ import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.DocIdSetBuilder;
/**
* Constructs a filter for docs matching any of the terms added to this class.
* Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
@ -178,11 +183,11 @@ public final class TermsFilter extends Filter {
@Override
public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) throws IOException {
final LeafReader reader = context.reader();
FixedBitSet result = null; // lazy init if needed - no need to create a big bitset ahead of time
DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
final Fields fields = reader.fields();
final BytesRef spare = new BytesRef(this.termsBytes);
if (fields == null) {
return result;
return builder.build();
}
Terms terms = null;
TermsEnum termsEnum = null;
@ -195,21 +200,12 @@ public final class TermsFilter extends Filter {
spare.length = offsets[i+1] - offsets[i];
if (termsEnum.seekExact(spare)) {
docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE); // no freq since we don't need them
if (result == null) {
if (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
result = new FixedBitSet(reader.maxDoc());
// lazy init but don't do it in the hot loop since we could read many docs
result.set(docs.docID());
}
}
while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
result.set(docs.docID());
}
builder.or(docs);
}
}
}
}
return result;
return builder.build();
}
@Override

View File

@ -28,19 +28,19 @@ import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;

View File

@ -61,7 +61,7 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer
Operator operator = OR_OPERATOR;
boolean lowercaseExpandedTerms = true;
MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE;
boolean allowLeadingWildcard = false;
protected String field;
@ -274,7 +274,7 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer
}
/**
* By default QueryParser uses {@link org.apache.lucene.search.MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
* By default QueryParser uses {@link org.apache.lucene.search.MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE}
* when creating a {@link PrefixQuery}, {@link WildcardQuery} or {@link TermRangeQuery}. This implementation is generally preferable because it
* a) Runs faster b) Does not have the scarcity of terms unduly influence score
* c) avoids any {@link TooManyClauses} exception.

View File

@ -73,7 +73,7 @@ public interface CommonQueryParserConfiguration {
/**
* By default, it uses
* {@link MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} when creating a
* {@link MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} when creating a
* prefix, wildcard and range queries. This implementation is generally
* preferable because it a) Runs faster b) Does not have the scarcity of terms
* unduly influence score c) avoids any {@link TooManyClauses}

View File

@ -265,7 +265,7 @@ public class StandardQueryParser extends QueryParserHelper implements CommonQuer
/**
* By default, it uses
* {@link MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} when creating a
* {@link MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} when creating a
* prefix, wildcard and range queries. This implementation is generally
* preferable because it a) Runs faster b) Does not have the scarcity of terms
* unduly influence score c) avoids any {@link TooManyClauses}

View File

@ -208,7 +208,7 @@ public class StandardQueryConfigHandler extends QueryConfigHandler {
set(ConfigurationKeys.FIELD_BOOST_MAP, new LinkedHashMap<String, Float>());
set(ConfigurationKeys.FUZZY_CONFIG, new FuzzyConfig());
set(ConfigurationKeys.LOCALE, Locale.getDefault());
set(ConfigurationKeys.MULTI_TERM_REWRITE_METHOD, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
set(ConfigurationKeys.MULTI_TERM_REWRITE_METHOD, MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
set(ConfigurationKeys.FIELD_DATE_RESOLUTION_MAP, new HashMap<CharSequence, DateTools.Resolution>());
}

View File

@ -30,7 +30,7 @@ import org.apache.lucene.search.MultiTermQuery;
/**
* This processor instates the default
* {@link org.apache.lucene.search.MultiTermQuery.RewriteMethod},
* {@link MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}, for multi-term
* {@link MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE}, for multi-term
* query nodes.
*/
public class MultiTermRewriteMethodProcessor extends QueryNodeProcessorImpl {

View File

@ -321,15 +321,15 @@ public class TestQPHelper extends LuceneTestCase {
StandardQueryParser qp = new StandardQueryParser(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
Query q = qp.parse("foo*bar", "field");
assertTrue(q instanceof WildcardQuery);
assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT, ((MultiTermQuery) q).getRewriteMethod());
assertEquals(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE, ((MultiTermQuery) q).getRewriteMethod());
q = qp.parse("foo*", "field");
assertTrue(q instanceof PrefixQuery);
assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT, ((MultiTermQuery) q).getRewriteMethod());
assertEquals(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE, ((MultiTermQuery) q).getRewriteMethod());
q = qp.parse("[a TO z]", "field");
assertTrue(q instanceof TermRangeQuery);
assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT, ((MultiTermQuery) q).getRewriteMethod());
assertEquals(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE, ((MultiTermQuery) q).getRewriteMethod());
}
public void testCJK() throws Exception {
@ -659,7 +659,7 @@ public class TestQPHelper extends LuceneTestCase {
public void testRange() throws Exception {
assertQueryEquals("[ a TO z]", null, "[a TO z]");
assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT, ((TermRangeQuery)getQuery("[ a TO z]", null)).getRewriteMethod());
assertEquals(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE, ((TermRangeQuery)getQuery("[ a TO z]", null)).getRewriteMethod());
StandardQueryParser qp = new StandardQueryParser();
@ -1152,7 +1152,7 @@ public class TestQPHelper extends LuceneTestCase {
assertTrue(qp.parse("/[A-Z][123]/^0.5", df) instanceof RegexpQuery);
assertEquals(q, qp.parse("/[A-Z][123]/^0.5", df));
assertEquals(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE, ((RegexpQuery)qp.parse("/[A-Z][123]/^0.5", df)).getRewriteMethod());
qp.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
qp.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
Query escaped = new RegexpQuery(new Term("field", "[a-z]\\/[123]"));
assertEquals(escaped, qp.parse("/[a-z]\\/[123]/", df));

View File

@ -571,7 +571,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
assertQueryEquals("[ a TO z}", null, "[a TO z}");
assertQueryEquals("{ a TO z]", null, "{a TO z]");
assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT, ((TermRangeQuery)getQuery("[ a TO z]")).getRewriteMethod());
assertEquals(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE, ((TermRangeQuery)getQuery("[ a TO z]")).getRewriteMethod());
CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
@ -987,7 +987,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
assertTrue(getQuery("/[A-Z][123]/^0.5",qp) instanceof RegexpQuery);
assertEquals(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE, ((RegexpQuery)getQuery("/[A-Z][123]/^0.5",qp)).getRewriteMethod());
assertEquals(q, getQuery("/[A-Z][123]/^0.5",qp));
qp.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
qp.setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
Query escaped = new RegexpQuery(new Term("field", "[a-z]\\/[123]"));
assertEquals(escaped, getQuery("/[a-z]\\/[123]/",qp));

View File

@ -16,15 +16,21 @@ package org.apache.lucene.sandbox.queries;
* limitations under the License.
*/
import org.apache.lucene.index.*;
import java.io.IOException;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import java.io.IOException;
import org.apache.lucene.util.SparseFixedBitSet;
/**
* Filter to remove duplicate values from search results.
@ -87,8 +93,8 @@ public class DuplicateFilter extends Filter {
}
}
private FixedBitSet correctBits(LeafReader reader, Bits acceptDocs) throws IOException {
FixedBitSet bits = new FixedBitSet(reader.maxDoc()); //assume all are INvalid
private SparseFixedBitSet correctBits(LeafReader reader, Bits acceptDocs) throws IOException {
SparseFixedBitSet bits = new SparseFixedBitSet(reader.maxDoc()); //assume all are INvalid
Terms terms = reader.fields().terms(fieldName);
if (terms == null) {

View File

@ -36,7 +36,7 @@ import org.apache.lucene.util.ToStringUtils;
* for numerical ranges; use {@link NumericRangeQuery} instead.
*
* <p>This query uses the {@link
* MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
* MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE}
* rewrite method.
* @deprecated Index collation keys with CollationKeyAnalyzer or ICUCollationKeyAnalyzer instead.
* This class will be removed in Lucene 5.0

View File

@ -57,7 +57,7 @@ public abstract class BaseDocIdSetTestCase<T extends DocIdSet> extends LuceneTes
/** Test length=0. */
public void testNoBit() throws IOException {
final BitSet bs = new BitSet(1);
final T copy = copyOf(bs, 0);
final T copy = copyOf(bs, TestUtil.nextInt(random(), 1, 10000));
assertEquals(0, bs, copy);
}
@ -67,7 +67,7 @@ public abstract class BaseDocIdSetTestCase<T extends DocIdSet> extends LuceneTes
if (random().nextBoolean()) {
bs.set(0);
}
final T copy = copyOf(bs, 1);
final T copy = copyOf(bs, TestUtil.nextInt(random(), 1, 10000));
assertEquals(1, bs, copy);
}
@ -80,7 +80,7 @@ public abstract class BaseDocIdSetTestCase<T extends DocIdSet> extends LuceneTes
if (random().nextBoolean()) {
bs.set(1);
}
final T copy = copyOf(bs, 2);
final T copy = copyOf(bs, TestUtil.nextInt(random(), 1, 10000));
assertEquals(2, bs, copy);
}

View File

@ -78,7 +78,7 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
/** The default operator that parser uses to combine query terms */
Operator operator = OR_OPERATOR;
MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE;
boolean allowLeadingWildcard = true;
String defaultField;
@ -294,7 +294,7 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
/**
* By default QueryParser uses {@link org.apache.lucene.search.MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
* By default QueryParser uses {@link org.apache.lucene.search.MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE}
* when creating a PrefixQuery, WildcardQuery or RangeQuery. This implementation is generally preferable because it
* a) Runs faster b) Does not have the scarcity of terms unduly influence score
* c) avoids any "TooManyBooleanClauses" exception.

View File

@ -746,7 +746,7 @@ public abstract class FieldType extends FieldProperties {
if (!field.indexed() && field.hasDocValues()) {
return field.multiValued() ? new DocTermOrdsRewriteMethod() : new DocValuesRewriteMethod();
} else {
return MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE;
}
}

View File

@ -5,6 +5,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.join.FixedBitSetCachingWrapperFilter;
import org.apache.lucene.search.join.ScoreMode;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.solr.SolrTestCaseJ4;
@ -34,6 +35,7 @@ import javax.xml.stream.XMLStreamReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
@ -566,8 +568,8 @@ public class AddBlockUpdateTest extends SolrTestCaseJ4 {
protected ToParentBlockJoinQuery join(final String childTerm) {
return new ToParentBlockJoinQuery(
new TermQuery(new Term(child, childTerm)), new TermRangeFilter(parent,
null, null, false, false), ScoreMode.None);
new TermQuery(new Term(child, childTerm)), new FixedBitSetCachingWrapperFilter(new TermRangeFilter(parent,
null, null, false, false)), ScoreMode.None);
}
private Collection<? extends Callable<Void>> callables(List<Document> blocks) {