LUCENE-2690: MultiTermQuery boolean rewrites per segment

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1022934 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2010-10-15 14:25:48 +00:00
parent 2a8c9dfa3f
commit dd1c7a8585
21 changed files with 676 additions and 191 deletions

View File

@ -147,8 +147,10 @@ API Changes
you also override this method on upgrade. (Robert Muir, Mike
McCandless)
* LUCENE-2691: IndexWriter.getReader() has been made package local and is now exposed via open and reopen methods on
IndexReader. The semantics of the call is the same as it was prior to the API change. (Grant Ingersoll, Mike McCandless)
* LUCENE-2691: IndexWriter.getReader() has been made package local and is now
exposed via open and reopen methods on IndexReader. The semantics of the
call is the same as it was prior to the API change.
(Grant Ingersoll, Mike McCandless)
New features
@ -265,6 +267,9 @@ New features
* LUCENE-2692: Added several new SpanQuery classes for positional checking
(match is in a range, payload is a specific value) (Grant Ingersoll)
* LUCENE-2690: MultiTermQuery boolean rewrites per segment.
(Uwe Schindler, Robert Muir, Mike McCandless)
Optimizations
* LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching.

View File

@ -316,3 +316,11 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
* LUCENE-2691: The near-real-time API has moved from IndexWriter to
IndexReader. Instead of IndexWriter.getReader(), call
IndexReader.open(IndexWriter) or IndexReader.reopen(IndexWriter).
* LUCENE-2690: MultiTermQuery boolean rewrites per segment.
Also MultiTermQuery.getTermsEnum() now takes an AttributeSource. FuzzyTermsEnum
is both consumer and producer of attributes: MTQ.BoostAttribute is
added to the FuzzyTermsEnum and MTQ's rewrite mode consumes it.
The other way round, MTQ.TopTermsBooleanQueryRewrite supplies a
global AttributeSource to each segment's TermsEnum. The TermsEnum is consumer
and gets the current minimum competitive boosts (MTQ.MaxNonCompetitiveBoostAttribute).

View File

@ -886,7 +886,7 @@ public class MemoryIndex implements Serializable {
@Override
public int docFreq() {
return info.sortedTerms[termUpto].getValue().size();
return 1;
}
@Override

View File

@ -29,6 +29,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
@ -199,7 +200,10 @@ public class FuzzyLikeThisQuery extends Query
ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
float minScore=0;
Term startTerm=internSavingTemplateTerm.createTerm(term);
FuzzyTermsEnum fe = new FuzzyTermsEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
AttributeSource atts = new AttributeSource();
MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class);
FuzzyTermsEnum fe = new FuzzyTermsEnum(reader, atts, startTerm, f.minSimilarity, f.prefixLength);
//store the df so all variants use same idf
int df = reader.docFreq(startTerm);
int numVariants=0;
@ -217,7 +221,7 @@ public class FuzzyLikeThisQuery extends Query
variantsQ.insertWithOverflow(st);
minScore = variantsQ.top().score; // maintain minScore
}
boostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
}
}

View File

@ -21,6 +21,7 @@ import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.FilteredTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
@ -60,7 +61,7 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
}
@Override
protected FilteredTermsEnum getTermsEnum(IndexReader reader) throws IOException {
protected FilteredTermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
return new RegexTermsEnum(reader, term, regexImpl);
}

View File

@ -28,6 +28,7 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.LuceneTestCase;
public class TestRegexQuery extends LuceneTestCase {
@ -78,7 +79,7 @@ public class TestRegexQuery extends LuceneTestCase {
}
public void testMatchAll() throws Exception {
TermsEnum terms = new RegexQuery(new Term(FN, "jum.")).getTermsEnum(searcher.getIndexReader());
TermsEnum terms = new RegexQuery(new Term(FN, "jum.")).getTermsEnum(searcher.getIndexReader(), new AttributeSource() /*dummy*/);
// no term should match
assertNull(terms.next());
}

View File

@ -30,6 +30,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
@ -387,7 +388,10 @@ public class DirectSpellChecker {
private Collection<ScoreTerm> suggestSimilar(Term term, int numSug,
IndexReader ir, int docfreq, int editDistance, float accuracy) throws IOException {
FuzzyTermsEnum e = new FuzzyTermsEnum(ir, term, editDistance, Math.max(minPrefix, editDistance-1));
AttributeSource atts = new AttributeSource();
MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class);
FuzzyTermsEnum e = new FuzzyTermsEnum(ir, atts, term, editDistance, Math.max(minPrefix, editDistance-1));
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
BytesRef queryTerm = new BytesRef(term.text());
@ -435,7 +439,7 @@ public class DirectSpellChecker {
stQueue.offer(st);
// possibly drop entries from queue
st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
boostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
}
return stQueue;

View File

@ -24,6 +24,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
@ -85,7 +86,7 @@ public class AutomatonQuery extends MultiTermQuery {
}
@Override
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
// matches nothing
if (BasicOperations.isEmpty(automaton)) {
return TermsEnum.EMPTY;

View File

@ -20,6 +20,7 @@ package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
@ -135,11 +136,11 @@ public class FuzzyQuery extends MultiTermQuery {
}
@Override
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
if (!termLongEnough) { // can only match if it's exact
return new SingleTermsEnum(reader, term);
}
return new FuzzyTermsEnum(reader, getTerm(), minimumSimilarity, prefixLength);
return new FuzzyTermsEnum(reader, atts, getTerm(), minimumSimilarity, prefixLength);
}
/**

View File

@ -22,6 +22,7 @@ import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
@ -51,7 +52,12 @@ public final class FuzzyTermsEnum extends TermsEnum {
private final MultiTermQuery.BoostAttribute boostAtt =
attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
private float bottom = boostAtt.getMaxNonCompetitiveBoost();
private final MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt;
private float bottom;
private BytesRef bottomTerm;
// nocommit: chicken-and-egg
private final Comparator<BytesRef> termComparator = BytesRef.getUTF8SortedAsUnicodeComparator();
private final float minSimilarity;
private final float scale_factor;
@ -82,7 +88,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
* @param prefixLength Length of required common prefix. Default value is 0.
* @throws IOException
*/
public FuzzyTermsEnum(IndexReader reader, Term term,
public FuzzyTermsEnum(IndexReader reader, AttributeSource atts, Term term,
final float minSimilarity, final int prefixLength) throws IOException {
if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity)
throw new IllegalArgumentException("fractional edit distances are not allowed");
@ -116,9 +122,10 @@ public final class FuzzyTermsEnum extends TermsEnum {
}
this.scale_factor = 1.0f / (1.0f - this.minSimilarity);
TermsEnum subEnum = getAutomatonEnum(maxEdits, null);
setEnum(subEnum != null ? subEnum :
new LinearFuzzyTermsEnum());
this.maxBoostAtt = atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class);
bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
bottomTerm = maxBoostAtt.getCompetitiveTerm();
bottomChanged(null, true);
}
/**
@ -169,19 +176,24 @@ public final class FuzzyTermsEnum extends TermsEnum {
* fired when the max non-competitive boost has changed. this is the hook to
* swap in a smarter actualEnum
*/
private void bottomChanged(float boostValue, BytesRef lastTerm)
private void bottomChanged(BytesRef lastTerm, boolean init)
throws IOException {
int oldMaxEdits = maxEdits;
// true if the last term encountered is lexicographically equal or after the bottom term in the PQ
boolean termAfter = bottomTerm == null || (lastTerm != null && termComparator.compare(lastTerm, bottomTerm) >= 0);
// as long as the max non-competitive boost is >= the max boost
// for some edit distance, keep dropping the max edit distance.
while (maxEdits > 0 && boostValue >= calculateMaxBoost(maxEdits))
while (maxEdits > 0 && (termAfter ? bottom >= calculateMaxBoost(maxEdits) : bottom > calculateMaxBoost(maxEdits)))
maxEdits--;
if (oldMaxEdits != maxEdits) { // the maximum n has changed
if (oldMaxEdits != maxEdits || init) { // the maximum n has changed
TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm);
if (newEnum != null) {
setEnum(newEnum);
} else if (init) {
setEnum(new LinearFuzzyTermsEnum());
}
}
}
@ -202,16 +214,18 @@ public final class FuzzyTermsEnum extends TermsEnum {
@Override
public BytesRef next() throws IOException {
if (queuedBottom != null) {
bottomChanged(bottom, queuedBottom);
bottomChanged(queuedBottom, false);
queuedBottom = null;
}
BytesRef term = actualEnum.next();
boostAtt.setBoost(actualBoostAtt.getBoost());
final float bottom = boostAtt.getMaxNonCompetitiveBoost();
if (bottom != this.bottom && term != null) {
final float bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
final BytesRef bottomTerm = maxBoostAtt.getCompetitiveTerm();
if (term != null && (bottom != this.bottom || bottomTerm != this.bottomTerm)) {
this.bottom = bottom;
this.bottomTerm = bottomTerm;
// clone the term before potentially doing something with it
// this is a rare but wonderful occurrence anyway
queuedBottom = new BytesRef(term);

View File

@ -19,19 +19,30 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Comparator;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.queryParser.QueryParser; // for javadoc
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
/**
* An abstract {@link Query} that matches documents
@ -39,7 +50,7 @@ import org.apache.lucene.util.PagedBytes;
* FilteredTermsEnum} enumeration.
*
* <p>This query cannot be used directly; you must subclass
* it and define {@link #getTermsEnum} to provide a {@link
* it and define {@link #getTermsEnum(IndexReader,AttributeSource)} to provide a {@link
* FilteredTermsEnum} that iterates through the terms to be
* matched.
*
@ -71,34 +82,25 @@ public abstract class MultiTermQuery extends Query {
protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
transient int numberOfTerms = 0;
/** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum}
/** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum(IndexReader,AttributeSource)}
* and update the boost on each returned term. This enables to control the boost factor
* for each matching term in {@link #SCORING_BOOLEAN_QUERY_REWRITE} or
* {@link TopTermsBooleanQueryRewrite} mode.
* {@link FuzzyQuery} is using this to take the edit distance into account.
* <p><b>Please note:</b> This attribute is intended to be added only by the TermsEnum
to itself in its constructor and consumed by the {@link RewriteMethod}.
* @lucene.internal
*/
public static interface BoostAttribute extends Attribute {
/** Sets the boost in this attribute */
public void setBoost(float boost);
/** Retrieves the boost, default is {@code 1.0f}. */
public float getBoost();
/** Sets the maximum boost for terms that would never get
* into the priority queue of {@link MultiTermQuery.TopTermsBooleanQueryRewrite}.
* This value is not changed by {@link AttributeImpl#clear}
* and not used in {@code equals()} and {@code hashCode()}.
* Do not change the value in the {@link TermsEnum}!
*/
public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost);
/** Retrieves the maximum boost that is not competitive,
* default is negative infinity. You can use this boost value
* as a hint when writing the {@link TermsEnum}.
*/
public float getMaxNonCompetitiveBoost();
}
/** Implementation class for {@link BoostAttribute}. */
public static final class BoostAttributeImpl extends AttributeImpl implements BoostAttribute {
private float boost = 1.0f, maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
private float boost = 1.0f;
public void setBoost(float boost) {
this.boost = boost;
@ -107,14 +109,6 @@ public abstract class MultiTermQuery extends Query {
public float getBoost() {
return boost;
}
public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) {
this.maxNonCompetitiveBoost = maxNonCompetitiveBoost;
}
public float getMaxNonCompetitiveBoost() {
return maxNonCompetitiveBoost;
}
@Override
public void clear() {
@ -141,6 +135,83 @@ public abstract class MultiTermQuery extends Query {
}
}
/** Add this {@link Attribute} to a fresh {@link AttributeSource} before calling
* {@link #getTermsEnum(IndexReader,AttributeSource)}.
* {@link FuzzyQuery} is using this to control its internal behaviour
* to only return competitive terms.
* <p><b>Please note:</b> This attribute is intended to be added by the {@link RewriteMethod}
* to an empty {@link AttributeSource} that is shared for all segments
* during query rewrite. This attribute source is passed to all segment enums
* on {@link #getTermsEnum(IndexReader,AttributeSource)}.
* {@link TopTermsBooleanQueryRewrite} uses this attribute to
* inform all enums about the current boost, that is not competitive.
* @lucene.internal
*/
public static interface MaxNonCompetitiveBoostAttribute extends Attribute {
/** This is the maximum boost that would not be competitive. */
public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost);
/** This is the maximum boost that would not be competitive. Default is negative infinity, which means every term is competitive. */
public float getMaxNonCompetitiveBoost();
/** This is the term or <code>null</code> of the term that triggered the boost change. */
public void setCompetitiveTerm(BytesRef competitiveTerm);
/** This is the term or <code>null</code> of the term that triggered the boost change. Default is <code>null</code>, which means every term is competitive. */
public BytesRef getCompetitiveTerm();
}
/** Implementation class for {@link MaxNonCompetitiveBoostAttribute}. */
public static final class MaxNonCompetitiveBoostAttributeImpl extends AttributeImpl implements MaxNonCompetitiveBoostAttribute {
private float maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
private BytesRef competitiveTerm = null;
public void setMaxNonCompetitiveBoost(final float maxNonCompetitiveBoost) {
this.maxNonCompetitiveBoost = maxNonCompetitiveBoost;
}
public float getMaxNonCompetitiveBoost() {
return maxNonCompetitiveBoost;
}
public void setCompetitiveTerm(final BytesRef competitiveTerm) {
this.competitiveTerm = competitiveTerm;
}
public BytesRef getCompetitiveTerm() {
return competitiveTerm;
}
@Override
public void clear() {
maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
competitiveTerm = null;
}
@Override
public boolean equals(Object other) {
if (this == other)
return true;
if (other instanceof MaxNonCompetitiveBoostAttributeImpl) {
final MaxNonCompetitiveBoostAttributeImpl o = (MaxNonCompetitiveBoostAttributeImpl) other;
return (o.maxNonCompetitiveBoost == maxNonCompetitiveBoost)
&& (o.competitiveTerm == null ? competitiveTerm == null : o.competitiveTerm.equals(competitiveTerm));
}
return false;
}
@Override
public int hashCode() {
int hash = Float.floatToIntBits(maxNonCompetitiveBoost);
if (competitiveTerm != null) hash = 31 * hash + competitiveTerm.hashCode();
return hash;
}
@Override
public void copyTo(AttributeImpl target) {
final MaxNonCompetitiveBoostAttributeImpl t = (MaxNonCompetitiveBoostAttributeImpl) target;
t.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost);
t.setCompetitiveTerm(competitiveTerm);
}
}
/** Abstract class that defines how the query is rewritten. */
public static abstract class RewriteMethod implements Serializable {
public abstract Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException;
@ -177,69 +248,85 @@ public abstract class MultiTermQuery extends Query {
private abstract static class BooleanQueryRewrite extends RewriteMethod {
protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
final Fields fields = MultiFields.getFields(reader);
if (fields == null) {
// reader has no fields
return 0;
}
final Terms terms = fields.terms(query.field);
if (terms == null) {
// field does not exist
return 0;
}
final TermsEnum termsEnum = query.getTermsEnum(reader);
assert termsEnum != null;
if (termsEnum == TermsEnum.EMPTY)
return 0;
final BoostAttribute boostAtt =
termsEnum.attributes().addAttribute(BoostAttribute.class);
collector.boostAtt = boostAtt;
final List<IndexReader> subReaders = new ArrayList<IndexReader>();
ReaderUtil.gatherSubReaders(subReaders, reader);
int count = 0;
BytesRef bytes;
while ((bytes = termsEnum.next()) != null) {
if (collector.collect(termsEnum, bytes, boostAtt.getBoost())) {
termsEnum.cacheCurrentTerm();
count++;
} else {
break;
Comparator<BytesRef> lastTermComp = null;
for (IndexReader r : subReaders) {
final Fields fields = r.fields();
if (fields == null) {
// reader has no fields
continue;
}
final Terms terms = fields.terms(query.field);
if (terms == null) {
// field does not exist
continue;
}
final TermsEnum termsEnum = query.getTermsEnum(r, collector.attributes);
assert termsEnum != null;
if (termsEnum == TermsEnum.EMPTY)
continue;
// Check comparator compatibility:
final Comparator<BytesRef> newTermComp = termsEnum.getComparator();
if (lastTermComp != null && newTermComp != lastTermComp)
throw new RuntimeException("term comparator should not change between segments: "+lastTermComp+" != "+newTermComp);
lastTermComp = newTermComp;
collector.setNextEnum(termsEnum);
BytesRef bytes;
while ((bytes = termsEnum.next()) != null) {
if (collector.collect(bytes)) {
termsEnum.cacheCurrentTerm();
count++;
} else {
return count; // interrupt whole term collection, so also don't iterate other subReaders
}
}
}
collector.boostAtt = null;
return count;
}
protected static abstract class TermCollector {
private BoostAttribute boostAtt = null;
/** attributes used for communication with the enum */
public final AttributeSource attributes = new AttributeSource();
/** return false to stop collecting */
public abstract boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException;
public abstract boolean collect(BytesRef bytes) throws IOException;
/** set the minimum boost as a hint for the term producer */
protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) {
assert boostAtt != null;
boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost);
}
/** the next segment's {@link TermsEnum} that is used to collect terms */
public abstract void setNextEnum(TermsEnum termsEnum) throws IOException;
}
}
private static class ScoringBooleanQueryRewrite extends BooleanQueryRewrite {
@Override
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
final BooleanQuery result = new BooleanQuery(true);
final ParallelArraysTermCollector col = new ParallelArraysTermCollector();
collectTerms(reader, query, col);
final Term placeholderTerm = new Term(query.field);
query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() {
@Override
public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) {
// add new TQ, we must clone the term, else it may get overwritten!
TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes)), termsEnum.docFreq());
tq.setBoost(query.getBoost() * boost); // set the boost
result.add(tq, BooleanClause.Occur.SHOULD); // add to query
return true;
final BooleanQuery result = new BooleanQuery(true);
final int size = col.terms.size();
if (size > 0) {
final int sort[] = col.terms.sort(col.termsEnum.getComparator());
final int[] docFreq = col.array.docFreq;
final float[] boost = col.array.boost;
for (int i = 0; i < size; i++) {
final int pos = sort[i];
final Term term = placeholderTerm.createTerm(col.terms.get(pos, new BytesRef()));
assert reader.docFreq(term) == docFreq[pos];
final TermQuery tq = new TermQuery(term, docFreq[pos]);
tq.setBoost(query.getBoost() * boost[pos]);
result.add(tq, BooleanClause.Occur.SHOULD);
}
}));
}
query.incTotalNumberOfTerms(size);
return result;
}
@ -247,6 +334,75 @@ public abstract class MultiTermQuery extends Query {
protected Object readResolve() {
return SCORING_BOOLEAN_QUERY_REWRITE;
}
static final class ParallelArraysTermCollector extends TermCollector {
final TermFreqBoostByteStart array = new TermFreqBoostByteStart(16);
final BytesRefHash terms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array);
TermsEnum termsEnum;
private BoostAttribute boostAtt;
@Override
public void setNextEnum(TermsEnum termsEnum) throws IOException {
this.termsEnum = termsEnum;
this.boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
}
@Override
public boolean collect(BytesRef bytes) {
final int e = terms.add(bytes);
if (e < 0 ) {
// duplicate term: update docFreq
final int pos = (-e)-1;
array.docFreq[pos] += termsEnum.docFreq();
assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums";
} else {
// new entry: we populate the entry initially
array.docFreq[e] = termsEnum.docFreq();
array.boost[e] = boostAtt.getBoost();
}
// if the new entry reaches the max clause count, we exit early
if (e >= BooleanQuery.getMaxClauseCount())
throw new BooleanQuery.TooManyClauses();
return true;
}
}
/** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */
static final class TermFreqBoostByteStart extends DirectBytesStartArray {
int[] docFreq;
float[] boost;
public TermFreqBoostByteStart(int initSize) {
super(initSize);
}
@Override
public int[] init() {
final int[] ord = super.init();
boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)];
docFreq = new int[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_INT)];
assert boost.length >= ord.length && docFreq.length >= ord.length;
return ord;
}
@Override
public int[] grow() {
final int[] ord = super.grow();
docFreq = ArrayUtil.grow(docFreq, ord.length);
boost = ArrayUtil.grow(boost, ord.length);
assert boost.length >= ord.length && docFreq.length >= ord.length;
return ord;
}
@Override
public int[] clear() {
boost = null;
docFreq = null;
return super.clear();
}
}
}
/** A rewrite method that first translates each term into
@ -291,44 +447,92 @@ public abstract class MultiTermQuery extends Query {
final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
collectTerms(reader, query, new TermCollector() {
private final MaxNonCompetitiveBoostAttribute maxBoostAtt =
attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);
private final Map<BytesRef,ScoreTerm> visitedTerms = new HashMap<BytesRef,ScoreTerm>();
private TermsEnum termsEnum;
private Comparator<BytesRef> termComp;
private BoostAttribute boostAtt;
private ScoreTerm st;
@Override
public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) {
public void setNextEnum(TermsEnum termsEnum) throws IOException {
this.termsEnum = termsEnum;
this.termComp = termsEnum.getComparator();
// lazy init the initial ScoreTerm because comparator is not known on ctor:
if (st == null)
st = new ScoreTerm(this.termComp);
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
}
@Override
public boolean collect(BytesRef bytes) {
final float boost = boostAtt.getBoost();
// ignore uncompetitive hits
if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost)
return true;
// add new entry in PQ, we must clone the term, else it may get overwritten!
st.bytes.copy(bytes);
st.boost = boost;
st.docFreq = termsEnum.docFreq();
stQueue.offer(st);
// possibly drop entries from queue
st = (stQueue.size() > maxSize) ? stQueue.poll() : new ScoreTerm();
setMaxNonCompetitiveBoost((stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
if (stQueue.size() == maxSize) {
final ScoreTerm t = stQueue.peek();
if (boost < t.boost)
return true;
if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
return true;
}
ScoreTerm t = visitedTerms.get(bytes);
if (t != null) {
// if the term is already in the PQ, only update docFreq of term in PQ
t.docFreq += termsEnum.docFreq();
assert t.boost == boost : "boost should be equal in all segment TermsEnums";
} else {
// add new entry in PQ, we must clone the term, else it may get overwritten!
st.bytes.copy(bytes);
st.boost = boost;
st.docFreq = termsEnum.docFreq();
visitedTerms.put(st.bytes, st);
stQueue.offer(st);
// possibly drop entries from queue
if (stQueue.size() > maxSize) {
st = stQueue.poll();
visitedTerms.remove(st.bytes);
} else {
st = new ScoreTerm(termComp);
}
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize
if (stQueue.size() == maxSize) {
t = stQueue.peek();
maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
maxBoostAtt.setCompetitiveTerm(t.bytes);
}
}
return true;
}
// reusable instance
private ScoreTerm st = new ScoreTerm();
});
final Term placeholderTerm = new Term(query.field);
final BooleanQuery bq = new BooleanQuery(true);
for (final ScoreTerm st : stQueue) {
// add new query, we must clone the term, else it may get overwritten!
Query tq = getQuery(placeholderTerm.createTerm(st.bytes), st.docFreq);
final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
Arrays.sort(scoreTerms, new Comparator<ScoreTerm>() {
public int compare(ScoreTerm st1, ScoreTerm st2) {
assert st1.termComp == st2.termComp :
"term comparator should not change between segments";
return st1.termComp.compare(st1.bytes, st2.bytes);
}
});
for (final ScoreTerm st : scoreTerms) {
final Term term = placeholderTerm.createTerm(st.bytes);
assert reader.docFreq(term) == st.docFreq;
Query tq = getQuery(term, st.docFreq);
tq.setBoost(query.getBoost() * st.boost); // set the boost
bq.add(tq, BooleanClause.Occur.SHOULD); // add to query
}
query.incTotalNumberOfTerms(bq.clauses().size());
query.incTotalNumberOfTerms(scoreTerms.length);
return bq;
}
@Override
public int hashCode() {
final int prime = 17;
int result = 1;
result = prime * result + size;
return result;
return 31 * size;
}
@Override
@ -341,15 +545,20 @@ public abstract class MultiTermQuery extends Query {
return true;
}
private static class ScoreTerm implements Comparable<ScoreTerm> {
static final class ScoreTerm implements Comparable<ScoreTerm> {
public final Comparator<BytesRef> termComp;
public final BytesRef bytes = new BytesRef();
public float boost;
public int docFreq;
public ScoreTerm(Comparator<BytesRef> termComp) {
this.termComp = termComp;
}
public int compareTo(ScoreTerm other) {
if (this.boost == other.boost)
// TODO: is it OK to use default compare here?
return other.bytes.compareTo(this.bytes);
return termComp.compare(other.bytes, this.bytes);
else
return Float.compare(this.boost, other.boost);
}
@ -362,8 +571,8 @@ public abstract class MultiTermQuery extends Query {
* scores as computed by the query.
*
* <p>
* This rewrite mode only uses the top scoring terms so it will not overflow
* the boolean max clause count. It is the default rewrite mode for
* This rewrite method only uses the top scoring terms so it will not overflow
* the boolean max clause count. It is the default rewrite method for
* {@link FuzzyQuery}.
*
* @see #setRewriteMethod
@ -510,63 +719,61 @@ public abstract class MultiTermQuery extends Query {
final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
final CutOffTermCollector col = new CutOffTermCollector(reader, query.field, docCountCutoff, termCountLimit);
final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
collectTerms(reader, query, col);
final int size = col.pendingTerms.size();
if (col.hasCutOff) {
return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
} else if (col.termCount == 0) {
} else if (size == 0) {
return new BooleanQuery(true);
} else {
final PagedBytes.Reader bytesReader = col.pendingTerms.freeze(false);
try {
final BooleanQuery bq = new BooleanQuery(true);
final Term placeholderTerm = new Term(query.field);
long start = col.startOffset;
for(int i = 0; i < col.termCount; i++) {
final BytesRef bytes = new BytesRef();
start = bytesReader.fillUsingLengthPrefix3(bytes, start);
bq.add(new TermQuery(placeholderTerm.createTerm(bytes)), BooleanClause.Occur.SHOULD);
}
// Strip scores
final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
result.setBoost(query.getBoost());
query.incTotalNumberOfTerms(col.termCount);
return result;
} finally {
bytesReader.close();
final BooleanQuery bq = new BooleanQuery(true);
final Term placeholderTerm = new Term(query.field);
final BytesRefHash pendingTerms = col.pendingTerms;
final int sort[] = pendingTerms.sort(col.termsEnum.getComparator());
for(int i = 0; i < size; i++) {
// docFreq is not used for constant score here, we pass 1
// to explicitely set a fake value, so it's not calculated
bq.add(new TermQuery(
placeholderTerm.createTerm(pendingTerms.get(sort[i], new BytesRef())), 1
), BooleanClause.Occur.SHOULD);
}
// Strip scores
final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
result.setBoost(query.getBoost());
query.incTotalNumberOfTerms(size);
return result;
}
}
private static final class CutOffTermCollector extends TermCollector {
CutOffTermCollector(IndexReader reader, String field, int docCountCutoff, int termCountLimit) {
this.reader = reader;
this.field = field;
static final class CutOffTermCollector extends TermCollector {
CutOffTermCollector(int docCountCutoff, int termCountLimit) {
this.docCountCutoff = docCountCutoff;
this.termCountLimit = termCountLimit;
}
public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException {
termCount++;
if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) {
@Override
public void setNextEnum(TermsEnum termsEnum) throws IOException {
this.termsEnum = termsEnum;
}
@Override
public boolean collect(BytesRef bytes) throws IOException {
if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
hasCutOff = true;
return false;
}
pendingTerms.copyUsingLengthPrefix(bytes);
pendingTerms.add(bytes);
docVisitCount += termsEnum.docFreq();
return true;
}
int docVisitCount = 0;
boolean hasCutOff = false;
int termCount = 0;
final IndexReader reader;
final String field;
TermsEnum termsEnum;
final int docCountCutoff, termCountLimit;
final PagedBytes pendingTerms = new PagedBytes(15); // max term size is 32 KiB
final long startOffset = pendingTerms.getPointer();
final BytesRefHash pendingTerms = new BytesRefHash();
}
@Override
@ -644,8 +851,20 @@ public abstract class MultiTermQuery extends Query {
* field does exist). This method should not return null
* (should instead return {@link TermsEnum#EMPTY} if no
* terms match). The TermsEnum must already be
* positioned to the first matching term. */
protected abstract TermsEnum getTermsEnum(IndexReader reader) throws IOException;
* positioned to the first matching term.
* The given {@link AttributeSource} is passed by the {@link RewriteMethod} to
* provide attributes, the rewrite method uses to inform about e.g. maximum competitive boosts.
* This is currently only used by {@link TopTermsBooleanQueryRewrite}
*/
protected abstract TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException;
/**
 * Convenience method for callers that do not need to pass attributes:
 * simply supplies an empty {@link AttributeSource} and is equivalent to
 * <code>getTermsEnum(reader, new AttributeSource())</code>.
 */
protected final TermsEnum getTermsEnum(IndexReader reader) throws IOException {
  return getTermsEnum(reader, new AttributeSource());
}
/**
* Expert: Return the number of unique terms visited during execution of the query.

View File

@ -26,6 +26,7 @@ import org.apache.lucene.document.NumericField; // for javadocs
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.TermsEnum;
@ -301,7 +302,7 @@ public final class NumericRangeQuery<T extends Number> extends MultiTermQuery {
}
@Override @SuppressWarnings("unchecked")
protected TermsEnum getTermsEnum(final IndexReader reader) throws IOException {
protected TermsEnum getTermsEnum(final IndexReader reader, AttributeSource atts) throws IOException {
// very strange: java.lang.Number itsself is not Comparable, but all subclasses used here are
return (min != null && max != null && ((Comparable<T>) min).compareTo(max) > 0) ?
TermsEnum.EMPTY :

View File

@ -24,6 +24,7 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.ToStringUtils;
/** A Query that matches documents containing terms with a specified prefix. A PrefixQuery
@ -45,7 +46,7 @@ public class PrefixQuery extends MultiTermQuery {
public Term getPrefix() { return prefix; }
@Override
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
if (prefix.bytes().length == 0) {
// no prefix -- match all terms for this field:
// NOTE: for now, MultiTermQuery enums terms at the

View File

@ -33,7 +33,7 @@ import org.apache.lucene.index.IndexReader;
* once per day.
*/
public class QueryWrapperFilter extends Filter {
private Query query;
private final Query query;
/** Constructs a filter which only matches documents matching
* <code>query</code>.
@ -41,6 +41,11 @@ public class QueryWrapperFilter extends Filter {
public QueryWrapperFilter(Query query) {
this.query = query;
}
/** Returns the inner query this filter wraps (as passed to the constructor). */
public final Query getQuery() {
  return query;
}
@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {

View File

@ -25,6 +25,7 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.AttributeSource;
/**
* A Query that matches documents within an range of terms.
@ -130,7 +131,7 @@ public class TermRangeQuery extends MultiTermQuery {
public Collator getCollator() { return collator; }
@Override
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
if (collator == null && lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) {
return TermsEnum.EMPTY;
}

View File

@ -27,14 +27,7 @@ import java.util.Collection;
public final class ArrayUtil {
// Utility class holding only static methods -- never instantiated.
// (The stripped diff had left both the old deprecated public constructor
// and this new private one, which is an invalid duplicate; the private
// form is the intended replacement.)
private ArrayUtil() {} // no instance
/*
Begin Apache Harmony code
@ -247,6 +240,19 @@ public final class ArrayUtil {
public static short[] grow(short[] array) {
return grow(array, 1 + array.length);
}
/**
 * Returns an array with at least {@code minSize} slots. The caller's array
 * is handed straight back when it is already large enough; otherwise the
 * contents are copied into a new array sized by {@code oversize}.
 */
public static float[] grow(float[] array, int minSize) {
  if (array.length >= minSize) {
    // Already big enough -- no copy needed.
    return array;
  }
  final float[] resized = new float[oversize(minSize, RamUsageEstimator.NUM_BYTES_FLOAT)];
  System.arraycopy(array, 0, resized, 0, array.length);
  return resized;
}
/** Grows the array by at least one slot (delegates to the two-arg overload). */
public static float[] grow(float[] array) {
  return grow(array, array.length + 1);
}
public static short[] shrink(short[] array, int targetSize) {
final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_SHORT);

View File

@ -16,8 +16,12 @@ package org.apache.lucene.util;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Arrays;
import java.util.List;
import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF;
/**
 * Class that Posting and PostingVector use to write byte
* streams into shared fixed-size byte[] arrays. The idea
* is to allocate slices of increasing lengths For
* example, the first slice is 5 bytes, the next slice is
@ -31,14 +35,10 @@ package org.apache.lucene.util;
* the end with a non-zero byte. This way the methods
* that are writing into the slice don't need to record
* its length and instead allocate a new slice once they
 * hit a non-zero byte.
 *
 * @lucene.internal
 **/
public final class ByteBlockPool {
public final static int BYTE_BLOCK_SHIFT = 15;
public final static int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT;
@ -62,6 +62,22 @@ public final class ByteBlockPool {
return new byte[blockSize];
}
}
/**
 * An {@link Allocator} that always hands out fresh blocks and never pools
 * freed ones: {@link #recycleByteBlocks} is a no-op, so recycled blocks are
 * simply left to the garbage collector.
 */
public static final class DirectAllocator extends Allocator {

  /** Creates an allocator using the default {@code BYTE_BLOCK_SIZE}. */
  public DirectAllocator() {
    this(BYTE_BLOCK_SIZE);
  }

  /** Creates an allocator that hands out blocks of {@code blockSize} bytes. */
  public DirectAllocator(int blockSize) {
    super(blockSize);
  }

  // Intentionally empty: direct allocation does not track or reuse blocks.
  @Override
  public void recycleByteBlocks(byte[][] blocks, int start, int end) {
  }
}
public byte[][] buffers = new byte[10][];

View File

@ -17,13 +17,15 @@ package org.apache.lucene.util;
* limitations under the License.
*/
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK;
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SHIFT;
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
import java.util.Arrays;
import java.util.Comparator;
import java.util.concurrent.atomic.AtomicLong;
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK;
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SHIFT;
import org.apache.lucene.util.ByteBlockPool.DirectAllocator;
/**
* {@link BytesRefHash} is a special purpose hash-map like data-structure
@ -54,6 +56,14 @@ public final class BytesRefHash {
public static final int DEFAULT_CAPACITY = 16;
private final BytesStartArray bytesStartArray;
private AtomicLong bytesUsed;
/**
 * Creates a new {@link BytesRefHash} with a {@link ByteBlockPool} using a
 * {@link DirectAllocator} (i.e. freed byte blocks are not recycled).
 */
public BytesRefHash() {
  this(new ByteBlockPool(new DirectAllocator()));
}
/**
* Creates a new {@link BytesRefHash}
@ -75,7 +85,7 @@ public final class BytesRefHash {
Arrays.fill(ords, -1);
this.bytesStartArray = bytesStartArray;
bytesStart = bytesStartArray.init();
bytesUsed = bytesStartArray.bytesUsed();
bytesUsed = bytesStartArray.bytesUsed() == null? new AtomicLong(0) : bytesStartArray.bytesUsed();;
bytesUsed.addAndGet(hashSize * RamUsageEstimator.NUM_BYTES_INT);
}
@ -143,7 +153,6 @@ public final class BytesRefHash {
* the {@link Comparator} used for sorting
*/
public int[] sort(Comparator<BytesRef> comp) {
assert bytesStart != null : "Bytesstart is null - not initialized";
final int[] compact = compact();
quickSort(comp, compact, 0, count - 1);
return compact;
@ -536,13 +545,13 @@ public final class BytesRefHash {
public abstract AtomicLong bytesUsed();
}
static class DirectBytesStartArray extends BytesStartArray {
public static class DirectBytesStartArray extends BytesStartArray {
private final int initSize;
protected final int initSize;
private int[] bytesStart;
private final AtomicLong bytesUsed = new AtomicLong(0);
DirectBytesStartArray(int initSize) {
public DirectBytesStartArray(int initSize) {
this.initSize = initSize;
}

View File

@ -0,0 +1,186 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import java.io.IOException;
/**
 * Verifies that MultiTermQuery rewrite methods produce identical rewritten
 * queries for a single-segment reader and a MultiReader over the same data:
 * duplicate terms across segments must be merged, clause order must follow
 * term order, and per-term boosts must survive the rewrite.
 */
public class TestMultiTermQueryRewrites extends LuceneTestCase {

  // dir holds all docs in one index; sdir1/sdir2 split the same docs
  // (even/odd) so multiReader sees identical terms across two sub-readers.
  static Directory dir, sdir1, sdir2;
  static IndexReader reader, multiReader;
  static IndexSearcher searcher, multiSearcher;

  @BeforeClass
  public static void beforeClass() throws Exception {
    dir = newDirectory();
    sdir1 = newDirectory();
    sdir2 = newDirectory();
    final RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockAnalyzer());
    final RandomIndexWriter swriter1 = new RandomIndexWriter(random, sdir1, new MockAnalyzer());
    final RandomIndexWriter swriter2 = new RandomIndexWriter(random, sdir2, new MockAnalyzer());

    // Index the digits 0..9; even docs go to sdir1, odd docs to sdir2.
    for (int i = 0; i < 10; i++) {
      Document doc = new Document();
      doc.add(newField("data", Integer.toString(i), Field.Store.NO, Field.Index.NOT_ANALYZED));
      writer.addDocument(doc);
      ((i % 2 == 0) ? swriter1 : swriter2).addDocument(doc);
    }
    writer.optimize(); swriter1.optimize(); swriter2.optimize();
    writer.close(); swriter1.close(); swriter2.close();

    reader = IndexReader.open(dir, true);
    searcher = new IndexSearcher(reader);
    // closeSubReaders=true: the MultiReader owns the two sub-readers.
    multiReader = new MultiReader(new IndexReader[] {
      IndexReader.open(sdir1, true), IndexReader.open(sdir2, true)
    }, true);
    multiSearcher = new IndexSearcher(multiReader);
  }

  @AfterClass
  public static void afterClass() throws Exception {
    reader.close();
    multiReader.close();
    dir.close(); sdir1.close(); sdir2.close();
    // Null out statics so the test framework can GC them between classes.
    reader = multiReader = null;
    searcher = multiSearcher = null;
    dir = sdir1 = sdir2 = null;
  }

  /** Unwraps a constant-score rewrite result back to the underlying query. */
  private Query extractInnerQuery(Query q) {
    if (q instanceof ConstantScoreQuery) {
      // wrapped as ConstantScoreQuery using QueryWrapperFilter
      q = ((QueryWrapperFilter) ((ConstantScoreQuery) q).getFilter()).getQuery();
    }
    return q;
  }

  /** Extracts the Term from a (possibly constant-score-wrapped) TermQuery. */
  private Term extractTerm(Query q) {
    q = extractInnerQuery(q);
    return ((TermQuery) q).getTerm();
  }

  /** Asserts the BooleanQuery's clauses are in strictly ascending term order. */
  private void checkBooleanQueryOrder(Query q) {
    q = extractInnerQuery(q);
    final BooleanQuery bq = (BooleanQuery) q;
    Term last = null, act;
    for (BooleanClause clause : bq.clauses()) {
      act = extractTerm(clause.getQuery());
      if (last != null) {
        assertTrue("sort order of terms in BQ violated", last.compareTo(act) < 0);
      }
      last = act;
    }
  }

  /**
   * Rewrites the same range query against the single-segment searcher and
   * the multi-segment searcher; both results must be equal (duplicate terms
   * from the two sub-readers merged) and correctly ordered.
   */
  private void checkDuplicateTerms(MultiTermQuery.RewriteMethod method) throws Exception {
    final MultiTermQuery mtq = new TermRangeQuery("data", "2", "7", true, true);
    mtq.setRewriteMethod(method);
    final Query q1 = searcher.rewrite(mtq);
    final Query q2 = multiSearcher.rewrite(mtq);
    if (VERBOSE) {
      System.out.println();
      System.out.println("single segment: " + q1);
      System.out.println(" multi segment: " + q2);
    }
    assertEquals("The multi-segment case must produce same rewritten query", q1, q2);
    checkBooleanQueryOrder(q1);
    checkBooleanQueryOrder(q2);
  }

  public void testRewritesWithDuplicateTerms() throws Exception {
    checkDuplicateTerms(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
    checkDuplicateTerms(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);

    // use a large PQ here to only test duplicate terms and don't mix up when all scores are equal
    checkDuplicateTerms(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(1024));
    checkDuplicateTerms(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(1024));

    // Test auto rewrite (but only boolean mode), so we set the limits to large values to always get a BQ
    final MultiTermQuery.ConstantScoreAutoRewrite rewrite = new MultiTermQuery.ConstantScoreAutoRewrite();
    rewrite.setTermCountCutoff(Integer.MAX_VALUE);
    rewrite.setDocCountPercent(100.);
    checkDuplicateTerms(rewrite);
  }

  /**
   * Asserts each clause's boost equals its term's numeric text (the enum in
   * checkBoosts sets boost = parseFloat(term)), proving boosts were kept
   * aligned with their terms during rewrite.
   */
  private void checkBooleanQueryBoosts(BooleanQuery bq) {
    for (BooleanClause clause : bq.clauses()) {
      final TermQuery mtq = (TermQuery) clause.getQuery();
      assertEquals("Parallel sorting of boosts in rewrite mode broken",
        Float.parseFloat(mtq.getTerm().text()), mtq.getBoost());
    }
  }

  /**
   * Rewrites an MTQ whose enum boosts each term by its own numeric value;
   * single- and multi-segment rewrites must agree and preserve the boosts.
   */
  private void checkBoosts(MultiTermQuery.RewriteMethod method) throws Exception {
    final MultiTermQuery mtq = new MultiTermQuery("data") {
      @Override
      protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
        return new TermRangeTermsEnum(reader, field, "2", "7", true, true, null) {
          // Publish a boost per term through the shared attribute source.
          final MultiTermQuery.BoostAttribute boostAtt =
            attributes().addAttribute(MultiTermQuery.BoostAttribute.class);

          @Override
          protected AcceptStatus accept(BytesRef term) {
            boostAtt.setBoost(Float.parseFloat(term.utf8ToString()));
            return super.accept(term);
          }
        };
      }

      @Override
      public String toString(String field) {
        return "dummy";
      }
    };
    mtq.setRewriteMethod(method);
    final Query q1 = searcher.rewrite(mtq);
    final Query q2 = multiSearcher.rewrite(mtq);
    if (VERBOSE) {
      System.out.println();
      System.out.println("single segment: " + q1);
      System.out.println(" multi segment: " + q2);
    }
    assertEquals("The multi-segment case must produce same rewritten query", q1, q2);
    checkBooleanQueryBoosts((BooleanQuery) q1);
    checkBooleanQueryBoosts((BooleanQuery) q2);
  }

  public void testBoosts() throws Exception {
    checkBoosts(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);

    // use a large PQ here to only test boosts and don't mix up when all scores are equal
    checkBoosts(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(1024));
  }
}

View File

@ -29,6 +29,7 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
@ -85,7 +86,7 @@ public class TestPrefixRandom extends LuceneTestCase {
}
@Override
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
return new SimplePrefixTermsEnum(reader, field, prefix);
}

View File

@ -36,6 +36,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@ -103,7 +104,7 @@ public class TestRegexpRandom2 extends LuceneTestCase {
}
@Override
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
return new SimpleAutomatonTermsEnum(reader, field);
}