mirror of https://github.com/apache/lucene.git
LUCENE-2690: MultiTermQuery boolean rewrites per segment
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1022934 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2a8c9dfa3f
commit
dd1c7a8585
|
@ -147,8 +147,10 @@ API Changes
|
|||
you also override this method on upgrade. (Robert Muir, Mike
|
||||
McCandless)
|
||||
|
||||
* LUCENE-2691: IndexWriter.getReader() has been made package local and is now exposed via open and reopen methods on
|
||||
IndexReader. The semantics of the call is the same as it was prior to the API change. (Grant Ingersoll, Mike McCandless)
|
||||
* LUCENE-2691: IndexWriter.getReader() has been made package local and is now
|
||||
exposed via open and reopen methods on IndexReader. The semantics of the
|
||||
call is the same as it was prior to the API change.
|
||||
(Grant Ingersoll, Mike McCandless)
|
||||
|
||||
New features
|
||||
|
||||
|
@ -265,6 +267,9 @@ New features
|
|||
* LUCENE-2692: Added several new SpanQuery classes for positional checking
|
||||
(match is in a range, payload is a specific value) (Grant Ingersoll)
|
||||
|
||||
* LUCENE-2690: MultiTermQuery boolean rewrites per segment.
|
||||
(Uwe Schindler, Robert Muir, Mike McCandless)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching.
|
||||
|
|
|
@ -316,3 +316,11 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
|
|||
* LUCENE-2691: The near-real-time API has moved from IndexWriter to
|
||||
IndexReader. Instead of IndexWriter.getReader(), call
|
||||
IndexReader.open(IndexWriter) or IndexReader.reopen(IndexWriter).
|
||||
|
||||
* LUCENE-2690: MultiTermQuery boolean rewrites per segment.
|
||||
Also MultiTermQuery.getTermsEnum() now takes an AttributeSource. FuzzyTermsEnum
|
||||
is both consumer and producer of attributes: MTQ.BoostAttribute is
|
||||
added to the FuzzyTermsEnum and MTQ's rewrite mode consumes it.
|
||||
The other way round MTQ.TopTermsBooleanQueryRewrite supplys a
|
||||
global AttributeSource to each segments TermsEnum. The TermsEnum is consumer
|
||||
and gets the current minimum competitive boosts (MTQ.MaxNonCompetitiveBoostAttribute).
|
||||
|
|
|
@ -886,7 +886,7 @@ public class MemoryIndex implements Serializable {
|
|||
|
||||
@Override
|
||||
public int docFreq() {
|
||||
return info.sortedTerms[termUpto].getValue().size();
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -29,6 +29,7 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
|
@ -199,7 +200,10 @@ public class FuzzyLikeThisQuery extends Query
|
|||
ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
|
||||
float minScore=0;
|
||||
Term startTerm=internSavingTemplateTerm.createTerm(term);
|
||||
FuzzyTermsEnum fe = new FuzzyTermsEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
|
||||
AttributeSource atts = new AttributeSource();
|
||||
MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
||||
atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class);
|
||||
FuzzyTermsEnum fe = new FuzzyTermsEnum(reader, atts, startTerm, f.minSimilarity, f.prefixLength);
|
||||
//store the df so all variants use same idf
|
||||
int df = reader.docFreq(startTerm);
|
||||
int numVariants=0;
|
||||
|
@ -217,7 +221,7 @@ public class FuzzyLikeThisQuery extends Query
|
|||
variantsQ.insertWithOverflow(st);
|
||||
minScore = variantsQ.top().score; // maintain minScore
|
||||
}
|
||||
boostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
|
||||
maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.lucene.search.MultiTermQuery;
|
|||
import org.apache.lucene.search.FilteredTermsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -60,7 +61,7 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected FilteredTermsEnum getTermsEnum(IndexReader reader) throws IOException {
|
||||
protected FilteredTermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
|
||||
return new RegexTermsEnum(reader, term, regexImpl);
|
||||
}
|
||||
|
||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.lucene.index.TermsEnum;
|
|||
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestRegexQuery extends LuceneTestCase {
|
||||
|
@ -78,7 +79,7 @@ public class TestRegexQuery extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testMatchAll() throws Exception {
|
||||
TermsEnum terms = new RegexQuery(new Term(FN, "jum.")).getTermsEnum(searcher.getIndexReader());
|
||||
TermsEnum terms = new RegexQuery(new Term(FN, "jum.")).getTermsEnum(searcher.getIndexReader(), new AttributeSource() /*dummy*/);
|
||||
// no term should match
|
||||
assertNull(terms.next());
|
||||
}
|
||||
|
|
|
@ -30,6 +30,7 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.FuzzyTermsEnum;
|
||||
import org.apache.lucene.search.MultiTermQuery;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||
|
||||
|
@ -387,7 +388,10 @@ public class DirectSpellChecker {
|
|||
private Collection<ScoreTerm> suggestSimilar(Term term, int numSug,
|
||||
IndexReader ir, int docfreq, int editDistance, float accuracy) throws IOException {
|
||||
|
||||
FuzzyTermsEnum e = new FuzzyTermsEnum(ir, term, editDistance, Math.max(minPrefix, editDistance-1));
|
||||
AttributeSource atts = new AttributeSource();
|
||||
MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
||||
atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class);
|
||||
FuzzyTermsEnum e = new FuzzyTermsEnum(ir, atts, term, editDistance, Math.max(minPrefix, editDistance-1));
|
||||
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
|
||||
|
||||
BytesRef queryTerm = new BytesRef(term.text());
|
||||
|
@ -435,7 +439,7 @@ public class DirectSpellChecker {
|
|||
stQueue.offer(st);
|
||||
// possibly drop entries from queue
|
||||
st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
|
||||
boostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
|
||||
maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
return stQueue;
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
|
@ -85,7 +86,7 @@ public class AutomatonQuery extends MultiTermQuery {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
|
||||
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
|
||||
// matches nothing
|
||||
if (BasicOperations.isEmpty(automaton)) {
|
||||
return TermsEnum.EMPTY;
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.search;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||
|
||||
|
@ -135,11 +136,11 @@ public class FuzzyQuery extends MultiTermQuery {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
|
||||
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
|
||||
if (!termLongEnough) { // can only match if it's exact
|
||||
return new SingleTermsEnum(reader, term);
|
||||
}
|
||||
return new FuzzyTermsEnum(reader, getTerm(), minimumSimilarity, prefixLength);
|
||||
return new FuzzyTermsEnum(reader, atts, getTerm(), minimumSimilarity, prefixLength);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.lucene.index.DocsEnum;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
|
@ -51,7 +52,12 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
private final MultiTermQuery.BoostAttribute boostAtt =
|
||||
attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
|
||||
|
||||
private float bottom = boostAtt.getMaxNonCompetitiveBoost();
|
||||
private final MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt;
|
||||
|
||||
private float bottom;
|
||||
private BytesRef bottomTerm;
|
||||
// nocommit: chicken-and-egg
|
||||
private final Comparator<BytesRef> termComparator = BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
|
||||
private final float minSimilarity;
|
||||
private final float scale_factor;
|
||||
|
@ -82,7 +88,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
* @param prefixLength Length of required common prefix. Default value is 0.
|
||||
* @throws IOException
|
||||
*/
|
||||
public FuzzyTermsEnum(IndexReader reader, Term term,
|
||||
public FuzzyTermsEnum(IndexReader reader, AttributeSource atts, Term term,
|
||||
final float minSimilarity, final int prefixLength) throws IOException {
|
||||
if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity)
|
||||
throw new IllegalArgumentException("fractional edit distances are not allowed");
|
||||
|
@ -116,9 +122,10 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
}
|
||||
this.scale_factor = 1.0f / (1.0f - this.minSimilarity);
|
||||
|
||||
TermsEnum subEnum = getAutomatonEnum(maxEdits, null);
|
||||
setEnum(subEnum != null ? subEnum :
|
||||
new LinearFuzzyTermsEnum());
|
||||
this.maxBoostAtt = atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class);
|
||||
bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
|
||||
bottomTerm = maxBoostAtt.getCompetitiveTerm();
|
||||
bottomChanged(null, true);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -169,19 +176,24 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
* fired when the max non-competitive boost has changed. this is the hook to
|
||||
* swap in a smarter actualEnum
|
||||
*/
|
||||
private void bottomChanged(float boostValue, BytesRef lastTerm)
|
||||
private void bottomChanged(BytesRef lastTerm, boolean init)
|
||||
throws IOException {
|
||||
int oldMaxEdits = maxEdits;
|
||||
|
||||
// true if the last term encountered is lexicographically equal or after the bottom term in the PQ
|
||||
boolean termAfter = bottomTerm == null || (lastTerm != null && termComparator.compare(lastTerm, bottomTerm) >= 0);
|
||||
|
||||
// as long as the max non-competitive boost is >= the max boost
|
||||
// for some edit distance, keep dropping the max edit distance.
|
||||
while (maxEdits > 0 && boostValue >= calculateMaxBoost(maxEdits))
|
||||
while (maxEdits > 0 && (termAfter ? bottom >= calculateMaxBoost(maxEdits) : bottom > calculateMaxBoost(maxEdits)))
|
||||
maxEdits--;
|
||||
|
||||
if (oldMaxEdits != maxEdits) { // the maximum n has changed
|
||||
if (oldMaxEdits != maxEdits || init) { // the maximum n has changed
|
||||
TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm);
|
||||
if (newEnum != null) {
|
||||
setEnum(newEnum);
|
||||
} else if (init) {
|
||||
setEnum(new LinearFuzzyTermsEnum());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -202,16 +214,18 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
@Override
|
||||
public BytesRef next() throws IOException {
|
||||
if (queuedBottom != null) {
|
||||
bottomChanged(bottom, queuedBottom);
|
||||
bottomChanged(queuedBottom, false);
|
||||
queuedBottom = null;
|
||||
}
|
||||
|
||||
BytesRef term = actualEnum.next();
|
||||
boostAtt.setBoost(actualBoostAtt.getBoost());
|
||||
|
||||
final float bottom = boostAtt.getMaxNonCompetitiveBoost();
|
||||
if (bottom != this.bottom && term != null) {
|
||||
final float bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
|
||||
final BytesRef bottomTerm = maxBoostAtt.getCompetitiveTerm();
|
||||
if (term != null && (bottom != this.bottom || bottomTerm != this.bottomTerm)) {
|
||||
this.bottom = bottom;
|
||||
this.bottomTerm = bottomTerm;
|
||||
// clone the term before potentially doing something with it
|
||||
// this is a rare but wonderful occurrence anyway
|
||||
queuedBottom = new BytesRef(term);
|
||||
|
|
|
@ -19,19 +19,30 @@ package org.apache.lucene.search;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.queryParser.QueryParser; // for javadoc
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.queryParser.QueryParser;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Attribute;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
import org.apache.lucene.util.PagedBytes;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.ByteBlockPool;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefHash;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.ReaderUtil;
|
||||
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
|
||||
|
||||
/**
|
||||
* An abstract {@link Query} that matches documents
|
||||
|
@ -39,7 +50,7 @@ import org.apache.lucene.util.PagedBytes;
|
|||
* FilteredTermsEnum} enumeration.
|
||||
*
|
||||
* <p>This query cannot be used directly; you must subclass
|
||||
* it and define {@link #getTermsEnum} to provide a {@link
|
||||
* it and define {@link #getTermsEnum(IndexReader,AttributeSource)} to provide a {@link
|
||||
* FilteredTermsEnum} that iterates through the terms to be
|
||||
* matched.
|
||||
*
|
||||
|
@ -71,34 +82,25 @@ public abstract class MultiTermQuery extends Query {
|
|||
protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
|
||||
transient int numberOfTerms = 0;
|
||||
|
||||
/** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum}
|
||||
/** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum(IndexReader,AttributeSource)}
|
||||
* and update the boost on each returned term. This enables to control the boost factor
|
||||
* for each matching term in {@link #SCORING_BOOLEAN_QUERY_REWRITE} or
|
||||
* {@link TopTermsBooleanQueryRewrite} mode.
|
||||
* {@link FuzzyQuery} is using this to take the edit distance into account.
|
||||
* <p><b>Please note:</b> This attribute is intended to be added only by the TermsEnum
|
||||
* to itsself in its constructor and consumed by the {@link RewriteMethod}.
|
||||
* @lucene.internal
|
||||
*/
|
||||
public static interface BoostAttribute extends Attribute {
|
||||
/** Sets the boost in this attribute */
|
||||
public void setBoost(float boost);
|
||||
/** Retrieves the boost, default is {@code 1.0f}. */
|
||||
public float getBoost();
|
||||
/** Sets the maximum boost for terms that would never get
|
||||
* into the priority queue of {@link MultiTermQuery.TopTermsBooleanQueryRewrite}.
|
||||
* This value is not changed by {@link AttributeImpl#clear}
|
||||
* and not used in {@code equals()} and {@code hashCode()}.
|
||||
* Do not change the value in the {@link TermsEnum}!
|
||||
*/
|
||||
public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost);
|
||||
/** Retrieves the maximum boost that is not competitive,
|
||||
* default is megative infinity. You can use this boost value
|
||||
* as a hint when writing the {@link TermsEnum}.
|
||||
*/
|
||||
public float getMaxNonCompetitiveBoost();
|
||||
}
|
||||
|
||||
/** Implementation class for {@link BoostAttribute}. */
|
||||
public static final class BoostAttributeImpl extends AttributeImpl implements BoostAttribute {
|
||||
private float boost = 1.0f, maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
|
||||
private float boost = 1.0f;
|
||||
|
||||
public void setBoost(float boost) {
|
||||
this.boost = boost;
|
||||
|
@ -107,14 +109,6 @@ public abstract class MultiTermQuery extends Query {
|
|||
public float getBoost() {
|
||||
return boost;
|
||||
}
|
||||
|
||||
public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) {
|
||||
this.maxNonCompetitiveBoost = maxNonCompetitiveBoost;
|
||||
}
|
||||
|
||||
public float getMaxNonCompetitiveBoost() {
|
||||
return maxNonCompetitiveBoost;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
|
@ -141,6 +135,83 @@ public abstract class MultiTermQuery extends Query {
|
|||
}
|
||||
}
|
||||
|
||||
/** Add this {@link Attribute} to a fresh {@link AttributeSource} before calling
|
||||
* {@link #getTermsEnum(IndexReader,AttributeSource)}.
|
||||
* {@link FuzzyQuery} is using this to control its internal behaviour
|
||||
* to only return competitive terms.
|
||||
* <p><b>Please note:</b> This attribute is intended to be added by the {@link RewriteMethod}
|
||||
* to an empty {@link AttributeSource} that is shared for all segments
|
||||
* during query rewrite. This attribute source is passed to all segment enums
|
||||
* on {@link #getTermsEnum(IndexReader,AttributeSource)}.
|
||||
* {@link TopTermsBooleanQueryRewrite} uses this attribute to
|
||||
* inform all enums about the current boost, that is not competitive.
|
||||
* @lucene.internal
|
||||
*/
|
||||
public static interface MaxNonCompetitiveBoostAttribute extends Attribute {
|
||||
/** This is the maximum boost that would not be competitive. */
|
||||
public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost);
|
||||
/** This is the maximum boost that would not be competitive. Default is negative infinity, which means every term is competitive. */
|
||||
public float getMaxNonCompetitiveBoost();
|
||||
/** This is the term or <code>null<code> of the term that triggered the boost change. */
|
||||
public void setCompetitiveTerm(BytesRef competitiveTerm);
|
||||
/** This is the term or <code>null<code> of the term that triggered the boost change. Default is <code>null</code>, which means every term is competitoive. */
|
||||
public BytesRef getCompetitiveTerm();
|
||||
}
|
||||
|
||||
/** Implementation class for {@link MaxNonCompetitiveBoostAttribute}. */
|
||||
public static final class MaxNonCompetitiveBoostAttributeImpl extends AttributeImpl implements MaxNonCompetitiveBoostAttribute {
|
||||
private float maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
|
||||
private BytesRef competitiveTerm = null;
|
||||
|
||||
public void setMaxNonCompetitiveBoost(final float maxNonCompetitiveBoost) {
|
||||
this.maxNonCompetitiveBoost = maxNonCompetitiveBoost;
|
||||
}
|
||||
|
||||
public float getMaxNonCompetitiveBoost() {
|
||||
return maxNonCompetitiveBoost;
|
||||
}
|
||||
|
||||
public void setCompetitiveTerm(final BytesRef competitiveTerm) {
|
||||
this.competitiveTerm = competitiveTerm;
|
||||
}
|
||||
|
||||
public BytesRef getCompetitiveTerm() {
|
||||
return competitiveTerm;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
|
||||
competitiveTerm = null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (this == other)
|
||||
return true;
|
||||
if (other instanceof MaxNonCompetitiveBoostAttributeImpl) {
|
||||
final MaxNonCompetitiveBoostAttributeImpl o = (MaxNonCompetitiveBoostAttributeImpl) other;
|
||||
return (o.maxNonCompetitiveBoost == maxNonCompetitiveBoost)
|
||||
&& (o.competitiveTerm == null ? competitiveTerm == null : o.competitiveTerm.equals(competitiveTerm));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int hash = Float.floatToIntBits(maxNonCompetitiveBoost);
|
||||
if (competitiveTerm != null) hash = 31 * hash + competitiveTerm.hashCode();
|
||||
return hash;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copyTo(AttributeImpl target) {
|
||||
final MaxNonCompetitiveBoostAttributeImpl t = (MaxNonCompetitiveBoostAttributeImpl) target;
|
||||
t.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost);
|
||||
t.setCompetitiveTerm(competitiveTerm);
|
||||
}
|
||||
}
|
||||
|
||||
/** Abstract class that defines how the query is rewritten. */
|
||||
public static abstract class RewriteMethod implements Serializable {
|
||||
public abstract Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException;
|
||||
|
@ -177,69 +248,85 @@ public abstract class MultiTermQuery extends Query {
|
|||
private abstract static class BooleanQueryRewrite extends RewriteMethod {
|
||||
|
||||
protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
|
||||
final Fields fields = MultiFields.getFields(reader);
|
||||
if (fields == null) {
|
||||
// reader has no fields
|
||||
return 0;
|
||||
}
|
||||
|
||||
final Terms terms = fields.terms(query.field);
|
||||
if (terms == null) {
|
||||
// field does not exist
|
||||
return 0;
|
||||
}
|
||||
|
||||
final TermsEnum termsEnum = query.getTermsEnum(reader);
|
||||
assert termsEnum != null;
|
||||
|
||||
if (termsEnum == TermsEnum.EMPTY)
|
||||
return 0;
|
||||
final BoostAttribute boostAtt =
|
||||
termsEnum.attributes().addAttribute(BoostAttribute.class);
|
||||
collector.boostAtt = boostAtt;
|
||||
final List<IndexReader> subReaders = new ArrayList<IndexReader>();
|
||||
ReaderUtil.gatherSubReaders(subReaders, reader);
|
||||
int count = 0;
|
||||
BytesRef bytes;
|
||||
while ((bytes = termsEnum.next()) != null) {
|
||||
if (collector.collect(termsEnum, bytes, boostAtt.getBoost())) {
|
||||
termsEnum.cacheCurrentTerm();
|
||||
count++;
|
||||
} else {
|
||||
break;
|
||||
Comparator<BytesRef> lastTermComp = null;
|
||||
|
||||
for (IndexReader r : subReaders) {
|
||||
final Fields fields = r.fields();
|
||||
if (fields == null) {
|
||||
// reader has no fields
|
||||
continue;
|
||||
}
|
||||
|
||||
final Terms terms = fields.terms(query.field);
|
||||
if (terms == null) {
|
||||
// field does not exist
|
||||
continue;
|
||||
}
|
||||
|
||||
final TermsEnum termsEnum = query.getTermsEnum(r, collector.attributes);
|
||||
assert termsEnum != null;
|
||||
|
||||
if (termsEnum == TermsEnum.EMPTY)
|
||||
continue;
|
||||
|
||||
// Check comparator compatibility:
|
||||
final Comparator<BytesRef> newTermComp = termsEnum.getComparator();
|
||||
if (lastTermComp != null && newTermComp != lastTermComp)
|
||||
throw new RuntimeException("term comparator should not change between segments: "+lastTermComp+" != "+newTermComp);
|
||||
lastTermComp = newTermComp;
|
||||
|
||||
collector.setNextEnum(termsEnum);
|
||||
BytesRef bytes;
|
||||
while ((bytes = termsEnum.next()) != null) {
|
||||
if (collector.collect(bytes)) {
|
||||
termsEnum.cacheCurrentTerm();
|
||||
count++;
|
||||
} else {
|
||||
return count; // interrupt whole term collection, so also don't iterate other subReaders
|
||||
}
|
||||
}
|
||||
}
|
||||
collector.boostAtt = null;
|
||||
return count;
|
||||
}
|
||||
|
||||
protected static abstract class TermCollector {
|
||||
private BoostAttribute boostAtt = null;
|
||||
/** attributes used for communication with the enum */
|
||||
public final AttributeSource attributes = new AttributeSource();
|
||||
|
||||
/** return false to stop collecting */
|
||||
public abstract boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException;
|
||||
public abstract boolean collect(BytesRef bytes) throws IOException;
|
||||
|
||||
/** set the minimum boost as a hint for the term producer */
|
||||
protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) {
|
||||
assert boostAtt != null;
|
||||
boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost);
|
||||
}
|
||||
/** the next segment's {@link TermsEnum} that is used to collect terms */
|
||||
public abstract void setNextEnum(TermsEnum termsEnum) throws IOException;
|
||||
}
|
||||
}
|
||||
|
||||
private static class ScoringBooleanQueryRewrite extends BooleanQueryRewrite {
|
||||
@Override
|
||||
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
|
||||
final BooleanQuery result = new BooleanQuery(true);
|
||||
final ParallelArraysTermCollector col = new ParallelArraysTermCollector();
|
||||
collectTerms(reader, query, col);
|
||||
|
||||
final Term placeholderTerm = new Term(query.field);
|
||||
query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() {
|
||||
@Override
|
||||
public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) {
|
||||
// add new TQ, we must clone the term, else it may get overwritten!
|
||||
TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes)), termsEnum.docFreq());
|
||||
tq.setBoost(query.getBoost() * boost); // set the boost
|
||||
result.add(tq, BooleanClause.Occur.SHOULD); // add to query
|
||||
return true;
|
||||
final BooleanQuery result = new BooleanQuery(true);
|
||||
final int size = col.terms.size();
|
||||
if (size > 0) {
|
||||
final int sort[] = col.terms.sort(col.termsEnum.getComparator());
|
||||
final int[] docFreq = col.array.docFreq;
|
||||
final float[] boost = col.array.boost;
|
||||
for (int i = 0; i < size; i++) {
|
||||
final int pos = sort[i];
|
||||
final Term term = placeholderTerm.createTerm(col.terms.get(pos, new BytesRef()));
|
||||
assert reader.docFreq(term) == docFreq[pos];
|
||||
final TermQuery tq = new TermQuery(term, docFreq[pos]);
|
||||
tq.setBoost(query.getBoost() * boost[pos]);
|
||||
result.add(tq, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
}));
|
||||
}
|
||||
query.incTotalNumberOfTerms(size);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -247,6 +334,75 @@ public abstract class MultiTermQuery extends Query {
|
|||
protected Object readResolve() {
|
||||
return SCORING_BOOLEAN_QUERY_REWRITE;
|
||||
}
|
||||
|
||||
static final class ParallelArraysTermCollector extends TermCollector {
|
||||
final TermFreqBoostByteStart array = new TermFreqBoostByteStart(16);
|
||||
final BytesRefHash terms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array);
|
||||
TermsEnum termsEnum;
|
||||
|
||||
private BoostAttribute boostAtt;
|
||||
|
||||
@Override
|
||||
public void setNextEnum(TermsEnum termsEnum) throws IOException {
|
||||
this.termsEnum = termsEnum;
|
||||
this.boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean collect(BytesRef bytes) {
|
||||
final int e = terms.add(bytes);
|
||||
if (e < 0 ) {
|
||||
// duplicate term: update docFreq
|
||||
final int pos = (-e)-1;
|
||||
array.docFreq[pos] += termsEnum.docFreq();
|
||||
assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums";
|
||||
} else {
|
||||
// new entry: we populate the entry initially
|
||||
array.docFreq[e] = termsEnum.docFreq();
|
||||
array.boost[e] = boostAtt.getBoost();
|
||||
}
|
||||
// if the new entry reaches the max clause count, we exit early
|
||||
if (e >= BooleanQuery.getMaxClauseCount())
|
||||
throw new BooleanQuery.TooManyClauses();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */
|
||||
static final class TermFreqBoostByteStart extends DirectBytesStartArray {
|
||||
int[] docFreq;
|
||||
float[] boost;
|
||||
|
||||
public TermFreqBoostByteStart(int initSize) {
|
||||
super(initSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] init() {
|
||||
final int[] ord = super.init();
|
||||
boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)];
|
||||
docFreq = new int[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_INT)];
|
||||
assert boost.length >= ord.length && docFreq.length >= ord.length;
|
||||
return ord;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] grow() {
|
||||
final int[] ord = super.grow();
|
||||
docFreq = ArrayUtil.grow(docFreq, ord.length);
|
||||
boost = ArrayUtil.grow(boost, ord.length);
|
||||
assert boost.length >= ord.length && docFreq.length >= ord.length;
|
||||
return ord;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] clear() {
|
||||
boost = null;
|
||||
docFreq = null;
|
||||
return super.clear();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/** A rewrite method that first translates each term into
|
||||
|
@ -291,44 +447,92 @@ public abstract class MultiTermQuery extends Query {
|
|||
final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
|
||||
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
|
||||
collectTerms(reader, query, new TermCollector() {
|
||||
private final MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
||||
attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
||||
|
||||
private final Map<BytesRef,ScoreTerm> visitedTerms = new HashMap<BytesRef,ScoreTerm>();
|
||||
|
||||
private TermsEnum termsEnum;
|
||||
private Comparator<BytesRef> termComp;
|
||||
private BoostAttribute boostAtt;
|
||||
private ScoreTerm st;
|
||||
|
||||
@Override
|
||||
public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) {
|
||||
public void setNextEnum(TermsEnum termsEnum) throws IOException {
|
||||
this.termsEnum = termsEnum;
|
||||
this.termComp = termsEnum.getComparator();
|
||||
// lazy init the initial ScoreTerm because comparator is not known on ctor:
|
||||
if (st == null)
|
||||
st = new ScoreTerm(this.termComp);
|
||||
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean collect(BytesRef bytes) {
|
||||
final float boost = boostAtt.getBoost();
|
||||
// ignore uncompetetive hits
|
||||
if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost)
|
||||
return true;
|
||||
// add new entry in PQ, we must clone the term, else it may get overwritten!
|
||||
st.bytes.copy(bytes);
|
||||
st.boost = boost;
|
||||
st.docFreq = termsEnum.docFreq();
|
||||
stQueue.offer(st);
|
||||
// possibly drop entries from queue
|
||||
st = (stQueue.size() > maxSize) ? stQueue.poll() : new ScoreTerm();
|
||||
setMaxNonCompetitiveBoost((stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
|
||||
if (stQueue.size() == maxSize) {
|
||||
final ScoreTerm t = stQueue.peek();
|
||||
if (boost < t.boost)
|
||||
return true;
|
||||
if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
|
||||
return true;
|
||||
}
|
||||
ScoreTerm t = visitedTerms.get(bytes);
|
||||
if (t != null) {
|
||||
// if the term is already in the PQ, only update docFreq of term in PQ
|
||||
t.docFreq += termsEnum.docFreq();
|
||||
assert t.boost == boost : "boost should be equal in all segment TermsEnums";
|
||||
} else {
|
||||
// add new entry in PQ, we must clone the term, else it may get overwritten!
|
||||
st.bytes.copy(bytes);
|
||||
st.boost = boost;
|
||||
st.docFreq = termsEnum.docFreq();
|
||||
visitedTerms.put(st.bytes, st);
|
||||
stQueue.offer(st);
|
||||
// possibly drop entries from queue
|
||||
if (stQueue.size() > maxSize) {
|
||||
st = stQueue.poll();
|
||||
visitedTerms.remove(st.bytes);
|
||||
} else {
|
||||
st = new ScoreTerm(termComp);
|
||||
}
|
||||
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
|
||||
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize
|
||||
if (stQueue.size() == maxSize) {
|
||||
t = stQueue.peek();
|
||||
maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
|
||||
maxBoostAtt.setCompetitiveTerm(t.bytes);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// reusable instance
|
||||
private ScoreTerm st = new ScoreTerm();
|
||||
});
|
||||
|
||||
final Term placeholderTerm = new Term(query.field);
|
||||
final BooleanQuery bq = new BooleanQuery(true);
|
||||
for (final ScoreTerm st : stQueue) {
|
||||
// add new query, we must clone the term, else it may get overwritten!
|
||||
Query tq = getQuery(placeholderTerm.createTerm(st.bytes), st.docFreq);
|
||||
final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
|
||||
Arrays.sort(scoreTerms, new Comparator<ScoreTerm>() {
|
||||
public int compare(ScoreTerm st1, ScoreTerm st2) {
|
||||
assert st1.termComp == st2.termComp :
|
||||
"term comparator should not change between segments";
|
||||
return st1.termComp.compare(st1.bytes, st2.bytes);
|
||||
}
|
||||
});
|
||||
for (final ScoreTerm st : scoreTerms) {
|
||||
final Term term = placeholderTerm.createTerm(st.bytes);
|
||||
assert reader.docFreq(term) == st.docFreq;
|
||||
Query tq = getQuery(term, st.docFreq);
|
||||
tq.setBoost(query.getBoost() * st.boost); // set the boost
|
||||
bq.add(tq, BooleanClause.Occur.SHOULD); // add to query
|
||||
}
|
||||
query.incTotalNumberOfTerms(bq.clauses().size());
|
||||
query.incTotalNumberOfTerms(scoreTerms.length);
|
||||
return bq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 17;
|
||||
int result = 1;
|
||||
result = prime * result + size;
|
||||
return result;
|
||||
return 31 * size;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -341,15 +545,20 @@ public abstract class MultiTermQuery extends Query {
|
|||
return true;
|
||||
}
|
||||
|
||||
private static class ScoreTerm implements Comparable<ScoreTerm> {
|
||||
static final class ScoreTerm implements Comparable<ScoreTerm> {
|
||||
public final Comparator<BytesRef> termComp;
|
||||
|
||||
public final BytesRef bytes = new BytesRef();
|
||||
public float boost;
|
||||
public int docFreq;
|
||||
|
||||
public ScoreTerm(Comparator<BytesRef> termComp) {
|
||||
this.termComp = termComp;
|
||||
}
|
||||
|
||||
public int compareTo(ScoreTerm other) {
|
||||
if (this.boost == other.boost)
|
||||
// TODO: is it OK to use default compare here?
|
||||
return other.bytes.compareTo(this.bytes);
|
||||
return termComp.compare(other.bytes, this.bytes);
|
||||
else
|
||||
return Float.compare(this.boost, other.boost);
|
||||
}
|
||||
|
@ -362,8 +571,8 @@ public abstract class MultiTermQuery extends Query {
|
|||
* scores as computed by the query.
|
||||
*
|
||||
* <p>
|
||||
* This rewrite mode only uses the top scoring terms so it will not overflow
|
||||
* the boolean max clause count. It is the default rewrite mode for
|
||||
* This rewrite method only uses the top scoring terms so it will not overflow
|
||||
* the boolean max clause count. It is the default rewrite method for
|
||||
* {@link FuzzyQuery}.
|
||||
*
|
||||
* @see #setRewriteMethod
|
||||
|
@ -510,63 +719,61 @@ public abstract class MultiTermQuery extends Query {
|
|||
final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
|
||||
final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
|
||||
|
||||
final CutOffTermCollector col = new CutOffTermCollector(reader, query.field, docCountCutoff, termCountLimit);
|
||||
final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
|
||||
collectTerms(reader, query, col);
|
||||
|
||||
final int size = col.pendingTerms.size();
|
||||
if (col.hasCutOff) {
|
||||
return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
|
||||
} else if (col.termCount == 0) {
|
||||
} else if (size == 0) {
|
||||
return new BooleanQuery(true);
|
||||
} else {
|
||||
final PagedBytes.Reader bytesReader = col.pendingTerms.freeze(false);
|
||||
try {
|
||||
final BooleanQuery bq = new BooleanQuery(true);
|
||||
final Term placeholderTerm = new Term(query.field);
|
||||
long start = col.startOffset;
|
||||
for(int i = 0; i < col.termCount; i++) {
|
||||
final BytesRef bytes = new BytesRef();
|
||||
start = bytesReader.fillUsingLengthPrefix3(bytes, start);
|
||||
bq.add(new TermQuery(placeholderTerm.createTerm(bytes)), BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
// Strip scores
|
||||
final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
|
||||
result.setBoost(query.getBoost());
|
||||
query.incTotalNumberOfTerms(col.termCount);
|
||||
return result;
|
||||
} finally {
|
||||
bytesReader.close();
|
||||
final BooleanQuery bq = new BooleanQuery(true);
|
||||
final Term placeholderTerm = new Term(query.field);
|
||||
final BytesRefHash pendingTerms = col.pendingTerms;
|
||||
final int sort[] = pendingTerms.sort(col.termsEnum.getComparator());
|
||||
for(int i = 0; i < size; i++) {
|
||||
// docFreq is not used for constant score here, we pass 1
|
||||
// to explicitely set a fake value, so it's not calculated
|
||||
bq.add(new TermQuery(
|
||||
placeholderTerm.createTerm(pendingTerms.get(sort[i], new BytesRef())), 1
|
||||
), BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
// Strip scores
|
||||
final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
|
||||
result.setBoost(query.getBoost());
|
||||
query.incTotalNumberOfTerms(size);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class CutOffTermCollector extends TermCollector {
|
||||
CutOffTermCollector(IndexReader reader, String field, int docCountCutoff, int termCountLimit) {
|
||||
this.reader = reader;
|
||||
this.field = field;
|
||||
static final class CutOffTermCollector extends TermCollector {
|
||||
CutOffTermCollector(int docCountCutoff, int termCountLimit) {
|
||||
this.docCountCutoff = docCountCutoff;
|
||||
this.termCountLimit = termCountLimit;
|
||||
}
|
||||
|
||||
public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException {
|
||||
termCount++;
|
||||
if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) {
|
||||
@Override
|
||||
public void setNextEnum(TermsEnum termsEnum) throws IOException {
|
||||
this.termsEnum = termsEnum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean collect(BytesRef bytes) throws IOException {
|
||||
if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
|
||||
hasCutOff = true;
|
||||
return false;
|
||||
}
|
||||
pendingTerms.copyUsingLengthPrefix(bytes);
|
||||
pendingTerms.add(bytes);
|
||||
docVisitCount += termsEnum.docFreq();
|
||||
return true;
|
||||
}
|
||||
|
||||
int docVisitCount = 0;
|
||||
boolean hasCutOff = false;
|
||||
int termCount = 0;
|
||||
|
||||
final IndexReader reader;
|
||||
final String field;
|
||||
TermsEnum termsEnum;
|
||||
|
||||
final int docCountCutoff, termCountLimit;
|
||||
final PagedBytes pendingTerms = new PagedBytes(15); // max term size is 32 KiB
|
||||
final long startOffset = pendingTerms.getPointer();
|
||||
final BytesRefHash pendingTerms = new BytesRefHash();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -644,8 +851,20 @@ public abstract class MultiTermQuery extends Query {
|
|||
* field does exist). This method should not return null
|
||||
* (should instead return {@link TermsEnum#EMPTY} if no
|
||||
* terms match). The TermsEnum must already be
|
||||
* positioned to the first matching term. */
|
||||
protected abstract TermsEnum getTermsEnum(IndexReader reader) throws IOException;
|
||||
* positioned to the first matching term.
|
||||
* The given {@link AttributeSource} is passed by the {@link RewriteMethod} to
|
||||
* provide attributes, the rewrite method uses to inform about e.g. maximum competitive boosts.
|
||||
* This is currently only used by {@link TopTermsBooleanQueryRewrite}
|
||||
*/
|
||||
protected abstract TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException;
|
||||
|
||||
/** Convenience method, if no attributes are needed:
|
||||
* This simply passes empty attributes and is equal to:
|
||||
* <code>getTermsEnum(reader, new AttributeSource())</code>
|
||||
*/
|
||||
protected final TermsEnum getTermsEnum(IndexReader reader) throws IOException {
|
||||
return getTermsEnum(reader, new AttributeSource());
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: Return the number of unique terms visited during execution of the query.
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.document.NumericField; // for javadocs
|
|||
import org.apache.lucene.util.NumericUtils;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
|
||||
|
@ -301,7 +302,7 @@ public final class NumericRangeQuery<T extends Number> extends MultiTermQuery {
|
|||
}
|
||||
|
||||
@Override @SuppressWarnings("unchecked")
|
||||
protected TermsEnum getTermsEnum(final IndexReader reader) throws IOException {
|
||||
protected TermsEnum getTermsEnum(final IndexReader reader, AttributeSource atts) throws IOException {
|
||||
// very strange: java.lang.Number itsself is not Comparable, but all subclasses used here are
|
||||
return (min != null && max != null && ((Comparable<T>) min).compareTo(max) > 0) ?
|
||||
TermsEnum.EMPTY :
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.index.TermsEnum;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
|
||||
/** A Query that matches documents containing terms with a specified prefix. A PrefixQuery
|
||||
|
@ -45,7 +46,7 @@ public class PrefixQuery extends MultiTermQuery {
|
|||
public Term getPrefix() { return prefix; }
|
||||
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
|
||||
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
|
||||
if (prefix.bytes().length == 0) {
|
||||
// no prefix -- match all terms for this field:
|
||||
// NOTE: for now, MultiTermQuery enums terms at the
|
||||
|
|
|
@ -33,7 +33,7 @@ import org.apache.lucene.index.IndexReader;
|
|||
* once per day.
|
||||
*/
|
||||
public class QueryWrapperFilter extends Filter {
|
||||
private Query query;
|
||||
private final Query query;
|
||||
|
||||
/** Constructs a filter which only matches documents matching
|
||||
* <code>query</code>.
|
||||
|
@ -41,6 +41,11 @@ public class QueryWrapperFilter extends Filter {
|
|||
public QueryWrapperFilter(Query query) {
|
||||
this.query = query;
|
||||
}
|
||||
|
||||
/** returns the inner Query */
|
||||
public final Query getQuery() {
|
||||
return query;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
|
||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.index.TermsEnum;
|
|||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/**
|
||||
* A Query that matches documents within an range of terms.
|
||||
|
@ -130,7 +131,7 @@ public class TermRangeQuery extends MultiTermQuery {
|
|||
public Collator getCollator() { return collator; }
|
||||
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
|
||||
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
|
||||
if (collator == null && lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) {
|
||||
return TermsEnum.EMPTY;
|
||||
}
|
||||
|
|
|
@ -27,14 +27,7 @@ import java.util.Collection;
|
|||
|
||||
public final class ArrayUtil {
|
||||
|
||||
/**
|
||||
* @deprecated This constructor was not intended to be public and should not be used.
|
||||
* This class contains solely a static utility methods.
|
||||
* It will be made private in Lucene 4.0
|
||||
*/
|
||||
// make private in 4.0!
|
||||
@Deprecated
|
||||
public ArrayUtil() {} // no instance
|
||||
private ArrayUtil() {} // no instance
|
||||
|
||||
/*
|
||||
Begin Apache Harmony code
|
||||
|
@ -247,6 +240,19 @@ public final class ArrayUtil {
|
|||
public static short[] grow(short[] array) {
|
||||
return grow(array, 1 + array.length);
|
||||
}
|
||||
|
||||
public static float[] grow(float[] array, int minSize) {
|
||||
if (array.length < minSize) {
|
||||
float[] newArray = new float[oversize(minSize, RamUsageEstimator.NUM_BYTES_FLOAT)];
|
||||
System.arraycopy(array, 0, newArray, 0, array.length);
|
||||
return newArray;
|
||||
} else
|
||||
return array;
|
||||
}
|
||||
|
||||
public static float[] grow(float[] array) {
|
||||
return grow(array, 1 + array.length);
|
||||
}
|
||||
|
||||
public static short[] shrink(short[] array, int targetSize) {
|
||||
final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_SHORT);
|
||||
|
|
|
@ -16,8 +16,12 @@ package org.apache.lucene.util;
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF;
|
||||
|
||||
/* Class that Posting and PostingVector use to write byte
|
||||
/**
|
||||
* Class that Posting and PostingVector use to write byte
|
||||
* streams into shared fixed-size byte[] arrays. The idea
|
||||
* is to allocate slices of increasing lengths For
|
||||
* example, the first slice is 5 bytes, the next slice is
|
||||
|
@ -31,14 +35,10 @@ package org.apache.lucene.util;
|
|||
* the end with a non-zero byte. This way the methods
|
||||
* that are writing into the slice don't need to record
|
||||
* its length and instead allocate a new slice once they
|
||||
* hit a non-zero byte. */
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
|
||||
import java.util.List;
|
||||
import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF;
|
||||
|
||||
* hit a non-zero byte.
|
||||
*
|
||||
* @lucene.internal
|
||||
**/
|
||||
public final class ByteBlockPool {
|
||||
public final static int BYTE_BLOCK_SHIFT = 15;
|
||||
public final static int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT;
|
||||
|
@ -62,6 +62,22 @@ public final class ByteBlockPool {
|
|||
return new byte[blockSize];
|
||||
}
|
||||
}
|
||||
|
||||
public static final class DirectAllocator extends Allocator {
|
||||
|
||||
public DirectAllocator() {
|
||||
this(BYTE_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
public DirectAllocator(int blockSize) {
|
||||
super(blockSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void recycleByteBlocks(byte[][] blocks, int start, int end) {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public byte[][] buffers = new byte[10][];
|
||||
|
||||
|
|
|
@ -17,13 +17,15 @@ package org.apache.lucene.util;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK;
|
||||
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SHIFT;
|
||||
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK;
|
||||
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
|
||||
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SHIFT;
|
||||
import org.apache.lucene.util.ByteBlockPool.DirectAllocator;
|
||||
|
||||
/**
|
||||
* {@link BytesRefHash} is a special purpose hash-map like data-structure
|
||||
|
@ -54,6 +56,14 @@ public final class BytesRefHash {
|
|||
public static final int DEFAULT_CAPACITY = 16;
|
||||
private final BytesStartArray bytesStartArray;
|
||||
private AtomicLong bytesUsed;
|
||||
|
||||
/**
|
||||
* Creates a new {@link BytesRefHash} with a {@link ByteBlockPool} using a
|
||||
* {@link DirectAllocator}.
|
||||
*/
|
||||
public BytesRefHash() {
|
||||
this(new ByteBlockPool(new DirectAllocator()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new {@link BytesRefHash}
|
||||
|
@ -75,7 +85,7 @@ public final class BytesRefHash {
|
|||
Arrays.fill(ords, -1);
|
||||
this.bytesStartArray = bytesStartArray;
|
||||
bytesStart = bytesStartArray.init();
|
||||
bytesUsed = bytesStartArray.bytesUsed();
|
||||
bytesUsed = bytesStartArray.bytesUsed() == null? new AtomicLong(0) : bytesStartArray.bytesUsed();;
|
||||
bytesUsed.addAndGet(hashSize * RamUsageEstimator.NUM_BYTES_INT);
|
||||
}
|
||||
|
||||
|
@ -143,7 +153,6 @@ public final class BytesRefHash {
|
|||
* the {@link Comparator} used for sorting
|
||||
*/
|
||||
public int[] sort(Comparator<BytesRef> comp) {
|
||||
assert bytesStart != null : "Bytesstart is null - not initialized";
|
||||
final int[] compact = compact();
|
||||
quickSort(comp, compact, 0, count - 1);
|
||||
return compact;
|
||||
|
@ -536,13 +545,13 @@ public final class BytesRefHash {
|
|||
public abstract AtomicLong bytesUsed();
|
||||
}
|
||||
|
||||
static class DirectBytesStartArray extends BytesStartArray {
|
||||
public static class DirectBytesStartArray extends BytesStartArray {
|
||||
|
||||
private final int initSize;
|
||||
protected final int initSize;
|
||||
private int[] bytesStart;
|
||||
private final AtomicLong bytesUsed = new AtomicLong(0);
|
||||
|
||||
DirectBytesStartArray(int initSize) {
|
||||
public DirectBytesStartArray(int initSize) {
|
||||
this.initSize = initSize;
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,186 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class TestMultiTermQueryRewrites extends LuceneTestCase {
|
||||
|
||||
static Directory dir, sdir1, sdir2;
|
||||
static IndexReader reader, multiReader;
|
||||
static IndexSearcher searcher, multiSearcher;
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
dir = newDirectory();
|
||||
sdir1 = newDirectory();
|
||||
sdir2 = newDirectory();
|
||||
final RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockAnalyzer());
|
||||
final RandomIndexWriter swriter1 = new RandomIndexWriter(random, sdir1, new MockAnalyzer());
|
||||
final RandomIndexWriter swriter2 = new RandomIndexWriter(random, sdir2, new MockAnalyzer());
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(newField("data", Integer.toString(i), Field.Store.NO, Field.Index.NOT_ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
((i % 2 == 0) ? swriter1 : swriter2).addDocument(doc);
|
||||
}
|
||||
writer.optimize(); swriter1.optimize(); swriter2.optimize();
|
||||
writer.close(); swriter1.close(); swriter2.close();
|
||||
|
||||
reader = IndexReader.open(dir, true);
|
||||
searcher = new IndexSearcher(reader);
|
||||
multiReader = new MultiReader(new IndexReader[] {
|
||||
IndexReader.open(sdir1, true), IndexReader.open(sdir2, true)
|
||||
}, true);
|
||||
multiSearcher = new IndexSearcher(multiReader);
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void afterClass() throws Exception {
|
||||
reader.close();
|
||||
multiReader.close();
|
||||
dir.close(); sdir1.close(); sdir2.close();
|
||||
reader = multiReader = null;
|
||||
searcher = multiSearcher = null;
|
||||
dir = sdir1 = sdir2 = null;
|
||||
}
|
||||
|
||||
private Query extractInnerQuery(Query q) {
|
||||
if (q instanceof ConstantScoreQuery) {
|
||||
// wrapped as ConstantScoreQuery using QueryWrapperFilter
|
||||
q = ((QueryWrapperFilter) ((ConstantScoreQuery) q).getFilter()).getQuery();
|
||||
}
|
||||
return q;
|
||||
}
|
||||
|
||||
private Term extractTerm(Query q) {
|
||||
q = extractInnerQuery(q);
|
||||
return ((TermQuery) q).getTerm();
|
||||
}
|
||||
|
||||
private void checkBooleanQueryOrder(Query q) {
|
||||
q = extractInnerQuery(q);
|
||||
final BooleanQuery bq = (BooleanQuery) q;
|
||||
Term last = null, act;
|
||||
for (BooleanClause clause : bq.clauses()) {
|
||||
act = extractTerm(clause.getQuery());
|
||||
if (last != null) {
|
||||
assertTrue("sort order of terms in BQ violated", last.compareTo(act) < 0);
|
||||
}
|
||||
last = act;
|
||||
}
|
||||
}
|
||||
|
||||
private void checkDuplicateTerms(MultiTermQuery.RewriteMethod method) throws Exception {
|
||||
final MultiTermQuery mtq = new TermRangeQuery("data", "2", "7", true, true);
|
||||
mtq.setRewriteMethod(method);
|
||||
final Query q1 = searcher.rewrite(mtq);
|
||||
final Query q2 = multiSearcher.rewrite(mtq);
|
||||
if (VERBOSE) {
|
||||
System.out.println();
|
||||
System.out.println("single segment: " + q1);
|
||||
System.out.println(" multi segment: " + q2);
|
||||
}
|
||||
assertEquals("The multi-segment case must produce same rewritten query", q1, q2);
|
||||
checkBooleanQueryOrder(q1);
|
||||
checkBooleanQueryOrder(q2);
|
||||
}
|
||||
|
||||
public void testRewritesWithDuplicateTerms() throws Exception {
|
||||
checkDuplicateTerms(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
|
||||
|
||||
checkDuplicateTerms(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
|
||||
|
||||
// use a large PQ here to only test duplicate terms and dont mix up when all scores are equal
|
||||
checkDuplicateTerms(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(1024));
|
||||
checkDuplicateTerms(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(1024));
|
||||
|
||||
// Test auto rewrite (but only boolean mode), so we set the limits to large values to always get a BQ
|
||||
final MultiTermQuery.ConstantScoreAutoRewrite rewrite = new MultiTermQuery.ConstantScoreAutoRewrite();
|
||||
rewrite.setTermCountCutoff(Integer.MAX_VALUE);
|
||||
rewrite.setDocCountPercent(100.);
|
||||
checkDuplicateTerms(rewrite);
|
||||
}
|
||||
|
||||
private void checkBooleanQueryBoosts(BooleanQuery bq) {
|
||||
for (BooleanClause clause : bq.clauses()) {
|
||||
final TermQuery mtq = (TermQuery) clause.getQuery();
|
||||
assertEquals("Parallel sorting of boosts in rewrite mode broken",
|
||||
Float.parseFloat(mtq.getTerm().text()), mtq.getBoost());
|
||||
}
|
||||
}
|
||||
|
||||
private void checkBoosts(MultiTermQuery.RewriteMethod method) throws Exception {
|
||||
final MultiTermQuery mtq = new MultiTermQuery("data") {
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
|
||||
return new TermRangeTermsEnum(reader, field, "2", "7", true, true, null) {
|
||||
final MultiTermQuery.BoostAttribute boostAtt =
|
||||
attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
|
||||
|
||||
@Override
|
||||
protected AcceptStatus accept(BytesRef term) {
|
||||
boostAtt.setBoost(Float.parseFloat(term.utf8ToString()));
|
||||
return super.accept(term);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
return "dummy";
|
||||
}
|
||||
};
|
||||
mtq.setRewriteMethod(method);
|
||||
final Query q1 = searcher.rewrite(mtq);
|
||||
final Query q2 = multiSearcher.rewrite(mtq);
|
||||
if (VERBOSE) {
|
||||
System.out.println();
|
||||
System.out.println("single segment: " + q1);
|
||||
System.out.println(" multi segment: " + q2);
|
||||
}
|
||||
assertEquals("The multi-segment case must produce same rewritten query", q1, q2);
|
||||
checkBooleanQueryBoosts((BooleanQuery) q1);
|
||||
checkBooleanQueryBoosts((BooleanQuery) q2);
|
||||
}
|
||||
|
||||
public void testBoosts() throws Exception {
|
||||
checkBoosts(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
|
||||
|
||||
// use a large PQ here to only test boosts and dont mix up when all scores are equal
|
||||
checkBoosts(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(1024));
|
||||
}
|
||||
|
||||
}
|
|
@ -29,6 +29,7 @@ import org.apache.lucene.index.TermsEnum;
|
|||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.codecs.CodecProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
@ -85,7 +86,7 @@ public class TestPrefixRandom extends LuceneTestCase {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
|
||||
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
|
||||
return new SimplePrefixTermsEnum(reader, field, prefix);
|
||||
}
|
||||
|
||||
|
|
|
@ -36,6 +36,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.AutomatonTestUtil;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
@ -103,7 +104,7 @@ public class TestRegexpRandom2 extends LuceneTestCase {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
|
||||
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
|
||||
return new SimpleAutomatonTermsEnum(reader, field);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue