From dd1c7a858538b2716d3844bcb8f6edbe2e702cb3 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Fri, 15 Oct 2010 14:25:48 +0000 Subject: [PATCH] LUCENE-2690: MultiTermQuery boolean rewrites per segment git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1022934 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 9 +- lucene/MIGRATE.txt | 8 + .../lucene/index/memory/MemoryIndex.java | 2 +- .../lucene/search/FuzzyLikeThisQuery.java | 8 +- .../lucene/search/regex/RegexQuery.java | 3 +- .../lucene/search/regex/TestRegexQuery.java | 3 +- .../search/spell/DirectSpellChecker.java | 8 +- .../apache/lucene/search/AutomatonQuery.java | 3 +- .../org/apache/lucene/search/FuzzyQuery.java | 5 +- .../apache/lucene/search/FuzzyTermsEnum.java | 36 +- .../apache/lucene/search/MultiTermQuery.java | 493 +++++++++++++----- .../lucene/search/NumericRangeQuery.java | 3 +- .../org/apache/lucene/search/PrefixQuery.java | 3 +- .../lucene/search/QueryWrapperFilter.java | 7 +- .../apache/lucene/search/TermRangeQuery.java | 3 +- .../org/apache/lucene/util/ArrayUtil.java | 22 +- .../org/apache/lucene/util/ByteBlockPool.java | 34 +- .../org/apache/lucene/util/BytesRefHash.java | 25 +- .../search/TestMultiTermQueryRewrites.java | 186 +++++++ .../lucene/search/TestPrefixRandom.java | 3 +- .../lucene/search/TestRegexpRandom2.java | 3 +- 21 files changed, 676 insertions(+), 191 deletions(-) create mode 100644 lucene/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b22cc2190e5..35700af4477 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -147,8 +147,10 @@ API Changes you also override this method on upgrade. (Robert Muir, Mike McCandless) -* LUCENE-2691: IndexWriter.getReader() has been made package local and is now exposed via open and reopen methods on - IndexReader. The semantics of the call is the same as it was prior to the API change. 
(Grant Ingersoll, Mike McCandless) +* LUCENE-2691: IndexWriter.getReader() has been made package local and is now + exposed via open and reopen methods on IndexReader. The semantics of the + call is the same as it was prior to the API change. + (Grant Ingersoll, Mike McCandless) New features @@ -265,6 +267,9 @@ New features * LUCENE-2692: Added several new SpanQuery classes for positional checking (match is in a range, payload is a specific value) (Grant Ingersoll) +* LUCENE-2690: MultiTermQuery boolean rewrites per segment. + (Uwe Schindler, Robert Muir, Mike McCandless) + Optimizations * LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching. diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index 3e4d166a657..592ed610139 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -316,3 +316,11 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing * LUCENE-2691: The near-real-time API has moved from IndexWriter to IndexReader. Instead of IndexWriter.getReader(), call IndexReader.open(IndexWriter) or IndexReader.reopen(IndexWriter). + +* LUCENE-2690: MultiTermQuery boolean rewrites per segment. + Also MultiTermQuery.getTermsEnum() now takes an AttributeSource. FuzzyTermsEnum + is both consumer and producer of attributes: MTQ.BoostAttribute is + added to the FuzzyTermsEnum and MTQ's rewrite mode consumes it. + The other way round MTQ.TopTermsBooleanQueryRewrite supplies a + global AttributeSource to each segment's TermsEnum. The TermsEnum is consumer + and gets the current minimum competitive boosts (MTQ.MaxNonCompetitiveBoostAttribute). 
diff --git a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 37e95004652..11dd692ff83 100644 --- a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -886,7 +886,7 @@ public class MemoryIndex implements Serializable { @Override public int docFreq() { - return info.sortedTerms[termUpto].getValue().size(); + return 1; } @Override diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java index 9b0f72b3439..5b4996a51a3 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java @@ -29,6 +29,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; @@ -199,7 +200,10 @@ public class FuzzyLikeThisQuery extends Query ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term float minScore=0; Term startTerm=internSavingTemplateTerm.createTerm(term); - FuzzyTermsEnum fe = new FuzzyTermsEnum(reader, startTerm, f.minSimilarity, f.prefixLength); + AttributeSource atts = new AttributeSource(); + MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt = + atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class); + FuzzyTermsEnum fe = new FuzzyTermsEnum(reader, atts, startTerm, f.minSimilarity, f.prefixLength); //store the df so all variants use same idf int df = 
reader.docFreq(startTerm); int numVariants=0; @@ -217,7 +221,7 @@ public class FuzzyLikeThisQuery extends Query variantsQ.insertWithOverflow(st); minScore = variantsQ.top().score; // maintain minScore } - boostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY); + maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY); } } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java index 61e26c73ada..950beded5f7 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java @@ -21,6 +21,7 @@ import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.FilteredTermsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; @@ -60,7 +61,7 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable { } @Override - protected FilteredTermsEnum getTermsEnum(IndexReader reader) throws IOException { + protected FilteredTermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException { return new RegexTermsEnum(reader, term, regexImpl); } diff --git a/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java b/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java index 0b03d38d369..512db84025a 100644 --- a/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java +++ b/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java @@ -28,6 +28,7 @@ import org.apache.lucene.index.TermsEnum; import 
org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.LuceneTestCase; public class TestRegexQuery extends LuceneTestCase { @@ -78,7 +79,7 @@ public class TestRegexQuery extends LuceneTestCase { } public void testMatchAll() throws Exception { - TermsEnum terms = new RegexQuery(new Term(FN, "jum.")).getTermsEnum(searcher.getIndexReader()); + TermsEnum terms = new RegexQuery(new Term(FN, "jum.")).getTermsEnum(searcher.getIndexReader(), new AttributeSource() /*dummy*/); // no term should match assertNull(terms.next()); } diff --git a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java index 3e4f8ee0306..147d269edb8 100644 --- a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java +++ b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java @@ -30,6 +30,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.FuzzyTermsEnum; import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.LevenshteinAutomata; @@ -387,7 +388,10 @@ public class DirectSpellChecker { private Collection suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy) throws IOException { - FuzzyTermsEnum e = new FuzzyTermsEnum(ir, term, editDistance, Math.max(minPrefix, editDistance-1)); + AttributeSource atts = new AttributeSource(); + MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt = + atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class); + FuzzyTermsEnum e = new FuzzyTermsEnum(ir, atts, term, editDistance, Math.max(minPrefix, 
editDistance-1)); final PriorityQueue stQueue = new PriorityQueue(); BytesRef queryTerm = new BytesRef(term.text()); @@ -435,7 +439,7 @@ public class DirectSpellChecker { stQueue.offer(st); // possibly drop entries from queue st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm(); - boostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); + maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); } return stQueue; diff --git a/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java b/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java index 0d2d5ee2b20..e67230b6a5c 100644 --- a/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java +++ b/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; @@ -85,7 +86,7 @@ public class AutomatonQuery extends MultiTermQuery { } @Override - protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException { // matches nothing if (BasicOperations.isEmpty(automaton)) { return TermsEnum.EMPTY; diff --git a/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java b/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java index f559728b759..b153dd9e719 100644 --- a/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java +++ b/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java @@ -20,6 +20,7 @@ package org.apache.lucene.search; import org.apache.lucene.index.IndexReader; import 
org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.automaton.LevenshteinAutomata; @@ -135,11 +136,11 @@ public class FuzzyQuery extends MultiTermQuery { } @Override - protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException { if (!termLongEnough) { // can only match if it's exact return new SingleTermsEnum(reader, term); } - return new FuzzyTermsEnum(reader, getTerm(), minimumSimilarity, prefixLength); + return new FuzzyTermsEnum(reader, atts, getTerm(), minimumSimilarity, prefixLength); } /** diff --git a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java index ee1ad083a25..49861a0f2f4 100644 --- a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -22,6 +22,7 @@ import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; @@ -51,7 +52,12 @@ public final class FuzzyTermsEnum extends TermsEnum { private final MultiTermQuery.BoostAttribute boostAtt = attributes().addAttribute(MultiTermQuery.BoostAttribute.class); - private float bottom = boostAtt.getMaxNonCompetitiveBoost(); + private final MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt; + + private float bottom; + private BytesRef bottomTerm; + // nocommit: chicken-and-egg + private final Comparator termComparator = BytesRef.getUTF8SortedAsUnicodeComparator(); private final float minSimilarity; private final float scale_factor; @@ -82,7 
+88,7 @@ public final class FuzzyTermsEnum extends TermsEnum { * @param prefixLength Length of required common prefix. Default value is 0. * @throws IOException */ - public FuzzyTermsEnum(IndexReader reader, Term term, + public FuzzyTermsEnum(IndexReader reader, AttributeSource atts, Term term, final float minSimilarity, final int prefixLength) throws IOException { if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity) throw new IllegalArgumentException("fractional edit distances are not allowed"); @@ -116,9 +122,10 @@ public final class FuzzyTermsEnum extends TermsEnum { } this.scale_factor = 1.0f / (1.0f - this.minSimilarity); - TermsEnum subEnum = getAutomatonEnum(maxEdits, null); - setEnum(subEnum != null ? subEnum : - new LinearFuzzyTermsEnum()); + this.maxBoostAtt = atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class); + bottom = maxBoostAtt.getMaxNonCompetitiveBoost(); + bottomTerm = maxBoostAtt.getCompetitiveTerm(); + bottomChanged(null, true); } /** @@ -169,19 +176,24 @@ public final class FuzzyTermsEnum extends TermsEnum { * fired when the max non-competitive boost has changed. this is the hook to * swap in a smarter actualEnum */ - private void bottomChanged(float boostValue, BytesRef lastTerm) + private void bottomChanged(BytesRef lastTerm, boolean init) throws IOException { int oldMaxEdits = maxEdits; + // true if the last term encountered is lexicographically equal or after the bottom term in the PQ + boolean termAfter = bottomTerm == null || (lastTerm != null && termComparator.compare(lastTerm, bottomTerm) >= 0); + // as long as the max non-competitive boost is >= the max boost // for some edit distance, keep dropping the max edit distance. - while (maxEdits > 0 && boostValue >= calculateMaxBoost(maxEdits)) + while (maxEdits > 0 && (termAfter ? 
bottom >= calculateMaxBoost(maxEdits) : bottom > calculateMaxBoost(maxEdits))) maxEdits--; - if (oldMaxEdits != maxEdits) { // the maximum n has changed + if (oldMaxEdits != maxEdits || init) { // the maximum n has changed TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm); if (newEnum != null) { setEnum(newEnum); + } else if (init) { + setEnum(new LinearFuzzyTermsEnum()); } } } @@ -202,16 +214,18 @@ public final class FuzzyTermsEnum extends TermsEnum { @Override public BytesRef next() throws IOException { if (queuedBottom != null) { - bottomChanged(bottom, queuedBottom); + bottomChanged(queuedBottom, false); queuedBottom = null; } BytesRef term = actualEnum.next(); boostAtt.setBoost(actualBoostAtt.getBoost()); - final float bottom = boostAtt.getMaxNonCompetitiveBoost(); - if (bottom != this.bottom && term != null) { + final float bottom = maxBoostAtt.getMaxNonCompetitiveBoost(); + final BytesRef bottomTerm = maxBoostAtt.getCompetitiveTerm(); + if (term != null && (bottom != this.bottom || bottomTerm != this.bottomTerm)) { this.bottom = bottom; + this.bottomTerm = bottomTerm; // clone the term before potentially doing something with it // this is a rare but wonderful occurrence anyway queuedBottom = new BytesRef(term); diff --git a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java index ac8b713a739..2af261b39af 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java @@ -19,19 +19,30 @@ package org.apache.lucene.search; import java.io.IOException; import java.io.Serializable; +import java.util.Arrays; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.PriorityQueue; +import java.util.Comparator; +import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import 
org.apache.lucene.util.BytesRef; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.index.Fields; import org.apache.lucene.index.Terms; -import org.apache.lucene.queryParser.QueryParser; // for javadoc +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; -import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.ReaderUtil; +import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; /** * An abstract {@link Query} that matches documents @@ -39,7 +50,7 @@ import org.apache.lucene.util.PagedBytes; * FilteredTermsEnum} enumeration. * *

This query cannot be used directly; you must subclass - * it and define {@link #getTermsEnum} to provide a {@link + * it and define {@link #getTermsEnum(IndexReader,AttributeSource)} to provide a {@link * FilteredTermsEnum} that iterates through the terms to be * matched. * @@ -71,34 +82,25 @@ public abstract class MultiTermQuery extends Query { protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; transient int numberOfTerms = 0; - /** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum} + /** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum(IndexReader,AttributeSource)} * and update the boost on each returned term. This enables to control the boost factor * for each matching term in {@link #SCORING_BOOLEAN_QUERY_REWRITE} or * {@link TopTermsBooleanQueryRewrite} mode. * {@link FuzzyQuery} is using this to take the edit distance into account. + *

Please note: This attribute is intended to be added only by the TermsEnum + * to itself in its constructor and consumed by the {@link RewriteMethod}. + * @lucene.internal */ public static interface BoostAttribute extends Attribute { /** Sets the boost in this attribute */ public void setBoost(float boost); /** Retrieves the boost, default is {@code 1.0f}. */ public float getBoost(); - /** Sets the maximum boost for terms that would never get - * into the priority queue of {@link MultiTermQuery.TopTermsBooleanQueryRewrite}. - * This value is not changed by {@link AttributeImpl#clear} - * and not used in {@code equals()} and {@code hashCode()}. - * Do not change the value in the {@link TermsEnum}! - */ - public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost); - /** Retrieves the maximum boost that is not competitive, - * default is megative infinity. You can use this boost value - * as a hint when writing the {@link TermsEnum}. - */ - public float getMaxNonCompetitiveBoost(); } /** Implementation class for {@link BoostAttribute}. */ public static final class BoostAttributeImpl extends AttributeImpl implements BoostAttribute { - private float boost = 1.0f, maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY; + private float boost = 1.0f; public void setBoost(float boost) { this.boost = boost; @@ -107,14 +109,6 @@ public abstract class MultiTermQuery extends Query { public float getBoost() { return boost; } - - public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { - this.maxNonCompetitiveBoost = maxNonCompetitiveBoost; - } - - public float getMaxNonCompetitiveBoost() { - return maxNonCompetitiveBoost; - } @Override public void clear() { @@ -141,6 +135,83 @@ public abstract class MultiTermQuery extends Query { } } + /** Add this {@link Attribute} to a fresh {@link AttributeSource} before calling + * {@link #getTermsEnum(IndexReader,AttributeSource)}. 
+ * {@link FuzzyQuery} is using this to control its internal behaviour + * to only return competitive terms. + *

Please note: This attribute is intended to be added by the {@link RewriteMethod} + * to an empty {@link AttributeSource} that is shared for all segments + * during query rewrite. This attribute source is passed to all segment enums + * on {@link #getTermsEnum(IndexReader,AttributeSource)}. + * {@link TopTermsBooleanQueryRewrite} uses this attribute to + * inform all enums about the current boost, that is not competitive. + * @lucene.internal + */ + public static interface MaxNonCompetitiveBoostAttribute extends Attribute { + /** This is the maximum boost that would not be competitive. */ + public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost); + /** This is the maximum boost that would not be competitive. Default is negative infinity, which means every term is competitive. */ + public float getMaxNonCompetitiveBoost(); + /** This is the term (or null) that triggered the boost change. */ + public void setCompetitiveTerm(BytesRef competitiveTerm); + /** This is the term (or null) that triggered the boost change. Default is null, which means every term is competitive. */ + public BytesRef getCompetitiveTerm(); + } + + /** Implementation class for {@link MaxNonCompetitiveBoostAttribute}. 
*/ + public static final class MaxNonCompetitiveBoostAttributeImpl extends AttributeImpl implements MaxNonCompetitiveBoostAttribute { + private float maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY; + private BytesRef competitiveTerm = null; + + public void setMaxNonCompetitiveBoost(final float maxNonCompetitiveBoost) { + this.maxNonCompetitiveBoost = maxNonCompetitiveBoost; + } + + public float getMaxNonCompetitiveBoost() { + return maxNonCompetitiveBoost; + } + + public void setCompetitiveTerm(final BytesRef competitiveTerm) { + this.competitiveTerm = competitiveTerm; + } + + public BytesRef getCompetitiveTerm() { + return competitiveTerm; + } + + @Override + public void clear() { + maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY; + competitiveTerm = null; + } + + @Override + public boolean equals(Object other) { + if (this == other) + return true; + if (other instanceof MaxNonCompetitiveBoostAttributeImpl) { + final MaxNonCompetitiveBoostAttributeImpl o = (MaxNonCompetitiveBoostAttributeImpl) other; + return (o.maxNonCompetitiveBoost == maxNonCompetitiveBoost) + && (o.competitiveTerm == null ? competitiveTerm == null : o.competitiveTerm.equals(competitiveTerm)); + } + return false; + } + + @Override + public int hashCode() { + int hash = Float.floatToIntBits(maxNonCompetitiveBoost); + if (competitiveTerm != null) hash = 31 * hash + competitiveTerm.hashCode(); + return hash; + } + + @Override + public void copyTo(AttributeImpl target) { + final MaxNonCompetitiveBoostAttributeImpl t = (MaxNonCompetitiveBoostAttributeImpl) target; + t.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost); + t.setCompetitiveTerm(competitiveTerm); + } + } + /** Abstract class that defines how the query is rewritten. 
*/ public static abstract class RewriteMethod implements Serializable { public abstract Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException; @@ -177,69 +248,85 @@ public abstract class MultiTermQuery extends Query { private abstract static class BooleanQueryRewrite extends RewriteMethod { protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { - final Fields fields = MultiFields.getFields(reader); - if (fields == null) { - // reader has no fields - return 0; - } - - final Terms terms = fields.terms(query.field); - if (terms == null) { - // field does not exist - return 0; - } - - final TermsEnum termsEnum = query.getTermsEnum(reader); - assert termsEnum != null; - - if (termsEnum == TermsEnum.EMPTY) - return 0; - final BoostAttribute boostAtt = - termsEnum.attributes().addAttribute(BoostAttribute.class); - collector.boostAtt = boostAtt; + final List subReaders = new ArrayList(); + ReaderUtil.gatherSubReaders(subReaders, reader); int count = 0; - BytesRef bytes; - while ((bytes = termsEnum.next()) != null) { - if (collector.collect(termsEnum, bytes, boostAtt.getBoost())) { - termsEnum.cacheCurrentTerm(); - count++; - } else { - break; + Comparator lastTermComp = null; + + for (IndexReader r : subReaders) { + final Fields fields = r.fields(); + if (fields == null) { + // reader has no fields + continue; + } + + final Terms terms = fields.terms(query.field); + if (terms == null) { + // field does not exist + continue; + } + + final TermsEnum termsEnum = query.getTermsEnum(r, collector.attributes); + assert termsEnum != null; + + if (termsEnum == TermsEnum.EMPTY) + continue; + + // Check comparator compatibility: + final Comparator newTermComp = termsEnum.getComparator(); + if (lastTermComp != null && newTermComp != lastTermComp) + throw new RuntimeException("term comparator should not change between segments: "+lastTermComp+" != "+newTermComp); + lastTermComp = newTermComp; + + 
collector.setNextEnum(termsEnum); + BytesRef bytes; + while ((bytes = termsEnum.next()) != null) { + if (collector.collect(bytes)) { + termsEnum.cacheCurrentTerm(); + count++; + } else { + return count; // interrupt whole term collection, so also don't iterate other subReaders + } } } - collector.boostAtt = null; return count; } protected static abstract class TermCollector { - private BoostAttribute boostAtt = null; + /** attributes used for communication with the enum */ + public final AttributeSource attributes = new AttributeSource(); /** return false to stop collecting */ - public abstract boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException; + public abstract boolean collect(BytesRef bytes) throws IOException; - /** set the minimum boost as a hint for the term producer */ - protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { - assert boostAtt != null; - boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost); - } + /** the next segment's {@link TermsEnum} that is used to collect terms */ + public abstract void setNextEnum(TermsEnum termsEnum) throws IOException; } } private static class ScoringBooleanQueryRewrite extends BooleanQueryRewrite { @Override public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { - final BooleanQuery result = new BooleanQuery(true); + final ParallelArraysTermCollector col = new ParallelArraysTermCollector(); + collectTerms(reader, query, col); + final Term placeholderTerm = new Term(query.field); - query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() { - @Override - public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) { - // add new TQ, we must clone the term, else it may get overwritten! 
- TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes)), termsEnum.docFreq()); - tq.setBoost(query.getBoost() * boost); // set the boost - result.add(tq, BooleanClause.Occur.SHOULD); // add to query - return true; + final BooleanQuery result = new BooleanQuery(true); + final int size = col.terms.size(); + if (size > 0) { + final int sort[] = col.terms.sort(col.termsEnum.getComparator()); + final int[] docFreq = col.array.docFreq; + final float[] boost = col.array.boost; + for (int i = 0; i < size; i++) { + final int pos = sort[i]; + final Term term = placeholderTerm.createTerm(col.terms.get(pos, new BytesRef())); + assert reader.docFreq(term) == docFreq[pos]; + final TermQuery tq = new TermQuery(term, docFreq[pos]); + tq.setBoost(query.getBoost() * boost[pos]); + result.add(tq, BooleanClause.Occur.SHOULD); } - })); + } + query.incTotalNumberOfTerms(size); return result; } @@ -247,6 +334,75 @@ public abstract class MultiTermQuery extends Query { protected Object readResolve() { return SCORING_BOOLEAN_QUERY_REWRITE; } + + static final class ParallelArraysTermCollector extends TermCollector { + final TermFreqBoostByteStart array = new TermFreqBoostByteStart(16); + final BytesRefHash terms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array); + TermsEnum termsEnum; + + private BoostAttribute boostAtt; + + @Override + public void setNextEnum(TermsEnum termsEnum) throws IOException { + this.termsEnum = termsEnum; + this.boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); + } + + @Override + public boolean collect(BytesRef bytes) { + final int e = terms.add(bytes); + if (e < 0 ) { + // duplicate term: update docFreq + final int pos = (-e)-1; + array.docFreq[pos] += termsEnum.docFreq(); + assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums"; + } else { + // new entry: we populate the entry initially + array.docFreq[e] = termsEnum.docFreq(); + 
array.boost[e] = boostAtt.getBoost(); + } + // if the new entry reaches the max clause count, we exit early + if (e >= BooleanQuery.getMaxClauseCount()) + throw new BooleanQuery.TooManyClauses(); + return true; + } + } + + /** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */ + static final class TermFreqBoostByteStart extends DirectBytesStartArray { + int[] docFreq; + float[] boost; + + public TermFreqBoostByteStart(int initSize) { + super(initSize); + } + + @Override + public int[] init() { + final int[] ord = super.init(); + boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)]; + docFreq = new int[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_INT)]; + assert boost.length >= ord.length && docFreq.length >= ord.length; + return ord; + } + + @Override + public int[] grow() { + final int[] ord = super.grow(); + docFreq = ArrayUtil.grow(docFreq, ord.length); + boost = ArrayUtil.grow(boost, ord.length); + assert boost.length >= ord.length && docFreq.length >= ord.length; + return ord; + } + + @Override + public int[] clear() { + boost = null; + docFreq = null; + return super.clear(); + } + + } } /** A rewrite method that first translates each term into @@ -291,44 +447,92 @@ public abstract class MultiTermQuery extends Query { final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount()); final PriorityQueue stQueue = new PriorityQueue(); collectTerms(reader, query, new TermCollector() { + private final MaxNonCompetitiveBoostAttribute maxBoostAtt = + attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class); + + private final Map visitedTerms = new HashMap(); + + private TermsEnum termsEnum; + private Comparator termComp; + private BoostAttribute boostAtt; + private ScoreTerm st; + @Override - public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) { + public void setNextEnum(TermsEnum termsEnum) throws IOException { + this.termsEnum = termsEnum; + 
this.termComp = termsEnum.getComparator(); + // lazy init the initial ScoreTerm because comparator is not known on ctor: + if (st == null) + st = new ScoreTerm(this.termComp); + boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); + } + + @Override + public boolean collect(BytesRef bytes) { + final float boost = boostAtt.getBoost(); // ignore uncompetetive hits - if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost) - return true; - // add new entry in PQ, we must clone the term, else it may get overwritten! - st.bytes.copy(bytes); - st.boost = boost; - st.docFreq = termsEnum.docFreq(); - stQueue.offer(st); - // possibly drop entries from queue - st = (stQueue.size() > maxSize) ? stQueue.poll() : new ScoreTerm(); - setMaxNonCompetitiveBoost((stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); + if (stQueue.size() == maxSize) { + final ScoreTerm t = stQueue.peek(); + if (boost < t.boost) + return true; + if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0) + return true; + } + ScoreTerm t = visitedTerms.get(bytes); + if (t != null) { + // if the term is already in the PQ, only update docFreq of term in PQ + t.docFreq += termsEnum.docFreq(); + assert t.boost == boost : "boost should be equal in all segment TermsEnums"; + } else { + // add new entry in PQ, we must clone the term, else it may get overwritten! 
+ st.bytes.copy(bytes); + st.boost = boost; + st.docFreq = termsEnum.docFreq(); + visitedTerms.put(st.bytes, st); + stQueue.offer(st); + // possibly drop entries from queue + if (stQueue.size() > maxSize) { + st = stQueue.poll(); + visitedTerms.remove(st.bytes); + } else { + st = new ScoreTerm(termComp); + } + assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize"; + // set maxBoostAtt with values to help FuzzyTermsEnum to optimize + if (stQueue.size() == maxSize) { + t = stQueue.peek(); + maxBoostAtt.setMaxNonCompetitiveBoost(t.boost); + maxBoostAtt.setCompetitiveTerm(t.bytes); + } + } return true; } - - // reusable instance - private ScoreTerm st = new ScoreTerm(); }); final Term placeholderTerm = new Term(query.field); final BooleanQuery bq = new BooleanQuery(true); - for (final ScoreTerm st : stQueue) { - // add new query, we must clone the term, else it may get overwritten! - Query tq = getQuery(placeholderTerm.createTerm(st.bytes), st.docFreq); + final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]); + Arrays.sort(scoreTerms, new Comparator() { + public int compare(ScoreTerm st1, ScoreTerm st2) { + assert st1.termComp == st2.termComp : + "term comparator should not change between segments"; + return st1.termComp.compare(st1.bytes, st2.bytes); + } + }); + for (final ScoreTerm st : scoreTerms) { + final Term term = placeholderTerm.createTerm(st.bytes); + assert reader.docFreq(term) == st.docFreq; + Query tq = getQuery(term, st.docFreq); tq.setBoost(query.getBoost() * st.boost); // set the boost bq.add(tq, BooleanClause.Occur.SHOULD); // add to query } - query.incTotalNumberOfTerms(bq.clauses().size()); + query.incTotalNumberOfTerms(scoreTerms.length); return bq; } @Override public int hashCode() { - final int prime = 17; - int result = 1; - result = prime * result + size; - return result; + return 31 * size; } @Override @@ -341,15 +545,20 @@ public abstract class MultiTermQuery extends Query { return true; } - 
private static class ScoreTerm implements Comparable { + static final class ScoreTerm implements Comparable { + public final Comparator termComp; + public final BytesRef bytes = new BytesRef(); public float boost; public int docFreq; + public ScoreTerm(Comparator termComp) { + this.termComp = termComp; + } + public int compareTo(ScoreTerm other) { if (this.boost == other.boost) - // TODO: is it OK to use default compare here? - return other.bytes.compareTo(this.bytes); + return termComp.compare(other.bytes, this.bytes); else return Float.compare(this.boost, other.boost); } @@ -362,8 +571,8 @@ public abstract class MultiTermQuery extends Query { * scores as computed by the query. * *

- * This rewrite mode only uses the top scoring terms so it will not overflow - * the boolean max clause count. It is the default rewrite mode for + * This rewrite method only uses the top scoring terms so it will not overflow + * the boolean max clause count. It is the default rewrite method for * {@link FuzzyQuery}. * * @see #setRewriteMethod @@ -510,63 +719,61 @@ public abstract class MultiTermQuery extends Query { final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc()); final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff); - final CutOffTermCollector col = new CutOffTermCollector(reader, query.field, docCountCutoff, termCountLimit); + final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit); collectTerms(reader, query, col); - + final int size = col.pendingTerms.size(); if (col.hasCutOff) { return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query); - } else if (col.termCount == 0) { + } else if (size == 0) { return new BooleanQuery(true); } else { - final PagedBytes.Reader bytesReader = col.pendingTerms.freeze(false); - try { - final BooleanQuery bq = new BooleanQuery(true); - final Term placeholderTerm = new Term(query.field); - long start = col.startOffset; - for(int i = 0; i < col.termCount; i++) { - final BytesRef bytes = new BytesRef(); - start = bytesReader.fillUsingLengthPrefix3(bytes, start); - bq.add(new TermQuery(placeholderTerm.createTerm(bytes)), BooleanClause.Occur.SHOULD); - } - // Strip scores - final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); - result.setBoost(query.getBoost()); - query.incTotalNumberOfTerms(col.termCount); - return result; - } finally { - bytesReader.close(); + final BooleanQuery bq = new BooleanQuery(true); + final Term placeholderTerm = new Term(query.field); + final BytesRefHash pendingTerms = col.pendingTerms; + final int sort[] = pendingTerms.sort(col.termsEnum.getComparator()); + for(int i = 0; i < size; i++) { 
+ // docFreq is not used for constant score here, we pass 1 + // to explicitely set a fake value, so it's not calculated + bq.add(new TermQuery( + placeholderTerm.createTerm(pendingTerms.get(sort[i], new BytesRef())), 1 + ), BooleanClause.Occur.SHOULD); } + // Strip scores + final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); + result.setBoost(query.getBoost()); + query.incTotalNumberOfTerms(size); + return result; } } - private static final class CutOffTermCollector extends TermCollector { - CutOffTermCollector(IndexReader reader, String field, int docCountCutoff, int termCountLimit) { - this.reader = reader; - this.field = field; + static final class CutOffTermCollector extends TermCollector { + CutOffTermCollector(int docCountCutoff, int termCountLimit) { this.docCountCutoff = docCountCutoff; this.termCountLimit = termCountLimit; } - public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException { - termCount++; - if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) { + @Override + public void setNextEnum(TermsEnum termsEnum) throws IOException { + this.termsEnum = termsEnum; + } + + @Override + public boolean collect(BytesRef bytes) throws IOException { + if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { hasCutOff = true; return false; } - pendingTerms.copyUsingLengthPrefix(bytes); + pendingTerms.add(bytes); docVisitCount += termsEnum.docFreq(); return true; } int docVisitCount = 0; boolean hasCutOff = false; - int termCount = 0; - - final IndexReader reader; - final String field; + TermsEnum termsEnum; + final int docCountCutoff, termCountLimit; - final PagedBytes pendingTerms = new PagedBytes(15); // max term size is 32 KiB - final long startOffset = pendingTerms.getPointer(); + final BytesRefHash pendingTerms = new BytesRefHash(); } @Override @@ -644,8 +851,20 @@ public abstract class MultiTermQuery extends Query { * field does exist). 
This method should not return null * (should instead return {@link TermsEnum#EMPTY} if no * terms match). The TermsEnum must already be - * positioned to the first matching term. */ - protected abstract TermsEnum getTermsEnum(IndexReader reader) throws IOException; + * positioned to the first matching term. + * The given {@link AttributeSource} is passed by the {@link RewriteMethod} to + * provide attributes, the rewrite method uses to inform about e.g. maximum competitive boosts. + * This is currently only used by {@link TopTermsBooleanQueryRewrite} + */ + protected abstract TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException; + + /** Convenience method, if no attributes are needed: + * This simply passes empty attributes and is equal to: + * getTermsEnum(reader, new AttributeSource()) + */ + protected final TermsEnum getTermsEnum(IndexReader reader) throws IOException { + return getTermsEnum(reader, new AttributeSource()); + } /** * Expert: Return the number of unique terms visited during execution of the query. 
diff --git a/lucene/src/java/org/apache/lucene/search/NumericRangeQuery.java b/lucene/src/java/org/apache/lucene/search/NumericRangeQuery.java index 89eff58fb6b..ad5d631a607 100644 --- a/lucene/src/java/org/apache/lucene/search/NumericRangeQuery.java +++ b/lucene/src/java/org/apache/lucene/search/NumericRangeQuery.java @@ -26,6 +26,7 @@ import org.apache.lucene.document.NumericField; // for javadocs import org.apache.lucene.util.NumericUtils; import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.index.TermsEnum; @@ -301,7 +302,7 @@ public final class NumericRangeQuery extends MultiTermQuery { } @Override @SuppressWarnings("unchecked") - protected TermsEnum getTermsEnum(final IndexReader reader) throws IOException { + protected TermsEnum getTermsEnum(final IndexReader reader, AttributeSource atts) throws IOException { // very strange: java.lang.Number itsself is not Comparable, but all subclasses used here are return (min != null && max != null && ((Comparable) min).compareTo(max) > 0) ? TermsEnum.EMPTY : diff --git a/lucene/src/java/org/apache/lucene/search/PrefixQuery.java b/lucene/src/java/org/apache/lucene/search/PrefixQuery.java index e3b08bd2050..03caee8ab10 100644 --- a/lucene/src/java/org/apache/lucene/search/PrefixQuery.java +++ b/lucene/src/java/org/apache/lucene/search/PrefixQuery.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.MultiFields; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.ToStringUtils; /** A Query that matches documents containing terms with a specified prefix. 
A PrefixQuery @@ -45,7 +46,7 @@ public class PrefixQuery extends MultiTermQuery { public Term getPrefix() { return prefix; } @Override - protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException { if (prefix.bytes().length == 0) { // no prefix -- match all terms for this field: // NOTE: for now, MultiTermQuery enums terms at the diff --git a/lucene/src/java/org/apache/lucene/search/QueryWrapperFilter.java b/lucene/src/java/org/apache/lucene/search/QueryWrapperFilter.java index 7001dd4a732..3aa6d4d2245 100644 --- a/lucene/src/java/org/apache/lucene/search/QueryWrapperFilter.java +++ b/lucene/src/java/org/apache/lucene/search/QueryWrapperFilter.java @@ -33,7 +33,7 @@ import org.apache.lucene.index.IndexReader; * once per day. */ public class QueryWrapperFilter extends Filter { - private Query query; + private final Query query; /** Constructs a filter which only matches documents matching * query. @@ -41,6 +41,11 @@ public class QueryWrapperFilter extends Filter { public QueryWrapperFilter(Query query) { this.query = query; } + + /** returns the inner Query */ + public final Query getQuery() { + return query; + } @Override public DocIdSet getDocIdSet(final IndexReader reader) throws IOException { diff --git a/lucene/src/java/org/apache/lucene/search/TermRangeQuery.java b/lucene/src/java/org/apache/lucene/search/TermRangeQuery.java index 394145a1a48..9e4a01b367f 100644 --- a/lucene/src/java/org/apache/lucene/search/TermRangeQuery.java +++ b/lucene/src/java/org/apache/lucene/search/TermRangeQuery.java @@ -25,6 +25,7 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.Terms; import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.AttributeSource; /** * A Query that matches documents within an range of terms. 
@@ -130,7 +131,7 @@ public class TermRangeQuery extends MultiTermQuery { public Collator getCollator() { return collator; } @Override - protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException { if (collator == null && lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) { return TermsEnum.EMPTY; } diff --git a/lucene/src/java/org/apache/lucene/util/ArrayUtil.java b/lucene/src/java/org/apache/lucene/util/ArrayUtil.java index 151ae1a95db..8797ca1df00 100644 --- a/lucene/src/java/org/apache/lucene/util/ArrayUtil.java +++ b/lucene/src/java/org/apache/lucene/util/ArrayUtil.java @@ -27,14 +27,7 @@ import java.util.Collection; public final class ArrayUtil { - /** - * @deprecated This constructor was not intended to be public and should not be used. - * This class contains solely a static utility methods. - * It will be made private in Lucene 4.0 - */ - // make private in 4.0! 
- @Deprecated - public ArrayUtil() {} // no instance + private ArrayUtil() {} // no instance /* Begin Apache Harmony code @@ -247,6 +240,19 @@ public final class ArrayUtil { public static short[] grow(short[] array) { return grow(array, 1 + array.length); } + + public static float[] grow(float[] array, int minSize) { + if (array.length < minSize) { + float[] newArray = new float[oversize(minSize, RamUsageEstimator.NUM_BYTES_FLOAT)]; + System.arraycopy(array, 0, newArray, 0, array.length); + return newArray; + } else + return array; + } + + public static float[] grow(float[] array) { + return grow(array, 1 + array.length); + } public static short[] shrink(short[] array, int targetSize) { final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_SHORT); diff --git a/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java b/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java index a34e9fe7a3c..552340eca6f 100644 --- a/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java +++ b/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java @@ -16,8 +16,12 @@ package org.apache.lucene.util; * See the License for the specific language governing permissions and * limitations under the License. */ +import java.util.Arrays; +import java.util.List; +import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF; -/* Class that Posting and PostingVector use to write byte +/** + * Class that Posting and PostingVector use to write byte * streams into shared fixed-size byte[] arrays. The idea * is to allocate slices of increasing lengths For * example, the first slice is 5 bytes, the next slice is @@ -31,14 +35,10 @@ package org.apache.lucene.util; * the end with a non-zero byte. This way the methods * that are writing into the slice don't need to record * its length and instead allocate a new slice once they - * hit a non-zero byte. 
*/ - -import java.util.Arrays; - - -import java.util.List; -import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF; - + * hit a non-zero byte. + * + * @lucene.internal + **/ public final class ByteBlockPool { public final static int BYTE_BLOCK_SHIFT = 15; public final static int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT; @@ -62,6 +62,22 @@ public final class ByteBlockPool { return new byte[blockSize]; } } + + public static final class DirectAllocator extends Allocator { + + public DirectAllocator() { + this(BYTE_BLOCK_SIZE); + } + + public DirectAllocator(int blockSize) { + super(blockSize); + } + + @Override + public void recycleByteBlocks(byte[][] blocks, int start, int end) { + } + + } public byte[][] buffers = new byte[10][]; diff --git a/lucene/src/java/org/apache/lucene/util/BytesRefHash.java b/lucene/src/java/org/apache/lucene/util/BytesRefHash.java index 0976c8c61a0..a5940edc764 100644 --- a/lucene/src/java/org/apache/lucene/util/BytesRefHash.java +++ b/lucene/src/java/org/apache/lucene/util/BytesRefHash.java @@ -17,13 +17,15 @@ package org.apache.lucene.util; * limitations under the License. 
*/ +import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK; +import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SHIFT; +import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE; + import java.util.Arrays; import java.util.Comparator; import java.util.concurrent.atomic.AtomicLong; -import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK; -import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE; -import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SHIFT; +import org.apache.lucene.util.ByteBlockPool.DirectAllocator; /** * {@link BytesRefHash} is a special purpose hash-map like data-structure @@ -54,6 +56,14 @@ public final class BytesRefHash { public static final int DEFAULT_CAPACITY = 16; private final BytesStartArray bytesStartArray; private AtomicLong bytesUsed; + + /** + * Creates a new {@link BytesRefHash} with a {@link ByteBlockPool} using a + * {@link DirectAllocator}. + */ + public BytesRefHash() { + this(new ByteBlockPool(new DirectAllocator())); + } /** * Creates a new {@link BytesRefHash} @@ -75,7 +85,7 @@ public final class BytesRefHash { Arrays.fill(ords, -1); this.bytesStartArray = bytesStartArray; bytesStart = bytesStartArray.init(); - bytesUsed = bytesStartArray.bytesUsed(); + bytesUsed = bytesStartArray.bytesUsed() == null? 
new AtomicLong(0) : bytesStartArray.bytesUsed();; bytesUsed.addAndGet(hashSize * RamUsageEstimator.NUM_BYTES_INT); } @@ -143,7 +153,6 @@ public final class BytesRefHash { * the {@link Comparator} used for sorting */ public int[] sort(Comparator comp) { - assert bytesStart != null : "Bytesstart is null - not initialized"; final int[] compact = compact(); quickSort(comp, compact, 0, count - 1); return compact; @@ -536,13 +545,13 @@ public final class BytesRefHash { public abstract AtomicLong bytesUsed(); } - static class DirectBytesStartArray extends BytesStartArray { + public static class DirectBytesStartArray extends BytesStartArray { - private final int initSize; + protected final int initSize; private int[] bytesStart; private final AtomicLong bytesUsed = new AtomicLong(0); - DirectBytesStartArray(int initSize) { + public DirectBytesStartArray(int initSize) { this.initSize = initSize; } diff --git a/lucene/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java b/lucene/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java new file mode 100644 index 00000000000..8b9f5c94d06 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java @@ -0,0 +1,186 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +import java.io.IOException; + +public class TestMultiTermQueryRewrites extends LuceneTestCase { + + static Directory dir, sdir1, sdir2; + static IndexReader reader, multiReader; + static IndexSearcher searcher, multiSearcher; + + @BeforeClass + public static void beforeClass() throws Exception { + dir = newDirectory(); + sdir1 = newDirectory(); + sdir2 = newDirectory(); + final RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockAnalyzer()); + final RandomIndexWriter swriter1 = new RandomIndexWriter(random, sdir1, new MockAnalyzer()); + final RandomIndexWriter swriter2 = new RandomIndexWriter(random, sdir2, new MockAnalyzer()); + + for (int i = 0; i < 10; i++) { + Document doc = new Document(); + doc.add(newField("data", Integer.toString(i), Field.Store.NO, Field.Index.NOT_ANALYZED)); + writer.addDocument(doc); + ((i % 2 == 0) ? 
swriter1 : swriter2).addDocument(doc); + } + writer.optimize(); swriter1.optimize(); swriter2.optimize(); + writer.close(); swriter1.close(); swriter2.close(); + + reader = IndexReader.open(dir, true); + searcher = new IndexSearcher(reader); + multiReader = new MultiReader(new IndexReader[] { + IndexReader.open(sdir1, true), IndexReader.open(sdir2, true) + }, true); + multiSearcher = new IndexSearcher(multiReader); + } + + @AfterClass + public static void afterClass() throws Exception { + reader.close(); + multiReader.close(); + dir.close(); sdir1.close(); sdir2.close(); + reader = multiReader = null; + searcher = multiSearcher = null; + dir = sdir1 = sdir2 = null; + } + + private Query extractInnerQuery(Query q) { + if (q instanceof ConstantScoreQuery) { + // wrapped as ConstantScoreQuery using QueryWrapperFilter + q = ((QueryWrapperFilter) ((ConstantScoreQuery) q).getFilter()).getQuery(); + } + return q; + } + + private Term extractTerm(Query q) { + q = extractInnerQuery(q); + return ((TermQuery) q).getTerm(); + } + + private void checkBooleanQueryOrder(Query q) { + q = extractInnerQuery(q); + final BooleanQuery bq = (BooleanQuery) q; + Term last = null, act; + for (BooleanClause clause : bq.clauses()) { + act = extractTerm(clause.getQuery()); + if (last != null) { + assertTrue("sort order of terms in BQ violated", last.compareTo(act) < 0); + } + last = act; + } + } + + private void checkDuplicateTerms(MultiTermQuery.RewriteMethod method) throws Exception { + final MultiTermQuery mtq = new TermRangeQuery("data", "2", "7", true, true); + mtq.setRewriteMethod(method); + final Query q1 = searcher.rewrite(mtq); + final Query q2 = multiSearcher.rewrite(mtq); + if (VERBOSE) { + System.out.println(); + System.out.println("single segment: " + q1); + System.out.println(" multi segment: " + q2); + } + assertEquals("The multi-segment case must produce same rewritten query", q1, q2); + checkBooleanQueryOrder(q1); + checkBooleanQueryOrder(q2); + } + + public void 
testRewritesWithDuplicateTerms() throws Exception { + checkDuplicateTerms(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + + checkDuplicateTerms(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); + + // use a large PQ here to only test duplicate terms and dont mix up when all scores are equal + checkDuplicateTerms(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(1024)); + checkDuplicateTerms(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(1024)); + + // Test auto rewrite (but only boolean mode), so we set the limits to large values to always get a BQ + final MultiTermQuery.ConstantScoreAutoRewrite rewrite = new MultiTermQuery.ConstantScoreAutoRewrite(); + rewrite.setTermCountCutoff(Integer.MAX_VALUE); + rewrite.setDocCountPercent(100.); + checkDuplicateTerms(rewrite); + } + + private void checkBooleanQueryBoosts(BooleanQuery bq) { + for (BooleanClause clause : bq.clauses()) { + final TermQuery mtq = (TermQuery) clause.getQuery(); + assertEquals("Parallel sorting of boosts in rewrite mode broken", + Float.parseFloat(mtq.getTerm().text()), mtq.getBoost()); + } + } + + private void checkBoosts(MultiTermQuery.RewriteMethod method) throws Exception { + final MultiTermQuery mtq = new MultiTermQuery("data") { + @Override + protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException { + return new TermRangeTermsEnum(reader, field, "2", "7", true, true, null) { + final MultiTermQuery.BoostAttribute boostAtt = + attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + + @Override + protected AcceptStatus accept(BytesRef term) { + boostAtt.setBoost(Float.parseFloat(term.utf8ToString())); + return super.accept(term); + } + }; + } + + @Override + public String toString(String field) { + return "dummy"; + } + }; + mtq.setRewriteMethod(method); + final Query q1 = searcher.rewrite(mtq); + final Query q2 = multiSearcher.rewrite(mtq); + if (VERBOSE) { + System.out.println(); + System.out.println("single segment: " + q1); + 
System.out.println(" multi segment: " + q2); + } + assertEquals("The multi-segment case must produce same rewritten query", q1, q2); + checkBooleanQueryBoosts((BooleanQuery) q1); + checkBooleanQueryBoosts((BooleanQuery) q2); + } + + public void testBoosts() throws Exception { + checkBoosts(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + + // use a large PQ here to only test boosts and dont mix up when all scores are equal + checkBoosts(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(1024)); + } + +} diff --git a/lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java b/lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java index edf912e1c24..1c402ead66c 100644 --- a/lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java +++ b/lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java @@ -29,6 +29,7 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -85,7 +86,7 @@ public class TestPrefixRandom extends LuceneTestCase { } @Override - protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException { return new SimplePrefixTermsEnum(reader, field, prefix); } diff --git a/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java index 836ab46b84d..bce9eedb061 100644 --- a/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java +++ b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java @@ -36,6 +36,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.UnicodeUtil; 
import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.AutomatonTestUtil; import org.apache.lucene.util.automaton.CharacterRunAutomaton; @@ -103,7 +104,7 @@ public class TestRegexpRandom2 extends LuceneTestCase { } @Override - protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { + protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException { return new SimpleAutomatonTermsEnum(reader, field); }