From ac71ebc23716010a7397d98da0f87a1f8e29bca5 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Sun, 14 Nov 2010 23:13:46 +0000 Subject: [PATCH] LUCENE-2754, LUCENE-2757: Added a wrapper around MultiTermQueries to add span support: SpanMultiTermQueryWrapper. Using this wrapper its easy to add fuzzy/wildcard to e.g. a SpanNearQuery. This patch also refactors all RewriteMethods and Attributes in MTQ (class was unmaintainable). git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1035096 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 + lucene/contrib/CHANGES.txt | 4 + .../lucene/search/FuzzyLikeThisQuery.java | 8 +- .../lucene/search/regex/SpanRegexQuery.java | 106 +-- .../search/regex/TestSpanRegexQuery.java | 64 +- .../search/spell/DirectSpellChecker.java | 11 +- .../apache/lucene/search/BoostAttribute.java | 38 + .../lucene/search/BoostAttributeImpl.java | 60 ++ .../search/ConstantScoreAutoRewrite.java | 186 +++++ .../apache/lucene/search/FuzzyTermsEnum.java | 21 +- .../MaxNonCompetitiveBoostAttribute.java | 45 ++ .../MaxNonCompetitiveBoostAttributeImpl.java | 78 ++ .../apache/lucene/search/MultiTermQuery.java | 687 ++---------------- .../apache/lucene/search/ScoringRewrite.java | 203 ++++++ .../lucene/search/TermCollectingRewrite.java | 93 +++ .../apache/lucene/search/TopTermsRewrite.java | 182 +++++ .../spans/SpanMultiTermQueryWrapper.java | 234 ++++++ .../lucene/search/spans/SpanOrQuery.java | 18 +- .../search/TestMultiTermQueryRewrites.java | 4 +- .../spans/TestSpanMultiTermQueryWrapper.java | 92 +++ 20 files changed, 1376 insertions(+), 762 deletions(-) create mode 100644 lucene/src/java/org/apache/lucene/search/BoostAttribute.java create mode 100644 lucene/src/java/org/apache/lucene/search/BoostAttributeImpl.java create mode 100644 lucene/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java create mode 100644 lucene/src/java/org/apache/lucene/search/MaxNonCompetitiveBoostAttribute.java create mode 100644 
lucene/src/java/org/apache/lucene/search/MaxNonCompetitiveBoostAttributeImpl.java create mode 100644 lucene/src/java/org/apache/lucene/search/ScoringRewrite.java create mode 100644 lucene/src/java/org/apache/lucene/search/TermCollectingRewrite.java create mode 100644 lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java create mode 100644 lucene/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java create mode 100644 lucene/src/test/org/apache/lucene/search/spans/TestSpanMultiTermQueryWrapper.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 404785995fb..fdde3ce1aed 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -716,6 +716,10 @@ New features * LUCENE-2671: Add SortField.setMissingValue( v ) to enable sorting behavior for documents that do not include the given field. (ryan) +* LUCENE-2754, LUCENE-2757: Added a wrapper around MultiTermQueries + to add span support: SpanMultiTermQueryWrapper. + Using this wrapper it's easy to add fuzzy/wildcard to e.g. a SpanNearQuery. + (Robert Muir, Uwe Schindler) Optimizations diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 63b253af23d..aa0dbf0e78f 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -177,6 +177,10 @@ API Changes QueryNodeProcessorPipeline now implements the List interface, this is useful if you want to extend or modify an existing pipeline. (Adriano Crestani via Robert Muir) + * LUCENE-2754, LUCENE-2757: Deprecated SpanRegexQuery. Use + new SpanMultiTermQueryWrapper(new RegexQuery()) instead. + (Robert Muir, Uwe Schindler) + New features * LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser. 
diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java index 5b4996a51a3..9f834f9b5a4 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java @@ -201,16 +201,16 @@ public class FuzzyLikeThisQuery extends Query float minScore=0; Term startTerm=internSavingTemplateTerm.createTerm(term); AttributeSource atts = new AttributeSource(); - MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt = - atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class); + MaxNonCompetitiveBoostAttribute maxBoostAtt = + atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); FuzzyTermsEnum fe = new FuzzyTermsEnum(reader, atts, startTerm, f.minSimilarity, f.prefixLength); //store the df so all variants use same idf int df = reader.docFreq(startTerm); int numVariants=0; int totalVariantDocFreqs=0; BytesRef possibleMatch; - MultiTermQuery.BoostAttribute boostAtt = - fe.attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + BoostAttribute boostAtt = + fe.attributes().addAttribute(BoostAttribute.class); while ((possibleMatch = fe.next()) != null) { if (possibleMatch!=null) { numVariants++; diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java index aed0521dfc0..818a9497deb 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java @@ -18,115 +18,29 @@ package org.apache.lucene.search.regex; */ import org.apache.lucene.index.Term; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.MultiTermQuery; -import org.apache.lucene.search.Query; -import 
org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.spans.SpanOrQuery; -import org.apache.lucene.search.spans.SpanQuery; -import org.apache.lucene.search.spans.SpanTermQuery; -import org.apache.lucene.search.spans.Spans; -import org.apache.lucene.util.ToStringUtils; - -import java.io.IOException; -import java.util.Collection; -import java.util.ArrayList; +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; /** * A SpanQuery version of {@link RegexQuery} allowing regular expression * queries to be nested within other SpanQuery subclasses. + * @deprecated Use new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery()) instead. + * This query will be removed in Lucene 4.0 */ -public class SpanRegexQuery extends SpanQuery implements RegexQueryCapable { - private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities(); - private Term term; +@Deprecated +public class SpanRegexQuery extends SpanMultiTermQueryWrapper implements RegexQueryCapable { + private final RegexCapabilities regexImpl = new JavaUtilRegexCapabilities(); public SpanRegexQuery(Term term) { - this.term = term; + super(new RegexQuery(term)); } - public Term getTerm() { return term; } - - @Override - public Query rewrite(IndexReader reader) throws IOException { - RegexQuery orig = new RegexQuery(term); - orig.setRegexImplementation(regexImpl); - orig.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); - BooleanQuery bq = (BooleanQuery) orig.rewrite(reader); - - BooleanClause[] clauses = bq.getClauses(); - SpanQuery[] sqs = new SpanQuery[clauses.length]; - for (int i = 0; i < clauses.length; i++) { - BooleanClause clause = clauses[i]; - - // Clauses from RegexQuery.rewrite are always TermQuery's - TermQuery tq = (TermQuery) clause.getQuery(); - - sqs[i] = new SpanTermQuery(tq.getTerm()); - sqs[i].setBoost(tq.getBoost()); - } - - SpanOrQuery query = new 
SpanOrQuery(sqs); - query.setBoost(orig.getBoost()); - - return query; - } - - @Override - public Spans getSpans(IndexReader reader) throws IOException { - throw new UnsupportedOperationException("Query should have been rewritten"); - } - - @Override - public String getField() { - return term.field(); - } - - public Collection getTerms() { - Collection terms = new ArrayList(); - terms.add(term); - return terms; - } - - /* generated by IntelliJ IDEA */ - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - final SpanRegexQuery that = (SpanRegexQuery) o; - - if (!regexImpl.equals(that.regexImpl)) return false; - if (!term.equals(that.term)) return false; - - return true; - } - - /* generated by IntelliJ IDEA */ - @Override - public int hashCode() { - int result; - result = regexImpl.hashCode(); - result = 29 * result + term.hashCode(); - return result; - } - - @Override - public String toString(String field) { - StringBuilder buffer = new StringBuilder(); - buffer.append("spanRegexQuery("); - buffer.append(term); - buffer.append(")"); - buffer.append(ToStringUtils.boost(getBoost())); - return buffer.toString(); - } + public Term getTerm() { return query.getTerm(); } public void setRegexImplementation(RegexCapabilities impl) { - this.regexImpl = impl; + query.setRegexImplementation(impl); } public RegexCapabilities getRegexImplementation() { - return regexImpl; + return query.getRegexImplementation(); } } diff --git a/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java b/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java index d161fefa3a6..762478686ac 100644 --- a/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java +++ b/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java @@ -29,6 +29,7 @@ import 
org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiSearcher; import org.apache.lucene.search.spans.SpanFirstQuery; +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.store.Directory; @@ -73,6 +74,65 @@ public class TestSpanRegexQuery extends LuceneTestCase { writer.optimize(); writer.close(); + IndexSearcher searcher = new IndexSearcher(directory, true); + SpanQuery srq = new SpanMultiTermQueryWrapper(new RegexQuery(new Term("field", "aut.*"))); + SpanFirstQuery sfq = new SpanFirstQuery(srq, 1); + // SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6, + // true); + int numHits = searcher.search(sfq, null, 1000).totalHits; + assertEquals(1, numHits); + searcher.close(); + directory.close(); + } + + public void testSpanRegexBug() throws CorruptIndexException, IOException { + createRAMDirectories(); + + SpanQuery srq = new SpanMultiTermQueryWrapper(new RegexQuery(new Term("field", "a.*"))); + SpanQuery stq = new SpanMultiTermQueryWrapper(new RegexQuery(new Term("field", "b.*"))); + SpanNearQuery query = new SpanNearQuery(new SpanQuery[] { srq, stq }, 6, + true); + + // 1. 
Search the same store which works + IndexSearcher[] arrSearcher = new IndexSearcher[2]; + arrSearcher[0] = new IndexSearcher(indexStoreA, true); + arrSearcher[1] = new IndexSearcher(indexStoreB, true); + MultiSearcher searcher = new MultiSearcher(arrSearcher); + int numHits = searcher.search(query, null, 1000).totalHits; + arrSearcher[0].close(); + arrSearcher[1].close(); + + // Will fail here + // We expect 2 but only one matched + // The rewriter function only write it once on the first IndexSearcher + // So it's using term: a1 b1 to search on the second IndexSearcher + // As a result, it won't match the document in the second IndexSearcher + assertEquals(2, numHits); + indexStoreA.close(); + indexStoreB.close(); + } + + /** remove in lucene 4.0 */ + @Deprecated + public void testSpanRegexOld() throws Exception { + Directory directory = newDirectory(); + IndexWriter writer = new IndexWriter(directory, newIndexWriterConfig( + TEST_VERSION_CURRENT, new MockAnalyzer())); + Document doc = new Document(); + // doc.add(newField("field", "the quick brown fox jumps over the lazy dog", + // Field.Store.NO, Field.Index.ANALYZED)); + // writer.addDocument(doc); + // doc = new Document(); + doc.add(newField("field", "auto update", Field.Store.NO, + Field.Index.ANALYZED)); + writer.addDocument(doc); + doc = new Document(); + doc.add(newField("field", "first auto update", Field.Store.NO, + Field.Index.ANALYZED)); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + IndexSearcher searcher = new IndexSearcher(directory, true); SpanRegexQuery srq = new SpanRegexQuery(new Term("field", "aut.*")); SpanFirstQuery sfq = new SpanFirstQuery(srq, 1); @@ -84,7 +144,9 @@ public class TestSpanRegexQuery extends LuceneTestCase { directory.close(); } - public void testSpanRegexBug() throws CorruptIndexException, IOException { + /** remove in lucene 4.0 */ + @Deprecated + public void testSpanRegexBugOld() throws CorruptIndexException, IOException { createRAMDirectories(); 
SpanRegexQuery srq = new SpanRegexQuery(new Term("field", "a.*")); diff --git a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java index ef859101399..1fa065de4df 100644 --- a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java +++ b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java @@ -28,7 +28,8 @@ import java.util.PriorityQueue; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.FuzzyTermsEnum; -import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.BoostAttribute; +import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; @@ -389,16 +390,16 @@ public class DirectSpellChecker { IndexReader ir, int docfreq, int editDistance, float accuracy) throws IOException { AttributeSource atts = new AttributeSource(); - MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt = - atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class); + MaxNonCompetitiveBoostAttribute maxBoostAtt = + atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); FuzzyTermsEnum e = new FuzzyTermsEnum(ir, atts, term, editDistance, Math.max(minPrefix, editDistance-1)); final PriorityQueue stQueue = new PriorityQueue(); BytesRef queryTerm = new BytesRef(term.text()); BytesRef candidateTerm; ScoreTerm st = new ScoreTerm(); - MultiTermQuery.BoostAttribute boostAtt = - e.attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + BoostAttribute boostAtt = + e.attributes().addAttribute(BoostAttribute.class); while ((candidateTerm = e.next()) != null) { final float boost = boostAtt.getBoost(); // ignore uncompetitive hits diff --git 
a/lucene/src/java/org/apache/lucene/search/BoostAttribute.java b/lucene/src/java/org/apache/lucene/search/BoostAttribute.java new file mode 100644 index 00000000000..93a2d3916bf --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/BoostAttribute.java @@ -0,0 +1,38 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeSource; // javadocs only +import org.apache.lucene.index.TermsEnum; // javadocs only + +/** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link MultiTermQuery#getTermsEnum(IndexReader,AttributeSource)} + * and update the boost on each returned term. This enables to control the boost factor + * for each matching term in {@link MultiTermQuery#SCORING_BOOLEAN_QUERY_REWRITE} or + * {@link TopTermsRewrite} mode. + * {@link FuzzyQuery} is using this to take the edit distance into account. + *

Please note: This attribute is intended to be added only by the TermsEnum + * to itself in its constructor and consumed by the {@link MultiTermQuery.RewriteMethod}. + * @lucene.internal + */ +public interface BoostAttribute extends Attribute { + /** Sets the boost in this attribute */ + public void setBoost(float boost); + /** Retrieves the boost, default is {@code 1.0f}. */ + public float getBoost(); +} diff --git a/lucene/src/java/org/apache/lucene/search/BoostAttributeImpl.java b/lucene/src/java/org/apache/lucene/search/BoostAttributeImpl.java new file mode 100644 index 00000000000..ad59d92c204 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/BoostAttributeImpl.java @@ -0,0 +1,60 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeImpl; + +/** Implementation class for {@link BoostAttribute}. 
+ * @lucene.internal + */ +public final class BoostAttributeImpl extends AttributeImpl implements BoostAttribute { + private float boost = 1.0f; + + public void setBoost(float boost) { + this.boost = boost; + } + + public float getBoost() { + return boost; + } + + @Override + public void clear() { + boost = 1.0f; + } + + @Override + public boolean equals(Object other) { + if (this == other) + return true; + if (other instanceof BoostAttributeImpl) + return ((BoostAttributeImpl) other).boost == boost; + return false; + } + + @Override + public int hashCode() { + return Float.floatToIntBits(boost); + } + + @Override + public void copyTo(AttributeImpl target) { + ((BoostAttribute) target).setBoost(boost); + } +} diff --git a/lucene/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java b/lucene/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java new file mode 100644 index 00000000000..ef305d3a2b3 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java @@ -0,0 +1,186 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.PriorityQueue; +import java.util.Comparator; + +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; + +class ConstantScoreAutoRewrite extends TermCollectingRewrite { + + // Defaults derived from rough tests with a 20.0 million + // doc Wikipedia index. With more than 350 terms in the + // query, the filter method is fastest: + public static int DEFAULT_TERM_COUNT_CUTOFF = 350; + + // If the query will hit more than 1 in 1000 of the docs + // in the index (0.1%), the filter method is fastest: + public static double DEFAULT_DOC_COUNT_PERCENT = 0.1; + + private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF; + private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT; + + /** If the number of terms in this query is equal to or + * larger than this setting then {@link + * MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} is used. 
*/ + public void setTermCountCutoff(int count) { + termCountCutoff = count; + } + + /** @see #setTermCountCutoff */ + public int getTermCountCutoff() { + return termCountCutoff; + } + + /** If the number of documents to be visited in the + * postings exceeds this specified percentage of the + * maxDoc() for the index, then {@link + * MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} is used. + * @param percent 0.0 to 100.0 */ + public void setDocCountPercent(double percent) { + docCountPercent = percent; + } + + /** @see #setDocCountPercent */ + public double getDocCountPercent() { + return docCountPercent; + } + + @Override + protected BooleanQuery getTopLevelQuery() { + return new BooleanQuery(true); + } + + @Override + protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/) { + topLevel.add(new TermQuery(term, docFreq), BooleanClause.Occur.SHOULD); + } + + @Override + public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { + + // Get the enum and start visiting terms. If we + // exhaust the enum before hitting either of the + // cutoffs, we use ConstantBooleanQueryRewrite; else, + // ConstantFilterRewrite: + final int docCountCutoff = (int) ((docCountPercent / 100.) 
* reader.maxDoc()); + final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff); + + final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit); + collectTerms(reader, query, col); + final int size = col.pendingTerms.size(); + if (col.hasCutOff) { + return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query); + } else if (size == 0) { + return getTopLevelQuery(); + } else { + final BooleanQuery bq = getTopLevelQuery(); + final Term placeholderTerm = new Term(query.field); + final BytesRefHash pendingTerms = col.pendingTerms; + final int sort[] = pendingTerms.sort(col.termsEnum.getComparator()); + for(int i = 0; i < size; i++) { + // docFreq is not used for constant score here, we pass 1 + // to explicitly set a fake value, so it's not calculated + addClause(bq, placeholderTerm.createTerm(pendingTerms.get(sort[i], new BytesRef())), 1, 1.0f); + } + // Strip scores + final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); + result.setBoost(query.getBoost()); + query.incTotalNumberOfTerms(size); + return result; + } + } + + static final class CutOffTermCollector extends TermCollector { + CutOffTermCollector(int docCountCutoff, int termCountLimit) { + this.docCountCutoff = docCountCutoff; + this.termCountLimit = termCountLimit; + } + + @Override + public void setNextEnum(TermsEnum termsEnum) throws IOException { + this.termsEnum = termsEnum; + } + + @Override + public boolean collect(BytesRef bytes) throws IOException { + pendingTerms.add(bytes); + docVisitCount += termsEnum.docFreq(); + if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { + hasCutOff = true; + return false; + } + return true; + } + + int docVisitCount = 0; + boolean hasCutOff = false; + TermsEnum termsEnum; + + final int docCountCutoff, termCountLimit; + final BytesRefHash pendingTerms = new BytesRefHash(); + } + + @Override + public int hashCode() { + final int prime = 1279; + return 
(int) (prime * termCountCutoff + Double.doubleToLongBits(docCountPercent)); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + + ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj; + if (other.termCountCutoff != termCountCutoff) { + return false; + } + + if (Double.doubleToLongBits(other.docCountPercent) != Double.doubleToLongBits(docCountPercent)) { + return false; + } + + return true; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java index 039303f9131..c8c806279b3 100644 --- a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -49,12 +49,12 @@ import java.util.List; */ public final class FuzzyTermsEnum extends TermsEnum { private TermsEnum actualEnum; - private MultiTermQuery.BoostAttribute actualBoostAtt; + private BoostAttribute actualBoostAtt; - private final MultiTermQuery.BoostAttribute boostAtt = - attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + private final BoostAttribute boostAtt = + attributes().addAttribute(BoostAttribute.class); - private final MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt; + private final MaxNonCompetitiveBoostAttribute maxBoostAtt; private final LevenshteinAutomataAttribute dfaAtt; private float bottom; @@ -128,7 +128,7 @@ public final class FuzzyTermsEnum extends TermsEnum { } this.scale_factor = 1.0f / (1.0f - this.minSimilarity); - this.maxBoostAtt = atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class); + this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); bottom = maxBoostAtt.getMaxNonCompetitiveBoost(); bottomTerm = maxBoostAtt.getCompetitiveTerm(); bottomChanged(null, true); @@ -174,8 +174,7 @@ public final class FuzzyTermsEnum extends 
TermsEnum { /** swap in a new actual enum to proxy to */ private void setEnum(TermsEnum actualEnum) { this.actualEnum = actualEnum; - this.actualBoostAtt = actualEnum.attributes().addAttribute( - MultiTermQuery.BoostAttribute.class); + this.actualBoostAtt = actualEnum.attributes().addAttribute(BoostAttribute.class); } /** @@ -300,8 +299,8 @@ public final class FuzzyTermsEnum extends TermsEnum { private final BytesRef termRef; private final BytesRef lastTerm; - private final MultiTermQuery.BoostAttribute boostAtt = - attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + private final BoostAttribute boostAtt = + attributes().addAttribute(BoostAttribute.class); public AutomatonFuzzyTermsEnum(ByteRunAutomaton matchers[], BytesRef lastTerm) throws IOException { @@ -363,8 +362,8 @@ public final class FuzzyTermsEnum extends TermsEnum { // this is the text, minus the prefix private final int[] text; - private final MultiTermQuery.BoostAttribute boostAtt = - attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + private final BoostAttribute boostAtt = + attributes().addAttribute(BoostAttribute.class); /** * Constructor for enumeration of all terms from specified reader which share a prefix of diff --git a/lucene/src/java/org/apache/lucene/search/MaxNonCompetitiveBoostAttribute.java b/lucene/src/java/org/apache/lucene/search/MaxNonCompetitiveBoostAttribute.java new file mode 100644 index 00000000000..56505f0754e --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/MaxNonCompetitiveBoostAttribute.java @@ -0,0 +1,45 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeSource; // javadocs only +import org.apache.lucene.util.BytesRef; + +/** Add this {@link Attribute} to a fresh {@link AttributeSource} before calling + * {@link MultiTermQuery#getTermsEnum(IndexReader,AttributeSource)}. + * {@link FuzzyQuery} is using this to control its internal behaviour + * to only return competitive terms. + *

Please note: This attribute is intended to be added by the {@link MultiTermQuery.RewriteMethod} + * to an empty {@link AttributeSource} that is shared for all segments + * during query rewrite. This attribute source is passed to all segment enums + * on {@link MultiTermQuery#getTermsEnum(IndexReader,AttributeSource)}. + * {@link TopTermsRewrite} uses this attribute to + * inform all enums about the current boost, that is not competitive. + * @lucene.internal + */ +public interface MaxNonCompetitiveBoostAttribute extends Attribute { + /** This is the maximum boost that would not be competitive. */ + public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost); + /** This is the maximum boost that would not be competitive. Default is negative infinity, which means every term is competitive. */ + public float getMaxNonCompetitiveBoost(); + /** This is the term or null of the term that triggered the boost change. */ + public void setCompetitiveTerm(BytesRef competitiveTerm); + /** This is the term or null of the term that triggered the boost change. Default is null, which means every term is competitive. */ + public BytesRef getCompetitiveTerm(); +} diff --git a/lucene/src/java/org/apache/lucene/search/MaxNonCompetitiveBoostAttributeImpl.java b/lucene/src/java/org/apache/lucene/search/MaxNonCompetitiveBoostAttributeImpl.java new file mode 100644 index 00000000000..d7b7e951f3b --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/MaxNonCompetitiveBoostAttributeImpl.java @@ -0,0 +1,78 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.BytesRef; + +/** Implementation class for {@link MaxNonCompetitiveBoostAttribute}. + * @lucene.internal + */ +public final class MaxNonCompetitiveBoostAttributeImpl extends AttributeImpl implements MaxNonCompetitiveBoostAttribute { + private float maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY; + private BytesRef competitiveTerm = null; + + public void setMaxNonCompetitiveBoost(final float maxNonCompetitiveBoost) { + this.maxNonCompetitiveBoost = maxNonCompetitiveBoost; + } + + public float getMaxNonCompetitiveBoost() { + return maxNonCompetitiveBoost; + } + + public void setCompetitiveTerm(final BytesRef competitiveTerm) { + this.competitiveTerm = competitiveTerm; + } + + public BytesRef getCompetitiveTerm() { + return competitiveTerm; + } + + @Override + public void clear() { + maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY; + competitiveTerm = null; + } + + @Override + public boolean equals(Object other) { + if (this == other) + return true; + if (other instanceof MaxNonCompetitiveBoostAttributeImpl) { + final MaxNonCompetitiveBoostAttributeImpl o = (MaxNonCompetitiveBoostAttributeImpl) other; + return (o.maxNonCompetitiveBoost == maxNonCompetitiveBoost) + && (o.competitiveTerm == null ? 
competitiveTerm == null : o.competitiveTerm.equals(competitiveTerm)); + } + return false; + } + + @Override + public int hashCode() { + int hash = Float.floatToIntBits(maxNonCompetitiveBoost); + if (competitiveTerm != null) hash = 31 * hash + competitiveTerm.hashCode(); + return hash; + } + + @Override + public void copyTo(AttributeImpl target) { + final MaxNonCompetitiveBoostAttributeImpl t = (MaxNonCompetitiveBoostAttributeImpl) target; + t.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost); + t.setCompetitiveTerm(competitiveTerm); + } +} diff --git a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java index efcdf461d8f..3c1bb82e24b 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java @@ -19,29 +19,12 @@ package org.apache.lucene.search; import java.io.IOException; import java.io.Serializable; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.PriorityQueue; -import java.util.Comparator; -import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.queryParser.QueryParser; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.Attribute; -import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.ByteBlockPool; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefHash; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.ReaderUtil; -import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; /** * An abstract {@link Query} that matches documents @@ -80,156 +63,12 @@ public abstract class MultiTermQuery extends Query { protected final 
String field; protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; transient int numberOfTerms = 0; - - /** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum(IndexReader,AttributeSource)} - * and update the boost on each returned term. This enables to control the boost factor - * for each matching term in {@link #SCORING_BOOLEAN_QUERY_REWRITE} or - * {@link TopTermsBooleanQueryRewrite} mode. - * {@link FuzzyQuery} is using this to take the edit distance into account. - *

Please note: This attribute is intended to be added only by the TermsEnum - * to itsself in its constructor and consumed by the {@link RewriteMethod}. - * @lucene.internal - */ - public static interface BoostAttribute extends Attribute { - /** Sets the boost in this attribute */ - public void setBoost(float boost); - /** Retrieves the boost, default is {@code 1.0f}. */ - public float getBoost(); - } - - /** Implementation class for {@link BoostAttribute}. */ - public static final class BoostAttributeImpl extends AttributeImpl implements BoostAttribute { - private float boost = 1.0f; - - public void setBoost(float boost) { - this.boost = boost; - } - - public float getBoost() { - return boost; - } - - @Override - public void clear() { - boost = 1.0f; - } - - @Override - public boolean equals(Object other) { - if (this == other) - return true; - if (other instanceof BoostAttributeImpl) - return ((BoostAttributeImpl) other).boost == boost; - return false; - } - - @Override - public int hashCode() { - return Float.floatToIntBits(boost); - } - - @Override - public void copyTo(AttributeImpl target) { - ((BoostAttribute) target).setBoost(boost); - } - } - - /** Add this {@link Attribute} to a fresh {@link AttributeSource} before calling - * {@link #getTermsEnum(IndexReader,AttributeSource)}. - * {@link FuzzyQuery} is using this to control its internal behaviour - * to only return competitive terms. - *

Please note: This attribute is intended to be added by the {@link RewriteMethod} - * to an empty {@link AttributeSource} that is shared for all segments - * during query rewrite. This attribute source is passed to all segment enums - * on {@link #getTermsEnum(IndexReader,AttributeSource)}. - * {@link TopTermsBooleanQueryRewrite} uses this attribute to - * inform all enums about the current boost, that is not competitive. - * @lucene.internal - */ - public static interface MaxNonCompetitiveBoostAttribute extends Attribute { - /** This is the maximum boost that would not be competitive. */ - public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost); - /** This is the maximum boost that would not be competitive. Default is negative infinity, which means every term is competitive. */ - public float getMaxNonCompetitiveBoost(); - /** This is the term or null of the term that triggered the boost change. */ - public void setCompetitiveTerm(BytesRef competitiveTerm); - /** This is the term or null of the term that triggered the boost change. Default is null, which means every term is competitoive. */ - public BytesRef getCompetitiveTerm(); - } - - /** Implementation class for {@link MaxNonCompetitiveBoostAttribute}. 
*/ - public static final class MaxNonCompetitiveBoostAttributeImpl extends AttributeImpl implements MaxNonCompetitiveBoostAttribute { - private float maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY; - private BytesRef competitiveTerm = null; - - public void setMaxNonCompetitiveBoost(final float maxNonCompetitiveBoost) { - this.maxNonCompetitiveBoost = maxNonCompetitiveBoost; - } - - public float getMaxNonCompetitiveBoost() { - return maxNonCompetitiveBoost; - } - - public void setCompetitiveTerm(final BytesRef competitiveTerm) { - this.competitiveTerm = competitiveTerm; - } - - public BytesRef getCompetitiveTerm() { - return competitiveTerm; - } - - @Override - public void clear() { - maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY; - competitiveTerm = null; - } - - @Override - public boolean equals(Object other) { - if (this == other) - return true; - if (other instanceof MaxNonCompetitiveBoostAttributeImpl) { - final MaxNonCompetitiveBoostAttributeImpl o = (MaxNonCompetitiveBoostAttributeImpl) other; - return (o.maxNonCompetitiveBoost == maxNonCompetitiveBoost) - && (o.competitiveTerm == null ? competitiveTerm == null : o.competitiveTerm.equals(competitiveTerm)); - } - return false; - } - - @Override - public int hashCode() { - int hash = Float.floatToIntBits(maxNonCompetitiveBoost); - if (competitiveTerm != null) hash = 31 * hash + competitiveTerm.hashCode(); - return hash; - } - - @Override - public void copyTo(AttributeImpl target) { - final MaxNonCompetitiveBoostAttributeImpl t = (MaxNonCompetitiveBoostAttributeImpl) target; - t.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost); - t.setCompetitiveTerm(competitiveTerm); - } - } /** Abstract class that defines how the query is rewritten. 
*/ public static abstract class RewriteMethod implements Serializable { public abstract Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException; } - private static final class ConstantScoreFilterRewrite extends RewriteMethod { - @Override - public Query rewrite(IndexReader reader, MultiTermQuery query) { - Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query)); - result.setBoost(query.getBoost()); - return result; - } - - // Make sure we are still a singleton even after deserializing - protected Object readResolve() { - return CONSTANT_SCORE_FILTER_REWRITE; - } - } - /** A rewrite method that first creates a private Filter, * by visiting each term in sequence and marking all docs * for that term. Matching documents are assigned a @@ -242,162 +81,19 @@ public abstract class MultiTermQuery extends Query { * exception. * * @see #setRewriteMethod */ - public final static RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new ConstantScoreFilterRewrite(); - - private abstract static class BooleanQueryRewrite extends RewriteMethod { - - protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { - final List subReaders = new ArrayList(); - ReaderUtil.gatherSubReaders(subReaders, reader); - Comparator lastTermComp = null; - - for (IndexReader r : subReaders) { - final Fields fields = r.fields(); - if (fields == null) { - // reader has no fields - continue; - } - - final Terms terms = fields.terms(query.field); - if (terms == null) { - // field does not exist - continue; - } - - final TermsEnum termsEnum = query.getTermsEnum(r, collector.attributes); - assert termsEnum != null; - - if (termsEnum == TermsEnum.EMPTY) - continue; - - // Check comparator compatibility: - final Comparator newTermComp = termsEnum.getComparator(); - if (lastTermComp != null && newTermComp != lastTermComp) - throw new RuntimeException("term comparator should not change between segments: 
"+lastTermComp+" != "+newTermComp); - lastTermComp = newTermComp; - - collector.setNextEnum(termsEnum); - BytesRef bytes; - while ((bytes = termsEnum.next()) != null) { - termsEnum.cacheCurrentTerm(); - if (!collector.collect(bytes)) - return; // interrupt whole term collection, so also don't iterate other subReaders - } - } - } - - protected static abstract class TermCollector { - /** attributes used for communication with the enum */ - public final AttributeSource attributes = new AttributeSource(); - - /** return false to stop collecting */ - public abstract boolean collect(BytesRef bytes) throws IOException; - - /** the next segment's {@link TermsEnum} that is used to collect terms */ - public abstract void setNextEnum(TermsEnum termsEnum) throws IOException; - } - } - - private static class ScoringBooleanQueryRewrite extends BooleanQueryRewrite { + public static final RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new RewriteMethod() { @Override - public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { - final ParallelArraysTermCollector col = new ParallelArraysTermCollector(); - collectTerms(reader, query, col); - - final Term placeholderTerm = new Term(query.field); - final BooleanQuery result = new BooleanQuery(true); - final int size = col.terms.size(); - if (size > 0) { - final int sort[] = col.terms.sort(col.termsEnum.getComparator()); - final int[] docFreq = col.array.docFreq; - final float[] boost = col.array.boost; - for (int i = 0; i < size; i++) { - final int pos = sort[i]; - final Term term = placeholderTerm.createTerm(col.terms.get(pos, new BytesRef())); - assert reader.docFreq(term) == docFreq[pos]; - final TermQuery tq = new TermQuery(term, docFreq[pos]); - tq.setBoost(query.getBoost() * boost[pos]); - result.add(tq, BooleanClause.Occur.SHOULD); - } - } - query.incTotalNumberOfTerms(size); + public Query rewrite(IndexReader reader, MultiTermQuery query) { + Query result = new ConstantScoreQuery(new 
MultiTermQueryWrapperFilter(query)); + result.setBoost(query.getBoost()); return result; } // Make sure we are still a singleton even after deserializing protected Object readResolve() { - return SCORING_BOOLEAN_QUERY_REWRITE; + return CONSTANT_SCORE_FILTER_REWRITE; } - - static final class ParallelArraysTermCollector extends TermCollector { - final TermFreqBoostByteStart array = new TermFreqBoostByteStart(16); - final BytesRefHash terms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array); - TermsEnum termsEnum; - - private BoostAttribute boostAtt; - - @Override - public void setNextEnum(TermsEnum termsEnum) throws IOException { - this.termsEnum = termsEnum; - this.boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); - } - - @Override - public boolean collect(BytesRef bytes) { - final int e = terms.add(bytes); - if (e < 0 ) { - // duplicate term: update docFreq - final int pos = (-e)-1; - array.docFreq[pos] += termsEnum.docFreq(); - assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums"; - } else { - // new entry: we populate the entry initially - array.docFreq[e] = termsEnum.docFreq(); - array.boost[e] = boostAtt.getBoost(); - } - // if the new entry reaches the max clause count, we exit early - if (e >= BooleanQuery.getMaxClauseCount()) - throw new BooleanQuery.TooManyClauses(); - return true; - } - } - - /** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */ - static final class TermFreqBoostByteStart extends DirectBytesStartArray { - int[] docFreq; - float[] boost; - - public TermFreqBoostByteStart(int initSize) { - super(initSize); - } - - @Override - public int[] init() { - final int[] ord = super.init(); - boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)]; - docFreq = new int[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_INT)]; - assert boost.length >= ord.length && 
docFreq.length >= ord.length; - return ord; - } - - @Override - public int[] grow() { - final int[] ord = super.grow(); - docFreq = ArrayUtil.grow(docFreq, ord.length); - boost = ArrayUtil.grow(boost, ord.length); - assert boost.length >= ord.length && docFreq.length >= ord.length; - return ord; - } - - @Override - public int[] clear() { - boost = null; - docFreq = null; - return super.clear(); - } - - } - } + }; /** A rewrite method that first translates each term into * {@link BooleanClause.Occur#SHOULD} clause in a @@ -412,155 +108,19 @@ public abstract class MultiTermQuery extends Query { * exceeds {@link BooleanQuery#getMaxClauseCount}. * * @see #setRewriteMethod */ - public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite(); - - - /** - * Base rewrite method for collecting only the top terms - * via a priority queue. - */ - public static abstract class TopTermsBooleanQueryRewrite extends BooleanQueryRewrite { - private final int size; - - /** - * Create a TopTermsBooleanQueryRewrite for - * at most size terms. - *

- * NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than - * size, then it will be used instead. - */ - public TopTermsBooleanQueryRewrite(int size) { - this.size = size; - } - - /** Return a suitable Query for a MultiTermQuery term. */ - protected abstract Query getQuery(Term term, int docCount); - - @Override - public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { - final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount()); - final PriorityQueue stQueue = new PriorityQueue(); - collectTerms(reader, query, new TermCollector() { - private final MaxNonCompetitiveBoostAttribute maxBoostAtt = - attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class); - - private final Map visitedTerms = new HashMap(); - - private TermsEnum termsEnum; - private Comparator termComp; - private BoostAttribute boostAtt; - private ScoreTerm st; - - @Override - public void setNextEnum(TermsEnum termsEnum) throws IOException { - this.termsEnum = termsEnum; - this.termComp = termsEnum.getComparator(); - // lazy init the initial ScoreTerm because comparator is not known on ctor: - if (st == null) - st = new ScoreTerm(this.termComp); - boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); - } - - @Override - public boolean collect(BytesRef bytes) { - final float boost = boostAtt.getBoost(); - // ignore uncompetetive hits - if (stQueue.size() == maxSize) { - final ScoreTerm t = stQueue.peek(); - if (boost < t.boost) - return true; - if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0) - return true; - } - ScoreTerm t = visitedTerms.get(bytes); - if (t != null) { - // if the term is already in the PQ, only update docFreq of term in PQ - t.docFreq += termsEnum.docFreq(); - assert t.boost == boost : "boost should be equal in all segment TermsEnums"; - } else { - // add new entry in PQ, we must clone the term, else it may get overwritten! 
- st.bytes.copy(bytes); - st.boost = boost; - st.docFreq = termsEnum.docFreq(); - visitedTerms.put(st.bytes, st); - stQueue.offer(st); - // possibly drop entries from queue - if (stQueue.size() > maxSize) { - st = stQueue.poll(); - visitedTerms.remove(st.bytes); - } else { - st = new ScoreTerm(termComp); - } - assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize"; - // set maxBoostAtt with values to help FuzzyTermsEnum to optimize - if (stQueue.size() == maxSize) { - t = stQueue.peek(); - maxBoostAtt.setMaxNonCompetitiveBoost(t.boost); - maxBoostAtt.setCompetitiveTerm(t.bytes); - } - } - return true; - } - }); - - final Term placeholderTerm = new Term(query.field); - final BooleanQuery bq = new BooleanQuery(true); - final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]); - ArrayUtil.quickSort(scoreTerms, scoreTermSortByTermComp); - for (final ScoreTerm st : scoreTerms) { - final Term term = placeholderTerm.createTerm(st.bytes); - assert reader.docFreq(term) == st.docFreq; - Query tq = getQuery(term, st.docFreq); - tq.setBoost(query.getBoost() * st.boost); // set the boost - bq.add(tq, BooleanClause.Occur.SHOULD); // add to query - } - query.incTotalNumberOfTerms(scoreTerms.length); - return bq; - } + public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = ScoringRewrite.SCORING_BOOLEAN_QUERY_REWRITE; - @Override - public int hashCode() { - return 31 * size; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; - TopTermsBooleanQueryRewrite other = (TopTermsBooleanQueryRewrite) obj; - if (size != other.size) return false; - return true; - } - - private static final Comparator scoreTermSortByTermComp = - new Comparator() { - public int compare(ScoreTerm st1, ScoreTerm st2) { - assert st1.termComp == st2.termComp : - "term comparator should not change between segments"; - return 
st1.termComp.compare(st1.bytes, st2.bytes); - } - }; - - static final class ScoreTerm implements Comparable { - public final Comparator termComp; - - public final BytesRef bytes = new BytesRef(); - public float boost; - public int docFreq; - - public ScoreTerm(Comparator termComp) { - this.termComp = termComp; - } - - public int compareTo(ScoreTerm other) { - if (this.boost == other.boost) - return termComp.compare(other.bytes, this.bytes); - else - return Float.compare(this.boost, other.boost); - } - } - } + /** Like {@link #SCORING_BOOLEAN_QUERY_REWRITE} except + * scores are not computed. Instead, each matching + * document receives a constant score equal to the + * query's boost. + * + *

NOTE: This rewrite method will hit {@link + * BooleanQuery.TooManyClauses} if the number of terms + * exceeds {@link BooleanQuery#getMaxClauseCount}. + * + * @see #setRewriteMethod */ + public final static RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = ScoringRewrite.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE; /** * A rewrite method that first translates each term into @@ -574,8 +134,7 @@ public abstract class MultiTermQuery extends Query { * * @see #setRewriteMethod */ - public static final class TopTermsScoringBooleanQueryRewrite extends - TopTermsBooleanQueryRewrite { + public static final class TopTermsScoringBooleanQueryRewrite extends TopTermsRewrite { /** * Create a TopTermsScoringBooleanQueryRewrite for @@ -589,8 +148,20 @@ public abstract class MultiTermQuery extends Query { } @Override - protected Query getQuery(Term term, int docFreq) { - return new TermQuery(term, docFreq); + protected int getMaxSize() { + return BooleanQuery.getMaxClauseCount(); + } + + @Override + protected BooleanQuery getTopLevelQuery() { + return new BooleanQuery(true); + } + + @Override + protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost) { + final TermQuery tq = new TermQuery(term, docCount); + tq.setBoost(boost); + topLevel.add(tq, BooleanClause.Occur.SHOULD); } } @@ -604,8 +175,7 @@ public abstract class MultiTermQuery extends Query { * * @see #setRewriteMethod */ - public static final class TopTermsBoostOnlyBooleanQueryRewrite extends - TopTermsBooleanQueryRewrite { + public static final class TopTermsBoostOnlyBooleanQueryRewrite extends TopTermsRewrite { /** * Create a TopTermsBoostOnlyBooleanQueryRewrite for @@ -619,45 +189,23 @@ public abstract class MultiTermQuery extends Query { } @Override - protected Query getQuery(Term term, int docFreq) { - return new ConstantScoreQuery(new QueryWrapperFilter(new TermQuery(term, docFreq))); + protected int getMaxSize() { + return BooleanQuery.getMaxClauseCount(); + } + + @Override + protected 
BooleanQuery getTopLevelQuery() { + return new BooleanQuery(true); + } + + @Override + protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost) { + final Query q = new ConstantScoreQuery(new QueryWrapperFilter(new TermQuery(term, docFreq))); + q.setBoost(boost); + topLevel.add(q, BooleanClause.Occur.SHOULD); } } - - private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable { - @Override - public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { - Query result = super.rewrite(reader, query); - assert result instanceof BooleanQuery; - // TODO: if empty boolean query return NullQuery? - if (!((BooleanQuery) result).clauses().isEmpty()) { - // strip the scores off - result = new ConstantScoreQuery(new QueryWrapperFilter(result)); - result.setBoost(query.getBoost()); - } - return result; - } - - // Make sure we are still a singleton even after deserializing - @Override - protected Object readResolve() { - return CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE; - } - } - - /** Like {@link #SCORING_BOOLEAN_QUERY_REWRITE} except - * scores are not computed. Instead, each matching - * document receives a constant score equal to the - * query's boost. - * - *

NOTE: This rewrite method will hit {@link - * BooleanQuery.TooManyClauses} if the number of terms - * exceeds {@link BooleanQuery#getMaxClauseCount}. - * - * @see #setRewriteMethod */ - public final static RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = new ConstantScoreBooleanQueryRewrite(); - - + /** A rewrite method that tries to pick the best * constant-score rewrite method based on term and * document counts from the query. If both the number of @@ -666,140 +214,7 @@ public abstract class MultiTermQuery extends Query { * Otherwise, {@link #CONSTANT_SCORE_FILTER_REWRITE} is * used. */ - public static class ConstantScoreAutoRewrite extends BooleanQueryRewrite { - - // Defaults derived from rough tests with a 20.0 million - // doc Wikipedia index. With more than 350 terms in the - // query, the filter method is fastest: - public static int DEFAULT_TERM_COUNT_CUTOFF = 350; - - // If the query will hit more than 1 in 1000 of the docs - // in the index (0.1%), the filter method is fastest: - public static double DEFAULT_DOC_COUNT_PERCENT = 0.1; - - private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF; - private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT; - - /** If the number of terms in this query is equal to or - * larger than this setting then {@link - * #CONSTANT_SCORE_FILTER_REWRITE} is used. */ - public void setTermCountCutoff(int count) { - termCountCutoff = count; - } - - /** @see #setTermCountCutoff */ - public int getTermCountCutoff() { - return termCountCutoff; - } - - /** If the number of documents to be visited in the - * postings exceeds this specified percentage of the - * maxDoc() for the index, then {@link - * #CONSTANT_SCORE_FILTER_REWRITE} is used. 
- * @param percent 0.0 to 100.0 */ - public void setDocCountPercent(double percent) { - docCountPercent = percent; - } - - /** @see #setDocCountPercent */ - public double getDocCountPercent() { - return docCountPercent; - } - - @Override - public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { - - // Get the enum and start visiting terms. If we - // exhaust the enum before hitting either of the - // cutoffs, we use ConstantBooleanQueryRewrite; else, - // ConstantFilterRewrite: - final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc()); - final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff); - - final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit); - collectTerms(reader, query, col); - final int size = col.pendingTerms.size(); - if (col.hasCutOff) { - return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query); - } else if (size == 0) { - return new BooleanQuery(true); - } else { - final BooleanQuery bq = new BooleanQuery(true); - final Term placeholderTerm = new Term(query.field); - final BytesRefHash pendingTerms = col.pendingTerms; - final int sort[] = pendingTerms.sort(col.termsEnum.getComparator()); - for(int i = 0; i < size; i++) { - // docFreq is not used for constant score here, we pass 1 - // to explicitely set a fake value, so it's not calculated - bq.add(new TermQuery( - placeholderTerm.createTerm(pendingTerms.get(sort[i], new BytesRef())), 1 - ), BooleanClause.Occur.SHOULD); - } - // Strip scores - final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); - result.setBoost(query.getBoost()); - query.incTotalNumberOfTerms(size); - return result; - } - } - - static final class CutOffTermCollector extends TermCollector { - CutOffTermCollector(int docCountCutoff, int termCountLimit) { - this.docCountCutoff = docCountCutoff; - this.termCountLimit = termCountLimit; - } - - @Override - public void 
setNextEnum(TermsEnum termsEnum) throws IOException { - this.termsEnum = termsEnum; - } - - @Override - public boolean collect(BytesRef bytes) throws IOException { - pendingTerms.add(bytes); - docVisitCount += termsEnum.docFreq(); - if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { - hasCutOff = true; - return false; - } - return true; - } - - int docVisitCount = 0; - boolean hasCutOff = false; - TermsEnum termsEnum; - - final int docCountCutoff, termCountLimit; - final BytesRefHash pendingTerms = new BytesRefHash(); - } - - @Override - public int hashCode() { - final int prime = 1279; - return (int) (prime * termCountCutoff + Double.doubleToLongBits(docCountPercent)); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - - ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj; - if (other.termCountCutoff != termCountCutoff) { - return false; - } - - if (Double.doubleToLongBits(other.docCountPercent) != Double.doubleToLongBits(docCountPercent)) { - return false; - } - - return true; - } - } + public static class ConstantScoreAutoRewrite extends org.apache.lucene.search.ConstantScoreAutoRewrite {} /** Read-only default instance of {@link * ConstantScoreAutoRewrite}, with {@link @@ -851,7 +266,7 @@ public abstract class MultiTermQuery extends Query { * positioned to the first matching term. * The given {@link AttributeSource} is passed by the {@link RewriteMethod} to * provide attributes, the rewrite method uses to inform about e.g. maximum competitive boosts. 
- * This is currently only used by {@link TopTermsBooleanQueryRewrite} + * This is currently only used by {@link TopTermsRewrite} */ protected abstract TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException; diff --git a/lucene/src/java/org/apache/lucene/search/ScoringRewrite.java b/lucene/src/java/org/apache/lucene/search/ScoringRewrite.java new file mode 100644 index 00000000000..bac393fe70d --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/ScoringRewrite.java @@ -0,0 +1,203 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.PriorityQueue; +import java.util.Comparator; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.MultiTermQuery.RewriteMethod; + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; + +/** @lucene.internal Only public to be accessible by spans package. */ +public abstract class ScoringRewrite extends TermCollectingRewrite { + + /** A rewrite method that first translates each term into + * {@link BooleanClause.Occur#SHOULD} clause in a + * BooleanQuery, and keeps the scores as computed by the + * query. Note that typically such scores are + * meaningless to the user, and require non-trivial CPU + * to compute, so it's almost always better to use {@link + * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead. + * + *

NOTE: This rewrite method will hit {@link + * BooleanQuery.TooManyClauses} if the number of terms + * exceeds {@link BooleanQuery#getMaxClauseCount}. + * + * @see MultiTermQuery#setRewriteMethod */ + public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringRewrite() { + @Override + protected BooleanQuery getTopLevelQuery() { + return new BooleanQuery(true); + } + + @Override + protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost) { + final TermQuery tq = new TermQuery(term, docCount); + tq.setBoost(boost); + topLevel.add(tq, BooleanClause.Occur.SHOULD); + } + + // Make sure we are still a singleton even after deserializing + protected Object readResolve() { + return SCORING_BOOLEAN_QUERY_REWRITE; + } + }; + + /** Like {@link #SCORING_BOOLEAN_QUERY_REWRITE} except + * scores are not computed. Instead, each matching + * document receives a constant score equal to the + * query's boost. + * + *

NOTE: This rewrite method will hit {@link + * BooleanQuery.TooManyClauses} if the number of terms + * exceeds {@link BooleanQuery#getMaxClauseCount}. + * + * @see MultiTermQuery#setRewriteMethod */ + public final static RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = new RewriteMethod() { + @Override + public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { + Query result = SCORING_BOOLEAN_QUERY_REWRITE.rewrite(reader, query); + assert result instanceof BooleanQuery; + // TODO: if empty boolean query return NullQuery? + if (!((BooleanQuery) result).clauses().isEmpty()) { + // strip the scores off + result = new ConstantScoreQuery(new QueryWrapperFilter(result)); + result.setBoost(query.getBoost()); + } + return result; + } + + // Make sure we are still a singleton even after deserializing + protected Object readResolve() { + return CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE; + } + }; + + @Override + public final Q rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { + final Q result = getTopLevelQuery(); + final ParallelArraysTermCollector col = new ParallelArraysTermCollector(result instanceof BooleanQuery); + collectTerms(reader, query, col); + + final Term placeholderTerm = new Term(query.field); + final int size = col.terms.size(); + if (size > 0) { + final int sort[] = col.terms.sort(col.termsEnum.getComparator()); + final int[] docFreq = col.array.docFreq; + final float[] boost = col.array.boost; + for (int i = 0; i < size; i++) { + final int pos = sort[i]; + final Term term = placeholderTerm.createTerm(col.terms.get(pos, new BytesRef())); + assert reader.docFreq(term) == docFreq[pos]; + addClause(result, term, docFreq[pos], query.getBoost() * boost[pos]); + } + } + query.incTotalNumberOfTerms(size); + return result; + } + + static final class ParallelArraysTermCollector extends TermCollector { + private final boolean checkMaxClauseCount; + final TermFreqBoostByteStart array = new TermFreqBoostByteStart(16); + final
BytesRefHash terms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array); + TermsEnum termsEnum; + + private BoostAttribute boostAtt; + + public ParallelArraysTermCollector(boolean checkMaxClauseCount) { + this.checkMaxClauseCount = checkMaxClauseCount; + } + + @Override + public void setNextEnum(TermsEnum termsEnum) throws IOException { + this.termsEnum = termsEnum; + this.boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); + } + + @Override + public boolean collect(BytesRef bytes) { + final int e = terms.add(bytes); + if (e < 0 ) { + // duplicate term: update docFreq + final int pos = (-e)-1; + array.docFreq[pos] += termsEnum.docFreq(); + assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums"; + } else { + // new entry: we populate the entry initially + array.docFreq[e] = termsEnum.docFreq(); + array.boost[e] = boostAtt.getBoost(); + } + // if the new entry reaches the max clause count, we exit early + if (checkMaxClauseCount && e >= BooleanQuery.getMaxClauseCount()) + throw new BooleanQuery.TooManyClauses(); + return true; + } + } + + /** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */ + static final class TermFreqBoostByteStart extends DirectBytesStartArray { + int[] docFreq; + float[] boost; + + public TermFreqBoostByteStart(int initSize) { + super(initSize); + } + + @Override + public int[] init() { + final int[] ord = super.init(); + boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)]; + docFreq = new int[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_INT)]; + assert boost.length >= ord.length && docFreq.length >= ord.length; + return ord; + } + + @Override + public int[] grow() { + final int[] ord = super.grow(); + docFreq = ArrayUtil.grow(docFreq, ord.length); + boost = ArrayUtil.grow(boost, ord.length); + assert boost.length >= ord.length && docFreq.length >= ord.length; + 
return ord; + } + + @Override + public int[] clear() { + boost = null; + docFreq = null; + return super.clear(); + } + + } +} diff --git a/lucene/src/java/org/apache/lucene/search/TermCollectingRewrite.java b/lucene/src/java/org/apache/lucene/search/TermCollectingRewrite.java new file mode 100644 index 00000000000..d186859af8a --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/TermCollectingRewrite.java @@ -0,0 +1,93 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.Comparator; + +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.ReaderUtil; + +abstract class TermCollectingRewrite extends MultiTermQuery.RewriteMethod { + + /** Return a suitable top-level Query for holding all expanded terms. 
*/ + protected abstract Q getTopLevelQuery() throws IOException; + + /** Add a MultiTermQuery term to the top-level query */ + protected abstract void addClause(Q topLevel, Term term, int docCount, float boost) throws IOException; + + protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { + final List subReaders = new ArrayList(); + ReaderUtil.gatherSubReaders(subReaders, reader); + Comparator lastTermComp = null; + + for (IndexReader r : subReaders) { + final Fields fields = r.fields(); + if (fields == null) { + // reader has no fields + continue; + } + + final Terms terms = fields.terms(query.field); + if (terms == null) { + // field does not exist + continue; + } + + final TermsEnum termsEnum = query.getTermsEnum(r, collector.attributes); + assert termsEnum != null; + + if (termsEnum == TermsEnum.EMPTY) + continue; + + // Check comparator compatibility: + final Comparator newTermComp = termsEnum.getComparator(); + if (lastTermComp != null && newTermComp != lastTermComp) + throw new RuntimeException("term comparator should not change between segments: "+lastTermComp+" != "+newTermComp); + lastTermComp = newTermComp; + + collector.setNextEnum(termsEnum); + BytesRef bytes; + while ((bytes = termsEnum.next()) != null) { + termsEnum.cacheCurrentTerm(); + if (!collector.collect(bytes)) + return; // interrupt whole term collection, so also don't iterate other subReaders + } + } + } + + protected static abstract class TermCollector { + /** attributes used for communication with the enum */ + public final AttributeSource attributes = new AttributeSource(); + + /** return false to stop collecting */ + public abstract boolean collect(BytesRef bytes) throws IOException; + + /** the next segment's {@link TermsEnum} that is used to collect terms */ + public abstract void setNextEnum(TermsEnum termsEnum) throws IOException; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java 
b/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java new file mode 100644 index 00000000000..b3e409a8df7 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java @@ -0,0 +1,182 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.PriorityQueue; +import java.util.Comparator; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; + +/** + * Base rewrite method for collecting only the top terms + * via a priority queue. + * @lucene.internal Only public to be accessible by spans package. + */ +public abstract class TopTermsRewrite extends TermCollectingRewrite { + + private final int size; + + /** + * Create a TopTermsBooleanQueryRewrite for + * at most size terms. + *

+ * NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than + * size, then it will be used instead. + */ + public TopTermsRewrite(int size) { + this.size = size; + } + + /** return the maximum priority queue size */ + public int getSize() { + return size; + } + + /** return the maximum size of the priority queue (for boolean rewrites this is BooleanQuery#getMaxClauseCount). */ + protected abstract int getMaxSize(); + + @Override + public final Q rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { + final int maxSize = Math.min(size, getMaxSize()); + final PriorityQueue stQueue = new PriorityQueue(); + collectTerms(reader, query, new TermCollector() { + private final MaxNonCompetitiveBoostAttribute maxBoostAtt = + attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class); + + private final Map visitedTerms = new HashMap(); + + private TermsEnum termsEnum; + private Comparator termComp; + private BoostAttribute boostAtt; + private ScoreTerm st; + + @Override + public void setNextEnum(TermsEnum termsEnum) throws IOException { + this.termsEnum = termsEnum; + this.termComp = termsEnum.getComparator(); + // lazy init the initial ScoreTerm because comparator is not known on ctor: + if (st == null) + st = new ScoreTerm(this.termComp); + boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); + } + + @Override + public boolean collect(BytesRef bytes) { + final float boost = boostAtt.getBoost(); + // ignore uncompetetive hits + if (stQueue.size() == maxSize) { + final ScoreTerm t = stQueue.peek(); + if (boost < t.boost) + return true; + if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0) + return true; + } + ScoreTerm t = visitedTerms.get(bytes); + if (t != null) { + // if the term is already in the PQ, only update docFreq of term in PQ + t.docFreq += termsEnum.docFreq(); + assert t.boost == boost : "boost should be equal in all segment TermsEnums"; + } else { + // add new entry in PQ, we must clone the 
term, else it may get overwritten! + st.bytes.copy(bytes); + st.boost = boost; + st.docFreq = termsEnum.docFreq(); + visitedTerms.put(st.bytes, st); + stQueue.offer(st); + // possibly drop entries from queue + if (stQueue.size() > maxSize) { + st = stQueue.poll(); + visitedTerms.remove(st.bytes); + } else { + st = new ScoreTerm(termComp); + } + assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize"; + // set maxBoostAtt with values to help FuzzyTermsEnum to optimize + if (stQueue.size() == maxSize) { + t = stQueue.peek(); + maxBoostAtt.setMaxNonCompetitiveBoost(t.boost); + maxBoostAtt.setCompetitiveTerm(t.bytes); + } + } + return true; + } + }); + + final Term placeholderTerm = new Term(query.field); + final Q q = getTopLevelQuery(); + final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]); + ArrayUtil.quickSort(scoreTerms, scoreTermSortByTermComp); + for (final ScoreTerm st : scoreTerms) { + final Term term = placeholderTerm.createTerm(st.bytes); + assert reader.docFreq(term) == st.docFreq; + addClause(q, term, st.docFreq, query.getBoost() * st.boost); // add to query + } + query.incTotalNumberOfTerms(scoreTerms.length); + return q; + } + + @Override + public int hashCode() { + return 31 * size; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; + final TopTermsRewrite other = (TopTermsRewrite) obj; + if (size != other.size) return false; + return true; + } + + private static final Comparator scoreTermSortByTermComp = + new Comparator() { + public int compare(ScoreTerm st1, ScoreTerm st2) { + assert st1.termComp == st2.termComp : + "term comparator should not change between segments"; + return st1.termComp.compare(st1.bytes, st2.bytes); + } + }; + + static final class ScoreTerm implements Comparable { + public final Comparator termComp; + + public final BytesRef bytes = new BytesRef(); + public float 
boost; + public int docFreq; + + public ScoreTerm(Comparator termComp) { + this.termComp = termComp; + } + + public int compareTo(ScoreTerm other) { + if (this.boost == other.boost) + return termComp.compare(other.bytes, this.bytes); + else + return Float.compare(this.boost, other.boost); + } + } +} diff --git a/lucene/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java b/lucene/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java new file mode 100644 index 00000000000..1b97823db21 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java @@ -0,0 +1,234 @@ +package org.apache.lucene.search.spans; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TopTermsRewrite; +import org.apache.lucene.search.ScoringRewrite; +import org.apache.lucene.search.BooleanClause.Occur; // javadocs only + +/** + * Wraps any {@link MultiTermQuery} as a {@link SpanQuery}, + * so it can be nested within other SpanQuery classes. + *

+ * The query is rewritten by default to a {@link SpanOrQuery} containing + * the expanded terms, but this can be customized. + *

+ * Example: + *

+ * {@code
+ * WildcardQuery wildcard = new WildcardQuery(new Term("field", "bro?n"));
+ * SpanQuery spanWildcard = new SpanMultiTermQueryWrapper<WildcardQuery>(wildcard);
+ * // do something with spanWildcard, such as using it in a SpanFirstQuery
+ * }
+ * 
+ */ +public class SpanMultiTermQueryWrapper extends SpanQuery { + protected final Q query; + + /** + * Create a new SpanMultiTermQueryWrapper. + * + * @param query Query to wrap. + *

+ * NOTE: This will call {@link MultiTermQuery#setRewriteMethod(MultiTermQuery.RewriteMethod)} + * on the wrapped query, changing its rewrite method to a suitable one for spans. + * Be sure to not change the rewrite method on the wrapped query afterwards! Doing so will + * throw {@link UnsupportedOperationException} on rewriting this query! + */ + public SpanMultiTermQueryWrapper(Q query) { + this.query = query; + + MultiTermQuery.RewriteMethod method = query.getRewriteMethod(); + if (method instanceof TopTermsRewrite) { + final int pqsize = ((TopTermsRewrite) method).getSize(); + setRewriteMethod(new TopTermsSpanBooleanQueryRewrite(pqsize)); + } else { + setRewriteMethod(SCORING_SPAN_QUERY_REWRITE); + } + } + + /** + * Expert: returns the rewriteMethod + */ + public final SpanRewriteMethod getRewriteMethod() { + final MultiTermQuery.RewriteMethod m = query.getRewriteMethod(); + if (!(m instanceof SpanRewriteMethod)) + throw new UnsupportedOperationException("You can only use SpanMultiTermQueryWrapper with a suitable SpanRewriteMethod."); + return (SpanRewriteMethod) m; + } + + /** + * Expert: sets the rewrite method. This only makes sense + * to be a span rewrite method. 
+ */ + public final void setRewriteMethod(SpanRewriteMethod rewriteMethod) { + query.setRewriteMethod(rewriteMethod); + } + + @Override + public Spans getSpans(IndexReader reader) throws IOException { + throw new UnsupportedOperationException("Query should have been rewritten"); + } + + @Override + public String getField() { + return query.getField(); + } + + @Override + public String toString(String field) { + StringBuilder builder = new StringBuilder(); + builder.append("SpanMultiTermQueryWrapper("); + builder.append(query.toString(field)); + builder.append(")"); + return builder.toString(); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + final Query q = query.rewrite(reader); + if (!(q instanceof SpanQuery)) + throw new UnsupportedOperationException("You can only use SpanMultiTermQueryWrapper with a suitable SpanRewriteMethod."); + return q; + } + + @Override + public int hashCode() { + return 31 * query.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; + final SpanMultiTermQueryWrapper other = (SpanMultiTermQueryWrapper) obj; + return query.equals(other.query); + } + + /** Abstract class that defines how the query is rewritten. */ + public static abstract class SpanRewriteMethod extends MultiTermQuery.RewriteMethod { + @Override + public abstract SpanQuery rewrite(IndexReader reader, MultiTermQuery query) throws IOException; + } + + /** + * A rewrite method that first translates each term into a SpanTermQuery in a + * {@link Occur#SHOULD} clause in a BooleanQuery, and keeps the + * scores as computed by the query. 
+ * + * @see #setRewriteMethod + */ + public final static SpanRewriteMethod SCORING_SPAN_QUERY_REWRITE = new SpanRewriteMethod() { + private final ScoringRewrite delegate = new ScoringRewrite() { + @Override + protected SpanOrQuery getTopLevelQuery() { + return new SpanOrQuery(); + } + + @Override + protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost) { + final SpanTermQuery q = new SpanTermQuery(term); + q.setBoost(boost); + topLevel.addClause(q); + } + }; + + @Override + public SpanQuery rewrite(IndexReader reader, MultiTermQuery query) throws IOException { + return delegate.rewrite(reader, query); + } + + // Make sure we are still a singleton even after deserializing + protected Object readResolve() { + return SCORING_SPAN_QUERY_REWRITE; + } + }; + + /** + * A rewrite method that first translates each term into a SpanTermQuery in a + * {@link Occur#SHOULD} clause in a BooleanQuery, and keeps the + * scores as computed by the query. + * + *

+ * This rewrite method only uses the top scoring terms so it will not overflow + * the boolean max clause count. + * + * @see #setRewriteMethod + */ + public static final class TopTermsSpanBooleanQueryRewrite extends SpanRewriteMethod { + private final TopTermsRewrite delegate; + + /** + * Create a TopTermsSpanBooleanQueryRewrite for + * at most size terms. + */ + public TopTermsSpanBooleanQueryRewrite(int size) { + delegate = new TopTermsRewrite(size) { + @Override + protected int getMaxSize() { + return Integer.MAX_VALUE; + } + + @Override + protected SpanOrQuery getTopLevelQuery() { + return new SpanOrQuery(); + } + + @Override + protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost) { + final SpanTermQuery q = new SpanTermQuery(term); + q.setBoost(boost); + topLevel.addClause(q); + } + }; + } + + /** return the maximum priority queue size */ + public int getSize() { + return delegate.getSize(); + } + + @Override + public SpanQuery rewrite(IndexReader reader, MultiTermQuery query) throws IOException { + return delegate.rewrite(reader, query); + } + + @Override + public int hashCode() { + return 31 * delegate.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; + final TopTermsSpanBooleanQueryRewrite other = (TopTermsSpanBooleanQueryRewrite) obj; + return delegate.equals(other.delegate); + } + + } + +} diff --git a/lucene/src/java/org/apache/lucene/search/spans/SpanOrQuery.java b/lucene/src/java/org/apache/lucene/search/spans/SpanOrQuery.java index 2951dd1f736..174304fd9dc 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/SpanOrQuery.java +++ b/lucene/src/java/org/apache/lucene/search/spans/SpanOrQuery.java @@ -42,16 +42,20 @@ public class SpanOrQuery extends SpanQuery implements Cloneable { // copy clauses array into an ArrayList this.clauses = new ArrayList(clauses.length); for (int i = 0; i < 
clauses.length; i++) { - SpanQuery clause = clauses[i]; - if (i == 0) { // check field - field = clause.getField(); - } else if (!clause.getField().equals(field)) { - throw new IllegalArgumentException("Clauses must have same field."); - } - this.clauses.add(clause); + addClause(clauses[i]); } } + /** Adds a clause to this query */ + public final void addClause(SpanQuery clause) { + if (field == null) { + field = clause.getField(); + } else if (!clause.getField().equals(field)) { + throw new IllegalArgumentException("Clauses must have same field."); + } + this.clauses.add(clause); + } + /** Return the clauses whose spans are matched. */ public SpanQuery[] getClauses() { return clauses.toArray(new SpanQuery[clauses.size()]); diff --git a/lucene/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java b/lucene/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java index 8b9f5c94d06..13f1f640195 100644 --- a/lucene/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java +++ b/lucene/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java @@ -147,8 +147,8 @@ public class TestMultiTermQueryRewrites extends LuceneTestCase { @Override protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException { return new TermRangeTermsEnum(reader, field, "2", "7", true, true, null) { - final MultiTermQuery.BoostAttribute boostAtt = - attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + final BoostAttribute boostAtt = + attributes().addAttribute(BoostAttribute.class); @Override protected AcceptStatus accept(BytesRef term) { diff --git a/lucene/src/test/org/apache/lucene/search/spans/TestSpanMultiTermQueryWrapper.java b/lucene/src/test/org/apache/lucene/search/spans/TestSpanMultiTermQueryWrapper.java new file mode 100644 index 00000000000..49437b4953d --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/spans/TestSpanMultiTermQueryWrapper.java @@ -0,0 +1,92 @@ +package 
org.apache.lucene.search.spans; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +/** + * Tests for {@link SpanMultiTermQueryWrapper}, wrapping a few MultiTermQueries. 
+ */ +public class TestSpanMultiTermQueryWrapper extends LuceneTestCase { + private Directory directory; + private IndexReader reader; + private Searcher searcher; + + @Override + public void setUp() throws Exception { + super.setUp(); + directory = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random, directory); + Document doc = new Document(); + Field field = newField("field", "", Field.Store.NO, Field.Index.ANALYZED); + doc.add(field); + + field.setValue("quick brown fox"); + iw.addDocument(doc); + field.setValue("jumps over lazy broun dog"); + iw.addDocument(doc); + field.setValue("jumps over extremely very lazy broxn dog"); + iw.addDocument(doc); + reader = iw.getReader(); + iw.close(); + searcher = new IndexSearcher(reader); + } + + @Override + public void tearDown() throws Exception { + searcher.close(); + reader.close(); + directory.close(); + super.tearDown(); + } + + public void testWildcard() throws Exception { + WildcardQuery wq = new WildcardQuery(new Term("field", "bro?n")); + SpanQuery swq = new SpanMultiTermQueryWrapper(wq); + // will only match quick brown fox + SpanFirstQuery sfq = new SpanFirstQuery(swq, 2); + assertEquals(1, searcher.search(sfq, 10).totalHits); + } + + public void testFuzzy() throws Exception { + FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan")); + SpanQuery sfq = new SpanMultiTermQueryWrapper(fq); + // will not match quick brown fox + SpanPositionRangeQuery sprq = new SpanPositionRangeQuery(sfq, 3, 6); + assertEquals(2, searcher.search(sprq, 10).totalHits); + } + + public void testFuzzy2() throws Exception { + // maximum of 1 term expansion + FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan"), 1f, 0, 1); + SpanQuery sfq = new SpanMultiTermQueryWrapper(fq); + // will only match jumps over lazy broun dog + SpanPositionRangeQuery sprq = new SpanPositionRangeQuery(sfq, 0, 100); + assertEquals(1, searcher.search(sprq, 10).totalHits); + } +}