LUCENE-2754, LUCENE-2757: Added a wrapper around MultiTermQueries to add span support: SpanMultiTermQueryWrapper<Q extends MultiTermQuery>. Using this wrapper it is easy to add fuzzy/wildcard support to e.g. a SpanNearQuery. This patch also refactors all RewriteMethods and Attributes in MultiTermQuery (the class had become unmaintainable).
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1035096 13f79535-47bb-0310-9956-ffa450edef68
parent 18c317a1e6
commit ac71ebc237
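A minimal sketch of what the new wrapper enables, assuming a plain text field named "field" (the field name, the terms, and the class name SpanWildcardExample are illustrative, not part of this commit): a WildcardQuery is wrapped so it becomes a SpanQuery and can then participate in a SpanNearQuery.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

public class SpanWildcardExample {
  public static SpanNearQuery build() {
    // Wrap a MultiTermQuery so it can be used wherever a SpanQuery is expected:
    SpanQuery wildcard =
        new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("field", "lucen*")));
    SpanQuery term = new SpanTermQuery(new Term("field", "apache"));
    // "apache" followed by a lucen* term within a slop of 5, in order:
    return new SpanNearQuery(new SpanQuery[] { term, wildcard }, 5, true);
  }
}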
@@ -716,6 +716,10 @@ New features

* LUCENE-2671: Add SortField.setMissingValue(v) to enable sorting
  behavior for documents that do not include the given field.  (ryan)

* LUCENE-2754, LUCENE-2757: Added a wrapper around MultiTermQueries
  to add span support: SpanMultiTermQueryWrapper<Q extends MultiTermQuery>.
  Using this wrapper it is easy to add fuzzy/wildcard support to e.g. a SpanNearQuery.
  (Robert Muir, Uwe Schindler)

Optimizations
@@ -177,6 +177,10 @@ API Changes

  QueryNodeProcessorPipeline now implements the List interface, this is useful
  if you want to extend or modify an existing pipeline.  (Adriano Crestani via Robert Muir)

* LUCENE-2754, LUCENE-2757: Deprecated SpanRegexQuery. Use
  new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery()) instead.
  (Robert Muir, Uwe Schindler)

New features

* LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.
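A hedged migration sketch for the deprecation noted above (the field name, the regex, and the class name SpanRegexMigration are illustrative): both methods build the same span query, the second without the deprecated class.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.regex.RegexQuery;
import org.apache.lucene.search.regex.SpanRegexQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanQuery;

public class SpanRegexMigration {
  public static SpanQuery before() {
    return new SpanRegexQuery(new Term("field", "aut.*")); // deprecated, removed in 4.0
  }

  public static SpanQuery after() {
    return new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery(new Term("field", "aut.*")));
  }
}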
@@ -201,16 +201,16 @@ public class FuzzyLikeThisQuery extends Query
    float minScore=0;
    Term startTerm=internSavingTemplateTerm.createTerm(term);
    AttributeSource atts = new AttributeSource();
    MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt =
      atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class);
    MaxNonCompetitiveBoostAttribute maxBoostAtt =
      atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
    FuzzyTermsEnum fe = new FuzzyTermsEnum(reader, atts, startTerm, f.minSimilarity, f.prefixLength);
    //store the df so all variants use same idf
    int df = reader.docFreq(startTerm);
    int numVariants=0;
    int totalVariantDocFreqs=0;
    BytesRef possibleMatch;
    MultiTermQuery.BoostAttribute boostAtt =
      fe.attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
    BoostAttribute boostAtt =
      fe.attributes().addAttribute(BoostAttribute.class);
    while ((possibleMatch = fe.next()) != null) {
      if (possibleMatch!=null) {
        numVariants++;
@ -18,115 +18,29 @@ package org.apache.lucene.search.regex;
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.MultiTermQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.search.spans.Spans;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.ArrayList;
|
||||
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
|
||||
|
||||
/**
|
||||
* A SpanQuery version of {@link RegexQuery} allowing regular expression
|
||||
* queries to be nested within other SpanQuery subclasses.
|
||||
* @deprecated Use <code>new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery())</code> instead.
|
||||
* This query will be removed in Lucene 4.0
|
||||
*/
|
||||
public class SpanRegexQuery extends SpanQuery implements RegexQueryCapable {
|
||||
private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
|
||||
private Term term;
|
||||
@Deprecated
|
||||
public class SpanRegexQuery extends SpanMultiTermQueryWrapper<RegexQuery> implements RegexQueryCapable {
|
||||
private final RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
|
||||
|
||||
public SpanRegexQuery(Term term) {
|
||||
this.term = term;
|
||||
super(new RegexQuery(term));
|
||||
}
|
||||
|
||||
public Term getTerm() { return term; }
|
||||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
RegexQuery orig = new RegexQuery(term);
|
||||
orig.setRegexImplementation(regexImpl);
|
||||
orig.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
|
||||
BooleanQuery bq = (BooleanQuery) orig.rewrite(reader);
|
||||
|
||||
BooleanClause[] clauses = bq.getClauses();
|
||||
SpanQuery[] sqs = new SpanQuery[clauses.length];
|
||||
for (int i = 0; i < clauses.length; i++) {
|
||||
BooleanClause clause = clauses[i];
|
||||
|
||||
// Clauses from RegexQuery.rewrite are always TermQuery's
|
||||
TermQuery tq = (TermQuery) clause.getQuery();
|
||||
|
||||
sqs[i] = new SpanTermQuery(tq.getTerm());
|
||||
sqs[i].setBoost(tq.getBoost());
|
||||
}
|
||||
|
||||
SpanOrQuery query = new SpanOrQuery(sqs);
|
||||
query.setBoost(orig.getBoost());
|
||||
|
||||
return query;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Spans getSpans(IndexReader reader) throws IOException {
|
||||
throw new UnsupportedOperationException("Query should have been rewritten");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getField() {
|
||||
return term.field();
|
||||
}
|
||||
|
||||
public Collection<Term> getTerms() {
|
||||
Collection<Term> terms = new ArrayList<Term>();
|
||||
terms.add(term);
|
||||
return terms;
|
||||
}
|
||||
|
||||
/* generated by IntelliJ IDEA */
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
final SpanRegexQuery that = (SpanRegexQuery) o;
|
||||
|
||||
if (!regexImpl.equals(that.regexImpl)) return false;
|
||||
if (!term.equals(that.term)) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* generated by IntelliJ IDEA */
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result;
|
||||
result = regexImpl.hashCode();
|
||||
result = 29 * result + term.hashCode();
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
buffer.append("spanRegexQuery(");
|
||||
buffer.append(term);
|
||||
buffer.append(")");
|
||||
buffer.append(ToStringUtils.boost(getBoost()));
|
||||
return buffer.toString();
|
||||
}
|
||||
public Term getTerm() { return query.getTerm(); }
|
||||
|
||||
public void setRegexImplementation(RegexCapabilities impl) {
|
||||
this.regexImpl = impl;
|
||||
query.setRegexImplementation(impl);
|
||||
}
|
||||
|
||||
public RegexCapabilities getRegexImplementation() {
|
||||
return regexImpl;
|
||||
return query.getRegexImplementation();
|
||||
}
|
||||
}
|
||||
|
@ -29,6 +29,7 @@ import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MultiSearcher;
|
||||
import org.apache.lucene.search.spans.SpanFirstQuery;
|
||||
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
@ -73,6 +74,65 @@ public class TestSpanRegexQuery extends LuceneTestCase {
|
||||
writer.optimize();
|
||||
writer.close();
|
||||
|
||||
IndexSearcher searcher = new IndexSearcher(directory, true);
|
||||
SpanQuery srq = new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery(new Term("field", "aut.*")));
|
||||
SpanFirstQuery sfq = new SpanFirstQuery(srq, 1);
|
||||
// SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6,
|
||||
// true);
|
||||
int numHits = searcher.search(sfq, null, 1000).totalHits;
|
||||
assertEquals(1, numHits);
|
||||
searcher.close();
|
||||
directory.close();
|
||||
}
|
||||
|
||||
public void testSpanRegexBug() throws CorruptIndexException, IOException {
|
||||
createRAMDirectories();
|
||||
|
||||
SpanQuery srq = new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery(new Term("field", "a.*")));
|
||||
SpanQuery stq = new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery(new Term("field", "b.*")));
|
||||
SpanNearQuery query = new SpanNearQuery(new SpanQuery[] { srq, stq }, 6,
|
||||
true);
|
||||
|
||||
// 1. Search the same store which works
|
||||
IndexSearcher[] arrSearcher = new IndexSearcher[2];
|
||||
arrSearcher[0] = new IndexSearcher(indexStoreA, true);
|
||||
arrSearcher[1] = new IndexSearcher(indexStoreB, true);
|
||||
MultiSearcher searcher = new MultiSearcher(arrSearcher);
|
||||
int numHits = searcher.search(query, null, 1000).totalHits;
|
||||
arrSearcher[0].close();
|
||||
arrSearcher[1].close();
|
||||
|
||||
// Will fail here
|
||||
// We expect 2 but only one matched
|
||||
// The rewriter function only write it once on the first IndexSearcher
|
||||
// So it's using term: a1 b1 to search on the second IndexSearcher
|
||||
// As a result, it won't match the document in the second IndexSearcher
|
||||
assertEquals(2, numHits);
|
||||
indexStoreA.close();
|
||||
indexStoreB.close();
|
||||
}
|
||||
|
||||
/** remove in lucene 4.0 */
|
||||
@Deprecated
|
||||
public void testSpanRegexOld() throws Exception {
|
||||
Directory directory = newDirectory();
|
||||
IndexWriter writer = new IndexWriter(directory, newIndexWriterConfig(
|
||||
TEST_VERSION_CURRENT, new MockAnalyzer()));
|
||||
Document doc = new Document();
|
||||
// doc.add(newField("field", "the quick brown fox jumps over the lazy dog",
|
||||
// Field.Store.NO, Field.Index.ANALYZED));
|
||||
// writer.addDocument(doc);
|
||||
// doc = new Document();
|
||||
doc.add(newField("field", "auto update", Field.Store.NO,
|
||||
Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
doc = new Document();
|
||||
doc.add(newField("field", "first auto update", Field.Store.NO,
|
||||
Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
writer.optimize();
|
||||
writer.close();
|
||||
|
||||
IndexSearcher searcher = new IndexSearcher(directory, true);
|
||||
SpanRegexQuery srq = new SpanRegexQuery(new Term("field", "aut.*"));
|
||||
SpanFirstQuery sfq = new SpanFirstQuery(srq, 1);
|
||||
@ -84,7 +144,9 @@ public class TestSpanRegexQuery extends LuceneTestCase {
|
||||
directory.close();
|
||||
}
|
||||
|
||||
public void testSpanRegexBug() throws CorruptIndexException, IOException {
|
||||
/** remove in lucene 4.0 */
|
||||
@Deprecated
|
||||
public void testSpanRegexBugOld() throws CorruptIndexException, IOException {
|
||||
createRAMDirectories();
|
||||
|
||||
SpanRegexQuery srq = new SpanRegexQuery(new Term("field", "a.*"));
|
||||
|
@ -28,7 +28,8 @@ import java.util.PriorityQueue;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.FuzzyTermsEnum;
|
||||
import org.apache.lucene.search.MultiTermQuery;
|
||||
import org.apache.lucene.search.BoostAttribute;
|
||||
import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
@ -389,16 +390,16 @@ public class DirectSpellChecker {
|
||||
IndexReader ir, int docfreq, int editDistance, float accuracy) throws IOException {
|
||||
|
||||
AttributeSource atts = new AttributeSource();
|
||||
MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
||||
atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class);
|
||||
MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
||||
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
||||
FuzzyTermsEnum e = new FuzzyTermsEnum(ir, atts, term, editDistance, Math.max(minPrefix, editDistance-1));
|
||||
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
|
||||
|
||||
BytesRef queryTerm = new BytesRef(term.text());
|
||||
BytesRef candidateTerm;
|
||||
ScoreTerm st = new ScoreTerm();
|
||||
MultiTermQuery.BoostAttribute boostAtt =
|
||||
e.attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
|
||||
BoostAttribute boostAtt =
|
||||
e.attributes().addAttribute(BoostAttribute.class);
|
||||
while ((candidateTerm = e.next()) != null) {
|
||||
final float boost = boostAtt.getBoost();
|
||||
// ignore uncompetitive hits
|
||||
|
lucene/src/java/org/apache/lucene/search/BoostAttribute.java (new file, 38 lines)
@ -0,0 +1,38 @@
|
||||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
import org.apache.lucene.util.AttributeSource; // javadocs only
|
||||
import org.apache.lucene.index.TermsEnum; // javadocs only
|
||||
|
||||
/** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link MultiTermQuery#getTermsEnum(IndexReader,AttributeSource)}
 * and update the boost on each returned term. This makes it possible to control the boost factor
 * for each matching term in {@link MultiTermQuery#SCORING_BOOLEAN_QUERY_REWRITE} or
 * {@link TopTermsRewrite} mode.
 * {@link FuzzyQuery} uses this to take the edit distance into account.
 * <p><b>Please note:</b> This attribute is intended to be added only by the TermsEnum
 * to itself in its constructor and consumed by the {@link MultiTermQuery.RewriteMethod}.
 * @lucene.internal
 */
public interface BoostAttribute extends Attribute {
  /** Sets the boost in this attribute. */
  public void setBoost(float boost);
  /** Retrieves the boost, default is {@code 1.0f}. */
  public float getBoost();
}
|
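As a hedged illustration of the consumer side described in the javadoc above (the minSimilarity/prefixLength arguments and the class name BoostAttributeSketch are illustrative; the FuzzyTermsEnum constructor signature is the one used by FuzzyLikeThisQuery earlier in this commit): the caller shares a MaxNonCompetitiveBoostAttribute with the enum through a fresh AttributeSource and reads the per-term BoostAttribute from the enum's own attributes.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;

public class BoostAttributeSketch {
  public static void printCandidates(IndexReader reader, Term term) throws IOException {
    AttributeSource atts = new AttributeSource();
    // Rewrite-side attribute; the enum reads it to skip uncompetitive terms:
    MaxNonCompetitiveBoostAttribute maxBoostAtt =
        atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
    FuzzyTermsEnum fe = new FuzzyTermsEnum(reader, atts, term, 0.5f, 0);
    // Per-term boost, set by the enum on its own attribute source:
    BoostAttribute boostAtt = fe.attributes().addAttribute(BoostAttribute.class);
    BytesRef candidate;
    while ((candidate = fe.next()) != null) {
      System.out.println(candidate.utf8ToString() + " boost=" + boostAtt.getBoost());
    }
  }
}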
@ -0,0 +1,60 @@
|
||||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
|
||||
/** Implementation class for {@link BoostAttribute}.
|
||||
* @lucene.internal
|
||||
*/
|
||||
public final class BoostAttributeImpl extends AttributeImpl implements BoostAttribute {
|
||||
private float boost = 1.0f;
|
||||
|
||||
public void setBoost(float boost) {
|
||||
this.boost = boost;
|
||||
}
|
||||
|
||||
public float getBoost() {
|
||||
return boost;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
boost = 1.0f;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (this == other)
|
||||
return true;
|
||||
if (other instanceof BoostAttributeImpl)
|
||||
return ((BoostAttributeImpl) other).boost == boost;
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Float.floatToIntBits(boost);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copyTo(AttributeImpl target) {
|
||||
((BoostAttribute) target).setBoost(boost);
|
||||
}
|
||||
}
|
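A small sketch of the attribute contract this implementation class satisfies (the class name BoostAttributeImplSketch is illustrative; the behaviour follows the code above): the default attribute factory resolves the BoostAttribute interface to BoostAttributeImpl by naming convention, and clear() restores the documented default of 1.0f.

import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.util.AttributeSource;

public class BoostAttributeImplSketch {
  public static void main(String[] args) {
    AttributeSource atts = new AttributeSource();
    // The default AttributeFactory locates BoostAttributeImpl from the interface name:
    BoostAttribute boostAtt = atts.addAttribute(BoostAttribute.class);
    boostAtt.setBoost(2.0f);
    atts.clearAttributes();                  // delegates to BoostAttributeImpl.clear()
    System.out.println(boostAtt.getBoost()); // 1.0 again, the documented default
  }
}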
@ -0,0 +1,186 @@
|
||||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.queryParser.QueryParser;
|
||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.ByteBlockPool;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefHash;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
|
||||
|
||||
class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
|
||||
|
||||
// Defaults derived from rough tests with a 20.0 million
|
||||
// doc Wikipedia index. With more than 350 terms in the
|
||||
// query, the filter method is fastest:
|
||||
public static int DEFAULT_TERM_COUNT_CUTOFF = 350;
|
||||
|
||||
// If the query will hit more than 1 in 1000 of the docs
|
||||
// in the index (0.1%), the filter method is fastest:
|
||||
public static double DEFAULT_DOC_COUNT_PERCENT = 0.1;
|
||||
|
||||
private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF;
|
||||
private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT;
|
||||
|
||||
/** If the number of terms in this query is equal to or
|
||||
* larger than this setting then {@link
|
||||
* #CONSTANT_SCORE_FILTER_REWRITE} is used. */
|
||||
public void setTermCountCutoff(int count) {
|
||||
termCountCutoff = count;
|
||||
}
|
||||
|
||||
/** @see #setTermCountCutoff */
|
||||
public int getTermCountCutoff() {
|
||||
return termCountCutoff;
|
||||
}
|
||||
|
||||
/** If the number of documents to be visited in the
|
||||
* postings exceeds this specified percentage of the
|
||||
* maxDoc() for the index, then {@link
|
||||
* #CONSTANT_SCORE_FILTER_REWRITE} is used.
|
||||
* @param percent 0.0 to 100.0 */
|
||||
public void setDocCountPercent(double percent) {
|
||||
docCountPercent = percent;
|
||||
}
|
||||
|
||||
/** @see #setDocCountPercent */
|
||||
public double getDocCountPercent() {
|
||||
return docCountPercent;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BooleanQuery getTopLevelQuery() {
|
||||
return new BooleanQuery(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/) {
|
||||
topLevel.add(new TermQuery(term, docFreq), BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
|
||||
|
||||
// Get the enum and start visiting terms. If we
|
||||
// exhaust the enum before hitting either of the
|
||||
// cutoffs, we use ConstantBooleanQueryRewrite; else,
|
||||
// ConstantFilterRewrite:
|
||||
final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
|
||||
final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
|
||||
|
||||
final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
|
||||
collectTerms(reader, query, col);
|
||||
final int size = col.pendingTerms.size();
|
||||
if (col.hasCutOff) {
|
||||
return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
|
||||
} else if (size == 0) {
|
||||
return getTopLevelQuery();
|
||||
} else {
|
||||
final BooleanQuery bq = getTopLevelQuery();
|
||||
final Term placeholderTerm = new Term(query.field);
|
||||
final BytesRefHash pendingTerms = col.pendingTerms;
|
||||
final int sort[] = pendingTerms.sort(col.termsEnum.getComparator());
|
||||
for(int i = 0; i < size; i++) {
|
||||
// docFreq is not used for constant score here, we pass 1
|
||||
// to explicitly set a fake value, so it's not calculated
|
||||
addClause(bq, placeholderTerm.createTerm(pendingTerms.get(sort[i], new BytesRef())), 1, 1.0f);
|
||||
}
|
||||
// Strip scores
|
||||
final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
|
||||
result.setBoost(query.getBoost());
|
||||
query.incTotalNumberOfTerms(size);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
static final class CutOffTermCollector extends TermCollector {
|
||||
CutOffTermCollector(int docCountCutoff, int termCountLimit) {
|
||||
this.docCountCutoff = docCountCutoff;
|
||||
this.termCountLimit = termCountLimit;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNextEnum(TermsEnum termsEnum) throws IOException {
|
||||
this.termsEnum = termsEnum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean collect(BytesRef bytes) throws IOException {
|
||||
pendingTerms.add(bytes);
|
||||
docVisitCount += termsEnum.docFreq();
|
||||
if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
|
||||
hasCutOff = true;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int docVisitCount = 0;
|
||||
boolean hasCutOff = false;
|
||||
TermsEnum termsEnum;
|
||||
|
||||
final int docCountCutoff, termCountLimit;
|
||||
final BytesRefHash pendingTerms = new BytesRefHash();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 1279;
|
||||
return (int) (prime * termCountCutoff + Double.doubleToLongBits(docCountPercent));
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (obj == null)
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
|
||||
ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj;
|
||||
if (other.termCountCutoff != termCountCutoff) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (Double.doubleToLongBits(other.docCountPercent) != Double.doubleToLongBits(docCountPercent)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
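A hedged sketch of how the rewrite methods collected by this refactoring are selected from user code (WildcardQuery, the field, and the class name RewriteMethodSketch are illustrative; setRewriteMethod and the constants are the ones referenced throughout this commit): the default CONSTANT_SCORE_AUTO_REWRITE_DEFAULT chooses between a boolean-query and a filter rewrite using the term-count and visited-doc cutoffs above, but callers may pin a method explicitly.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.WildcardQuery;

public class RewriteMethodSketch {
  public static WildcardQuery scoringWildcard() {
    WildcardQuery wq = new WildcardQuery(new Term("field", "lucen*"));
    // Default is MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; opt into scored terms:
    wq.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
    return wq;
  }

  public static WildcardQuery constantScoreWildcard() {
    WildcardQuery wq = new WildcardQuery(new Term("field", "lucen*"));
    // Always rewrite to a filter; every matching doc gets the query boost as score:
    wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
    return wq;
  }
}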
@ -49,12 +49,12 @@ import java.util.List;
|
||||
*/
|
||||
public final class FuzzyTermsEnum extends TermsEnum {
|
||||
private TermsEnum actualEnum;
|
||||
private MultiTermQuery.BoostAttribute actualBoostAtt;
|
||||
private BoostAttribute actualBoostAtt;
|
||||
|
||||
private final MultiTermQuery.BoostAttribute boostAtt =
|
||||
attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
|
||||
private final BoostAttribute boostAtt =
|
||||
attributes().addAttribute(BoostAttribute.class);
|
||||
|
||||
private final MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt;
|
||||
private final MaxNonCompetitiveBoostAttribute maxBoostAtt;
|
||||
private final LevenshteinAutomataAttribute dfaAtt;
|
||||
|
||||
private float bottom;
|
||||
@ -128,7 +128,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
||||
}
|
||||
this.scale_factor = 1.0f / (1.0f - this.minSimilarity);
|
||||
|
||||
this.maxBoostAtt = atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class);
|
||||
this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
||||
bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
|
||||
bottomTerm = maxBoostAtt.getCompetitiveTerm();
|
||||
bottomChanged(null, true);
|
||||
@ -174,8 +174,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
||||
/** swap in a new actual enum to proxy to */
|
||||
private void setEnum(TermsEnum actualEnum) {
|
||||
this.actualEnum = actualEnum;
|
||||
this.actualBoostAtt = actualEnum.attributes().addAttribute(
|
||||
MultiTermQuery.BoostAttribute.class);
|
||||
this.actualBoostAtt = actualEnum.attributes().addAttribute(BoostAttribute.class);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -300,8 +299,8 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
||||
private final BytesRef termRef;
|
||||
|
||||
private final BytesRef lastTerm;
|
||||
private final MultiTermQuery.BoostAttribute boostAtt =
|
||||
attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
|
||||
private final BoostAttribute boostAtt =
|
||||
attributes().addAttribute(BoostAttribute.class);
|
||||
|
||||
public AutomatonFuzzyTermsEnum(ByteRunAutomaton matchers[],
|
||||
BytesRef lastTerm) throws IOException {
|
||||
@ -363,8 +362,8 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
||||
// this is the text, minus the prefix
|
||||
private final int[] text;
|
||||
|
||||
private final MultiTermQuery.BoostAttribute boostAtt =
|
||||
attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
|
||||
private final BoostAttribute boostAtt =
|
||||
attributes().addAttribute(BoostAttribute.class);
|
||||
|
||||
/**
|
||||
* Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
|
||||
|
@ -0,0 +1,45 @@
|
||||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
import org.apache.lucene.util.AttributeSource; // javadocs only
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/** Add this {@link Attribute} to a fresh {@link AttributeSource} before calling
 * {@link MultiTermQuery#getTermsEnum(IndexReader,AttributeSource)}.
 * {@link FuzzyQuery} uses this to control its internal behaviour
 * to only return competitive terms.
 * <p><b>Please note:</b> This attribute is intended to be added by the {@link MultiTermQuery.RewriteMethod}
 * to an empty {@link AttributeSource} that is shared for all segments
 * during query rewrite. This attribute source is passed to all segment enums
 * on {@link MultiTermQuery#getTermsEnum(IndexReader,AttributeSource)}.
 * {@link TopTermsRewrite} uses this attribute to
 * inform all enums about the current boost that is not competitive.
 * @lucene.internal
 */
public interface MaxNonCompetitiveBoostAttribute extends Attribute {
  /** Sets the maximum boost that would not be competitive. */
  public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost);
  /** Returns the maximum boost that would not be competitive. Default is negative infinity, which means every term is competitive. */
  public float getMaxNonCompetitiveBoost();
  /** Sets the term, or <code>null</code>, that triggered the boost change. */
  public void setCompetitiveTerm(BytesRef competitiveTerm);
  /** Returns the term, or <code>null</code>, that triggered the boost change. Default is <code>null</code>, which means every term is competitive. */
  public BytesRef getCompetitiveTerm();
}
|
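A small, hedged sketch of the default contract described above (the class name MaxBoostDefaultsSketch and the boost value are illustrative): a fresh AttributeSource resolves the interface to its Impl class, and the negative-infinity default means nothing has been ruled uncompetitive until a rewrite tightens it.

import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
import org.apache.lucene.util.AttributeSource;

public class MaxBoostDefaultsSketch {
  public static void main(String[] args) {
    AttributeSource atts = new AttributeSource();
    MaxNonCompetitiveBoostAttribute maxBoostAtt =
        atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
    // Defaults: -Infinity and null, i.e. every term is still competitive:
    System.out.println(maxBoostAtt.getMaxNonCompetitiveBoost()); // -Infinity
    System.out.println(maxBoostAtt.getCompetitiveTerm());        // null
    // A TopTermsRewrite would tighten this once its priority queue is full:
    maxBoostAtt.setMaxNonCompetitiveBoost(0.42f);
  }
}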
@ -0,0 +1,78 @@
|
||||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/** Implementation class for {@link MaxNonCompetitiveBoostAttribute}.
|
||||
* @lucene.internal
|
||||
*/
|
||||
public final class MaxNonCompetitiveBoostAttributeImpl extends AttributeImpl implements MaxNonCompetitiveBoostAttribute {
|
||||
private float maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
|
||||
private BytesRef competitiveTerm = null;
|
||||
|
||||
public void setMaxNonCompetitiveBoost(final float maxNonCompetitiveBoost) {
|
||||
this.maxNonCompetitiveBoost = maxNonCompetitiveBoost;
|
||||
}
|
||||
|
||||
public float getMaxNonCompetitiveBoost() {
|
||||
return maxNonCompetitiveBoost;
|
||||
}
|
||||
|
||||
public void setCompetitiveTerm(final BytesRef competitiveTerm) {
|
||||
this.competitiveTerm = competitiveTerm;
|
||||
}
|
||||
|
||||
public BytesRef getCompetitiveTerm() {
|
||||
return competitiveTerm;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
|
||||
competitiveTerm = null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (this == other)
|
||||
return true;
|
||||
if (other instanceof MaxNonCompetitiveBoostAttributeImpl) {
|
||||
final MaxNonCompetitiveBoostAttributeImpl o = (MaxNonCompetitiveBoostAttributeImpl) other;
|
||||
return (o.maxNonCompetitiveBoost == maxNonCompetitiveBoost)
|
||||
&& (o.competitiveTerm == null ? competitiveTerm == null : o.competitiveTerm.equals(competitiveTerm));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int hash = Float.floatToIntBits(maxNonCompetitiveBoost);
|
||||
if (competitiveTerm != null) hash = 31 * hash + competitiveTerm.hashCode();
|
||||
return hash;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copyTo(AttributeImpl target) {
|
||||
final MaxNonCompetitiveBoostAttributeImpl t = (MaxNonCompetitiveBoostAttributeImpl) target;
|
||||
t.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost);
|
||||
t.setCompetitiveTerm(competitiveTerm);
|
||||
}
|
||||
}
|
@ -19,29 +19,12 @@ package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.queryParser.QueryParser;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Attribute;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.ByteBlockPool;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefHash;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.ReaderUtil;
|
||||
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
|
||||
|
||||
/**
|
||||
* An abstract {@link Query} that matches documents
|
||||
@ -80,156 +63,12 @@ public abstract class MultiTermQuery extends Query {
|
||||
protected final String field;
|
||||
protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
|
||||
transient int numberOfTerms = 0;
|
||||
|
||||
/** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum(IndexReader,AttributeSource)}
|
||||
* and update the boost on each returned term. This enables to control the boost factor
|
||||
* for each matching term in {@link #SCORING_BOOLEAN_QUERY_REWRITE} or
|
||||
* {@link TopTermsBooleanQueryRewrite} mode.
|
||||
* {@link FuzzyQuery} is using this to take the edit distance into account.
|
||||
* <p><b>Please note:</b> This attribute is intended to be added only by the TermsEnum
|
||||
* to itsself in its constructor and consumed by the {@link RewriteMethod}.
|
||||
* @lucene.internal
|
||||
*/
|
||||
public static interface BoostAttribute extends Attribute {
|
||||
/** Sets the boost in this attribute */
|
||||
public void setBoost(float boost);
|
||||
/** Retrieves the boost, default is {@code 1.0f}. */
|
||||
public float getBoost();
|
||||
}
|
||||
|
||||
/** Implementation class for {@link BoostAttribute}. */
|
||||
public static final class BoostAttributeImpl extends AttributeImpl implements BoostAttribute {
|
||||
private float boost = 1.0f;
|
||||
|
||||
public void setBoost(float boost) {
|
||||
this.boost = boost;
|
||||
}
|
||||
|
||||
public float getBoost() {
|
||||
return boost;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
boost = 1.0f;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (this == other)
|
||||
return true;
|
||||
if (other instanceof BoostAttributeImpl)
|
||||
return ((BoostAttributeImpl) other).boost == boost;
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Float.floatToIntBits(boost);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copyTo(AttributeImpl target) {
|
||||
((BoostAttribute) target).setBoost(boost);
|
||||
}
|
||||
}
|
||||
|
||||
/** Add this {@link Attribute} to a fresh {@link AttributeSource} before calling
|
||||
* {@link #getTermsEnum(IndexReader,AttributeSource)}.
|
||||
* {@link FuzzyQuery} is using this to control its internal behaviour
|
||||
* to only return competitive terms.
|
||||
* <p><b>Please note:</b> This attribute is intended to be added by the {@link RewriteMethod}
|
||||
* to an empty {@link AttributeSource} that is shared for all segments
|
||||
* during query rewrite. This attribute source is passed to all segment enums
|
||||
* on {@link #getTermsEnum(IndexReader,AttributeSource)}.
|
||||
* {@link TopTermsBooleanQueryRewrite} uses this attribute to
|
||||
* inform all enums about the current boost, that is not competitive.
|
||||
* @lucene.internal
|
||||
*/
|
||||
public static interface MaxNonCompetitiveBoostAttribute extends Attribute {
|
||||
/** This is the maximum boost that would not be competitive. */
|
||||
public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost);
|
||||
/** This is the maximum boost that would not be competitive. Default is negative infinity, which means every term is competitive. */
|
||||
public float getMaxNonCompetitiveBoost();
|
||||
/** This is the term or <code>null<code> of the term that triggered the boost change. */
|
||||
public void setCompetitiveTerm(BytesRef competitiveTerm);
|
||||
/** This is the term or <code>null<code> of the term that triggered the boost change. Default is <code>null</code>, which means every term is competitoive. */
|
||||
public BytesRef getCompetitiveTerm();
|
||||
}
|
||||
|
||||
/** Implementation class for {@link MaxNonCompetitiveBoostAttribute}. */
|
||||
public static final class MaxNonCompetitiveBoostAttributeImpl extends AttributeImpl implements MaxNonCompetitiveBoostAttribute {
|
||||
private float maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
|
||||
private BytesRef competitiveTerm = null;
|
||||
|
||||
public void setMaxNonCompetitiveBoost(final float maxNonCompetitiveBoost) {
|
||||
this.maxNonCompetitiveBoost = maxNonCompetitiveBoost;
|
||||
}
|
||||
|
||||
public float getMaxNonCompetitiveBoost() {
|
||||
return maxNonCompetitiveBoost;
|
||||
}
|
||||
|
||||
public void setCompetitiveTerm(final BytesRef competitiveTerm) {
|
||||
this.competitiveTerm = competitiveTerm;
|
||||
}
|
||||
|
||||
public BytesRef getCompetitiveTerm() {
|
||||
return competitiveTerm;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
|
||||
competitiveTerm = null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (this == other)
|
||||
return true;
|
||||
if (other instanceof MaxNonCompetitiveBoostAttributeImpl) {
|
||||
final MaxNonCompetitiveBoostAttributeImpl o = (MaxNonCompetitiveBoostAttributeImpl) other;
|
||||
return (o.maxNonCompetitiveBoost == maxNonCompetitiveBoost)
|
||||
&& (o.competitiveTerm == null ? competitiveTerm == null : o.competitiveTerm.equals(competitiveTerm));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int hash = Float.floatToIntBits(maxNonCompetitiveBoost);
|
||||
if (competitiveTerm != null) hash = 31 * hash + competitiveTerm.hashCode();
|
||||
return hash;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copyTo(AttributeImpl target) {
|
||||
final MaxNonCompetitiveBoostAttributeImpl t = (MaxNonCompetitiveBoostAttributeImpl) target;
|
||||
t.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost);
|
||||
t.setCompetitiveTerm(competitiveTerm);
|
||||
}
|
||||
}
|
||||
|
||||
/** Abstract class that defines how the query is rewritten. */
|
||||
public static abstract class RewriteMethod implements Serializable {
|
||||
public abstract Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException;
|
||||
}
|
||||
|
||||
private static final class ConstantScoreFilterRewrite extends RewriteMethod {
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader, MultiTermQuery query) {
|
||||
Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter<MultiTermQuery>(query));
|
||||
result.setBoost(query.getBoost());
|
||||
return result;
|
||||
}
|
||||
|
||||
// Make sure we are still a singleton even after deserializing
|
||||
protected Object readResolve() {
|
||||
return CONSTANT_SCORE_FILTER_REWRITE;
|
||||
}
|
||||
}
|
||||
|
||||
/** A rewrite method that first creates a private Filter,
|
||||
* by visiting each term in sequence and marking all docs
|
||||
* for that term. Matching documents are assigned a
|
||||
@ -242,162 +81,19 @@ public abstract class MultiTermQuery extends Query {
|
||||
* exception.
|
||||
*
|
||||
* @see #setRewriteMethod */
|
||||
public final static RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new ConstantScoreFilterRewrite();
|
||||
|
||||
private abstract static class BooleanQueryRewrite extends RewriteMethod {
|
||||
|
||||
protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
|
||||
final List<IndexReader> subReaders = new ArrayList<IndexReader>();
|
||||
ReaderUtil.gatherSubReaders(subReaders, reader);
|
||||
Comparator<BytesRef> lastTermComp = null;
|
||||
|
||||
for (IndexReader r : subReaders) {
|
||||
final Fields fields = r.fields();
|
||||
if (fields == null) {
|
||||
// reader has no fields
|
||||
continue;
|
||||
}
|
||||
|
||||
final Terms terms = fields.terms(query.field);
|
||||
if (terms == null) {
|
||||
// field does not exist
|
||||
continue;
|
||||
}
|
||||
|
||||
final TermsEnum termsEnum = query.getTermsEnum(r, collector.attributes);
|
||||
assert termsEnum != null;
|
||||
|
||||
if (termsEnum == TermsEnum.EMPTY)
|
||||
continue;
|
||||
|
||||
// Check comparator compatibility:
|
||||
final Comparator<BytesRef> newTermComp = termsEnum.getComparator();
|
||||
if (lastTermComp != null && newTermComp != lastTermComp)
|
||||
throw new RuntimeException("term comparator should not change between segments: "+lastTermComp+" != "+newTermComp);
|
||||
lastTermComp = newTermComp;
|
||||
|
||||
collector.setNextEnum(termsEnum);
|
||||
BytesRef bytes;
|
||||
while ((bytes = termsEnum.next()) != null) {
|
||||
termsEnum.cacheCurrentTerm();
|
||||
if (!collector.collect(bytes))
|
||||
return; // interrupt whole term collection, so also don't iterate other subReaders
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected static abstract class TermCollector {
|
||||
/** attributes used for communication with the enum */
|
||||
public final AttributeSource attributes = new AttributeSource();
|
||||
|
||||
/** return false to stop collecting */
|
||||
public abstract boolean collect(BytesRef bytes) throws IOException;
|
||||
|
||||
/** the next segment's {@link TermsEnum} that is used to collect terms */
|
||||
public abstract void setNextEnum(TermsEnum termsEnum) throws IOException;
|
||||
}
|
||||
}
|
||||
|
||||
private static class ScoringBooleanQueryRewrite extends BooleanQueryRewrite {
|
||||
public static final RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new RewriteMethod() {
|
||||
@Override
|
||||
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
|
||||
final ParallelArraysTermCollector col = new ParallelArraysTermCollector();
|
||||
collectTerms(reader, query, col);
|
||||
|
||||
final Term placeholderTerm = new Term(query.field);
|
||||
final BooleanQuery result = new BooleanQuery(true);
|
||||
final int size = col.terms.size();
|
||||
if (size > 0) {
|
||||
final int sort[] = col.terms.sort(col.termsEnum.getComparator());
|
||||
final int[] docFreq = col.array.docFreq;
|
||||
final float[] boost = col.array.boost;
|
||||
for (int i = 0; i < size; i++) {
|
||||
final int pos = sort[i];
|
||||
final Term term = placeholderTerm.createTerm(col.terms.get(pos, new BytesRef()));
|
||||
assert reader.docFreq(term) == docFreq[pos];
|
||||
final TermQuery tq = new TermQuery(term, docFreq[pos]);
|
||||
tq.setBoost(query.getBoost() * boost[pos]);
|
||||
result.add(tq, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
}
|
||||
query.incTotalNumberOfTerms(size);
|
||||
public Query rewrite(IndexReader reader, MultiTermQuery query) {
|
||||
Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter<MultiTermQuery>(query));
|
||||
result.setBoost(query.getBoost());
|
||||
return result;
|
||||
}
|
||||
|
||||
// Make sure we are still a singleton even after deserializing
|
||||
protected Object readResolve() {
|
||||
return SCORING_BOOLEAN_QUERY_REWRITE;
|
||||
return CONSTANT_SCORE_FILTER_REWRITE;
|
||||
}
|
||||
|
||||
static final class ParallelArraysTermCollector extends TermCollector {
|
||||
final TermFreqBoostByteStart array = new TermFreqBoostByteStart(16);
|
||||
final BytesRefHash terms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array);
|
||||
TermsEnum termsEnum;
|
||||
|
||||
private BoostAttribute boostAtt;
|
||||
|
||||
@Override
|
||||
public void setNextEnum(TermsEnum termsEnum) throws IOException {
|
||||
this.termsEnum = termsEnum;
|
||||
this.boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean collect(BytesRef bytes) {
|
||||
final int e = terms.add(bytes);
|
||||
if (e < 0 ) {
|
||||
// duplicate term: update docFreq
|
||||
final int pos = (-e)-1;
|
||||
array.docFreq[pos] += termsEnum.docFreq();
|
||||
assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums";
|
||||
} else {
|
||||
// new entry: we populate the entry initially
|
||||
array.docFreq[e] = termsEnum.docFreq();
|
||||
array.boost[e] = boostAtt.getBoost();
|
||||
}
|
||||
// if the new entry reaches the max clause count, we exit early
|
||||
if (e >= BooleanQuery.getMaxClauseCount())
|
||||
throw new BooleanQuery.TooManyClauses();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */
|
||||
static final class TermFreqBoostByteStart extends DirectBytesStartArray {
|
||||
int[] docFreq;
|
||||
float[] boost;
|
||||
|
||||
public TermFreqBoostByteStart(int initSize) {
|
||||
super(initSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] init() {
|
||||
final int[] ord = super.init();
|
||||
boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)];
|
||||
docFreq = new int[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_INT)];
|
||||
assert boost.length >= ord.length && docFreq.length >= ord.length;
|
||||
return ord;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] grow() {
|
||||
final int[] ord = super.grow();
|
||||
docFreq = ArrayUtil.grow(docFreq, ord.length);
|
||||
boost = ArrayUtil.grow(boost, ord.length);
|
||||
assert boost.length >= ord.length && docFreq.length >= ord.length;
|
||||
return ord;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] clear() {
|
||||
boost = null;
|
||||
docFreq = null;
|
||||
return super.clear();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/** A rewrite method that first translates each term into
|
||||
* {@link BooleanClause.Occur#SHOULD} clause in a
|
||||
@ -412,155 +108,19 @@ public abstract class MultiTermQuery extends Query {
|
||||
* exceeds {@link BooleanQuery#getMaxClauseCount}.
|
||||
*
|
||||
* @see #setRewriteMethod */
|
||||
public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite();
|
||||
|
||||
|
||||
/**
|
||||
* Base rewrite method for collecting only the top terms
|
||||
* via a priority queue.
|
||||
*/
|
||||
public static abstract class TopTermsBooleanQueryRewrite extends BooleanQueryRewrite {
|
||||
private final int size;
|
||||
|
||||
/**
|
||||
* Create a TopTermsBooleanQueryRewrite for
|
||||
* at most <code>size</code> terms.
|
||||
* <p>
|
||||
* NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than
|
||||
* <code>size</code>, then it will be used instead.
|
||||
*/
|
||||
public TopTermsBooleanQueryRewrite(int size) {
|
||||
this.size = size;
|
||||
}
|
||||
|
||||
/** Return a suitable Query for a MultiTermQuery term. */
|
||||
protected abstract Query getQuery(Term term, int docCount);
|
||||
|
||||
@Override
|
||||
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
|
||||
final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
|
||||
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
|
||||
collectTerms(reader, query, new TermCollector() {
|
||||
private final MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
||||
attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
||||
|
||||
private final Map<BytesRef,ScoreTerm> visitedTerms = new HashMap<BytesRef,ScoreTerm>();
|
||||
|
||||
private TermsEnum termsEnum;
|
||||
private Comparator<BytesRef> termComp;
|
||||
private BoostAttribute boostAtt;
|
||||
private ScoreTerm st;
|
||||
|
||||
@Override
|
||||
public void setNextEnum(TermsEnum termsEnum) throws IOException {
|
||||
this.termsEnum = termsEnum;
|
||||
this.termComp = termsEnum.getComparator();
|
||||
// lazy init the initial ScoreTerm because comparator is not known on ctor:
|
||||
if (st == null)
|
||||
st = new ScoreTerm(this.termComp);
|
||||
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean collect(BytesRef bytes) {
|
||||
final float boost = boostAtt.getBoost();
|
||||
// ignore uncompetetive hits
|
||||
if (stQueue.size() == maxSize) {
|
||||
final ScoreTerm t = stQueue.peek();
|
||||
if (boost < t.boost)
|
||||
return true;
|
||||
if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
|
||||
return true;
|
||||
}
|
||||
ScoreTerm t = visitedTerms.get(bytes);
|
||||
if (t != null) {
|
||||
// if the term is already in the PQ, only update docFreq of term in PQ
|
||||
t.docFreq += termsEnum.docFreq();
|
||||
assert t.boost == boost : "boost should be equal in all segment TermsEnums";
|
||||
} else {
|
||||
// add new entry in PQ, we must clone the term, else it may get overwritten!
|
||||
st.bytes.copy(bytes);
|
||||
st.boost = boost;
|
||||
st.docFreq = termsEnum.docFreq();
|
||||
visitedTerms.put(st.bytes, st);
|
||||
stQueue.offer(st);
|
||||
// possibly drop entries from queue
|
||||
if (stQueue.size() > maxSize) {
|
||||
st = stQueue.poll();
|
||||
visitedTerms.remove(st.bytes);
|
||||
} else {
|
||||
st = new ScoreTerm(termComp);
|
||||
}
|
||||
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
|
||||
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize
|
||||
if (stQueue.size() == maxSize) {
|
||||
t = stQueue.peek();
|
||||
maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
|
||||
maxBoostAtt.setCompetitiveTerm(t.bytes);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
});
|
||||
|
||||
final Term placeholderTerm = new Term(query.field);
|
||||
final BooleanQuery bq = new BooleanQuery(true);
|
||||
final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
|
||||
ArrayUtil.quickSort(scoreTerms, scoreTermSortByTermComp);
|
||||
for (final ScoreTerm st : scoreTerms) {
|
||||
final Term term = placeholderTerm.createTerm(st.bytes);
|
||||
assert reader.docFreq(term) == st.docFreq;
|
||||
Query tq = getQuery(term, st.docFreq);
|
||||
tq.setBoost(query.getBoost() * st.boost); // set the boost
|
||||
bq.add(tq, BooleanClause.Occur.SHOULD); // add to query
|
||||
}
|
||||
query.incTotalNumberOfTerms(scoreTerms.length);
|
||||
return bq;
|
||||
}
|
||||
public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = ScoringRewrite.SCORING_BOOLEAN_QUERY_REWRITE;
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return 31 * size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
TopTermsBooleanQueryRewrite other = (TopTermsBooleanQueryRewrite) obj;
|
||||
if (size != other.size) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
private static final Comparator<ScoreTerm> scoreTermSortByTermComp =
|
||||
new Comparator<ScoreTerm>() {
|
||||
public int compare(ScoreTerm st1, ScoreTerm st2) {
|
||||
assert st1.termComp == st2.termComp :
|
||||
"term comparator should not change between segments";
|
||||
return st1.termComp.compare(st1.bytes, st2.bytes);
|
||||
}
|
||||
};
|
||||
|
||||
static final class ScoreTerm implements Comparable<ScoreTerm> {
|
||||
public final Comparator<BytesRef> termComp;
|
||||
|
||||
public final BytesRef bytes = new BytesRef();
|
||||
public float boost;
|
||||
public int docFreq;
|
||||
|
||||
public ScoreTerm(Comparator<BytesRef> termComp) {
|
||||
this.termComp = termComp;
|
||||
}
|
||||
|
||||
public int compareTo(ScoreTerm other) {
|
||||
if (this.boost == other.boost)
|
||||
return termComp.compare(other.bytes, this.bytes);
|
||||
else
|
||||
return Float.compare(this.boost, other.boost);
|
||||
}
|
||||
}
|
||||
}
|
||||
/** Like {@link #SCORING_BOOLEAN_QUERY_REWRITE} except
|
||||
* scores are not computed. Instead, each matching
|
||||
* document receives a constant score equal to the
|
||||
* query's boost.
|
||||
*
|
||||
* <p><b>NOTE</b>: This rewrite method will hit {@link
|
||||
* BooleanQuery.TooManyClauses} if the number of terms
|
||||
* exceeds {@link BooleanQuery#getMaxClauseCount}.
|
||||
*
|
||||
* @see #setRewriteMethod */
|
||||
public final static RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = ScoringRewrite.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
|
||||
|
||||
/**
|
||||
* A rewrite method that first translates each term into
|
||||
@ -574,8 +134,7 @@ public abstract class MultiTermQuery extends Query {
|
||||
*
|
||||
* @see #setRewriteMethod
|
||||
*/
|
||||
public static final class TopTermsScoringBooleanQueryRewrite extends
|
||||
TopTermsBooleanQueryRewrite {
|
||||
public static final class TopTermsScoringBooleanQueryRewrite extends TopTermsRewrite<BooleanQuery> {
|
||||
|
||||
/**
|
||||
* Create a TopTermsScoringBooleanQueryRewrite for
|
||||
@ -589,8 +148,20 @@ public abstract class MultiTermQuery extends Query {
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Query getQuery(Term term, int docFreq) {
|
||||
return new TermQuery(term, docFreq);
|
||||
protected int getMaxSize() {
|
||||
return BooleanQuery.getMaxClauseCount();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BooleanQuery getTopLevelQuery() {
|
||||
return new BooleanQuery(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost) {
|
||||
final TermQuery tq = new TermQuery(term, docCount);
|
||||
tq.setBoost(boost);
|
||||
topLevel.add(tq, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
}
|
||||
|
||||
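A hedged usage sketch for the top-terms rewrite above (PrefixQuery, the field, the size of 50, and the class name TopTermsRewriteSketch are illustrative): any MultiTermQuery can be switched to TopTermsScoringBooleanQueryRewrite, which keeps only the best-boosted terms in a priority queue instead of expanding every matching term.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PrefixQuery;

public class TopTermsRewriteSketch {
  public static PrefixQuery topTermsPrefix() {
    PrefixQuery pq = new PrefixQuery(new Term("field", "lucen"));
    // Keep at most the 50 best terms (further capped by BooleanQuery.getMaxClauseCount()):
    pq.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(50));
    return pq;
  }
}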
@ -604,8 +175,7 @@ public abstract class MultiTermQuery extends Query {
|
||||
*
|
||||
* @see #setRewriteMethod
|
||||
*/
|
||||
public static final class TopTermsBoostOnlyBooleanQueryRewrite extends
|
||||
TopTermsBooleanQueryRewrite {
|
||||
public static final class TopTermsBoostOnlyBooleanQueryRewrite extends TopTermsRewrite<BooleanQuery> {
|
||||
|
||||
/**
|
||||
* Create a TopTermsBoostOnlyBooleanQueryRewrite for
|
||||
@ -619,45 +189,23 @@ public abstract class MultiTermQuery extends Query {
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Query getQuery(Term term, int docFreq) {
|
||||
return new ConstantScoreQuery(new QueryWrapperFilter(new TermQuery(term, docFreq)));
|
||||
protected int getMaxSize() {
|
||||
return BooleanQuery.getMaxClauseCount();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BooleanQuery getTopLevelQuery() {
|
||||
return new BooleanQuery(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost) {
|
||||
final Query q = new ConstantScoreQuery(new QueryWrapperFilter(new TermQuery(term, docFreq)));
|
||||
q.setBoost(boost);
|
||||
topLevel.add(q, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
}
|
||||
|
||||
private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable {
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
|
||||
Query result = super.rewrite(reader, query);
|
||||
assert result instanceof BooleanQuery;
|
||||
// TODO: if empty boolean query return NullQuery?
|
||||
if (!((BooleanQuery) result).clauses().isEmpty()) {
|
||||
// strip the scores off
|
||||
result = new ConstantScoreQuery(new QueryWrapperFilter(result));
|
||||
result.setBoost(query.getBoost());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Make sure we are still a singleton even after deserializing
|
||||
@Override
|
||||
protected Object readResolve() {
|
||||
return CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
|
||||
}
|
||||
}
|
||||
|
||||
/** Like {@link #SCORING_BOOLEAN_QUERY_REWRITE} except
|
||||
* scores are not computed. Instead, each matching
|
||||
* document receives a constant score equal to the
|
||||
* query's boost.
|
||||
*
|
||||
* <p><b>NOTE</b>: This rewrite method will hit {@link
|
||||
* BooleanQuery.TooManyClauses} if the number of terms
|
||||
* exceeds {@link BooleanQuery#getMaxClauseCount}.
|
||||
*
|
||||
* @see #setRewriteMethod */
|
||||
public final static RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = new ConstantScoreBooleanQueryRewrite();
|
||||
|
||||
|
||||
|
||||
/** A rewrite method that tries to pick the best
|
||||
* constant-score rewrite method based on term and
|
||||
* document counts from the query. If both the number of
|
||||
@ -666,140 +214,7 @@ public abstract class MultiTermQuery extends Query {
|
||||
* Otherwise, {@link #CONSTANT_SCORE_FILTER_REWRITE} is
|
||||
* used.
|
||||
*/
|
||||
public static class ConstantScoreAutoRewrite extends BooleanQueryRewrite {
|
||||
|
||||
// Defaults derived from rough tests with a 20.0 million
|
||||
// doc Wikipedia index. With more than 350 terms in the
|
||||
// query, the filter method is fastest:
|
||||
public static int DEFAULT_TERM_COUNT_CUTOFF = 350;
|
||||
|
||||
// If the query will hit more than 1 in 1000 of the docs
|
||||
// in the index (0.1%), the filter method is fastest:
|
||||
public static double DEFAULT_DOC_COUNT_PERCENT = 0.1;
|
||||
|
||||
private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF;
|
||||
private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT;
|
||||
|
||||
/** If the number of terms in this query is equal to or
|
||||
* larger than this setting then {@link
|
||||
* #CONSTANT_SCORE_FILTER_REWRITE} is used. */
|
||||
public void setTermCountCutoff(int count) {
|
||||
termCountCutoff = count;
|
||||
}
|
||||
|
||||
/** @see #setTermCountCutoff */
|
||||
public int getTermCountCutoff() {
|
||||
return termCountCutoff;
|
||||
}
|
||||
|
||||
/** If the number of documents to be visited in the
|
||||
* postings exceeds this specified percentage of the
|
||||
* maxDoc() for the index, then {@link
|
||||
* #CONSTANT_SCORE_FILTER_REWRITE} is used.
|
||||
* @param percent 0.0 to 100.0 */
|
||||
public void setDocCountPercent(double percent) {
|
||||
docCountPercent = percent;
|
||||
}
|
||||
|
||||
/** @see #setDocCountPercent */
|
||||
public double getDocCountPercent() {
|
||||
return docCountPercent;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
|
||||
|
||||
// Get the enum and start visiting terms. If we
|
||||
// exhaust the enum before hitting either of the
|
||||
// cutoffs, we use ConstantBooleanQueryRewrite; else,
|
||||
// ConstantFilterRewrite:
|
||||
final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
|
||||
final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
|
||||
|
||||
final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
|
||||
collectTerms(reader, query, col);
|
||||
final int size = col.pendingTerms.size();
|
||||
if (col.hasCutOff) {
|
||||
return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
|
||||
} else if (size == 0) {
|
||||
return new BooleanQuery(true);
|
||||
} else {
|
||||
final BooleanQuery bq = new BooleanQuery(true);
|
||||
final Term placeholderTerm = new Term(query.field);
|
||||
final BytesRefHash pendingTerms = col.pendingTerms;
|
||||
final int sort[] = pendingTerms.sort(col.termsEnum.getComparator());
|
||||
for(int i = 0; i < size; i++) {
|
||||
// docFreq is not used for constant score here, we pass 1
|
||||
// to explicitly set a fake value, so it's not calculated
|
||||
bq.add(new TermQuery(
|
||||
placeholderTerm.createTerm(pendingTerms.get(sort[i], new BytesRef())), 1
|
||||
), BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
// Strip scores
|
||||
final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
|
||||
result.setBoost(query.getBoost());
|
||||
query.incTotalNumberOfTerms(size);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
static final class CutOffTermCollector extends TermCollector {
|
||||
CutOffTermCollector(int docCountCutoff, int termCountLimit) {
|
||||
this.docCountCutoff = docCountCutoff;
|
||||
this.termCountLimit = termCountLimit;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNextEnum(TermsEnum termsEnum) throws IOException {
|
||||
this.termsEnum = termsEnum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean collect(BytesRef bytes) throws IOException {
|
||||
pendingTerms.add(bytes);
|
||||
docVisitCount += termsEnum.docFreq();
|
||||
if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
|
||||
hasCutOff = true;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int docVisitCount = 0;
|
||||
boolean hasCutOff = false;
|
||||
TermsEnum termsEnum;
|
||||
|
||||
final int docCountCutoff, termCountLimit;
|
||||
final BytesRefHash pendingTerms = new BytesRefHash();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 1279;
|
||||
return (int) (prime * termCountCutoff + Double.doubleToLongBits(docCountPercent));
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (obj == null)
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
|
||||
ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj;
|
||||
if (other.termCountCutoff != termCountCutoff) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (Double.doubleToLongBits(other.docCountPercent) != Double.doubleToLongBits(docCountPercent)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
public static class ConstantScoreAutoRewrite extends org.apache.lucene.search.ConstantScoreAutoRewrite {}
|
||||
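The term-count and doc-count cutoffs can be tuned per query; a minimal sketch, assuming a PrefixQuery and cutoff values picked purely for illustration:

// hypothetical caller: make the filter rewrite kick in earlier than the defaults
PrefixQuery pq = new PrefixQuery(new Term("body", "luc"));
MultiTermQuery.ConstantScoreAutoRewrite rewrite = new MultiTermQuery.ConstantScoreAutoRewrite();
rewrite.setTermCountCutoff(64);    // use the filter once 64 terms have been collected
rewrite.setDocCountPercent(0.5);   // ...or once 0.5% of maxDoc() would be visited
pq.setRewriteMethod(rewrite);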
|
||||
/** Read-only default instance of {@link
|
||||
* ConstantScoreAutoRewrite}, with {@link
|
||||
@ -851,7 +266,7 @@ public abstract class MultiTermQuery extends Query {
|
||||
* positioned to the first matching term.
|
||||
* The given {@link AttributeSource} is passed by the {@link RewriteMethod} to
|
||||
* provide attributes which the rewrite method uses to inform the enum about, e.g., the maximum competitive boost.
|
||||
* This is currently only used by {@link TopTermsBooleanQueryRewrite}
|
||||
* This is currently only used by {@link TopTermsRewrite}
|
||||
*/
|
||||
protected abstract TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException;
|
||||
|
||||
|
lucene/src/java/org/apache/lucene/search/ScoringRewrite.java (new file, 203 lines)
@ -0,0 +1,203 @@
|
||||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.ByteBlockPool;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefHash;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
|
||||
|
||||
/** @lucene.internal Only public to be accessible by spans package. */
|
||||
public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewrite<Q> {
|
||||
|
||||
/** A rewrite method that first translates each term into
|
||||
* {@link BooleanClause.Occur#SHOULD} clause in a
|
||||
* BooleanQuery, and keeps the scores as computed by the
|
||||
* query. Note that typically such scores are
|
||||
* meaningless to the user, and require non-trivial CPU
|
||||
* to compute, so it's almost always better to use {@link
|
||||
* MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead.
|
||||
*
|
||||
* <p><b>NOTE</b>: This rewrite method will hit {@link
|
||||
* BooleanQuery.TooManyClauses} if the number of terms
|
||||
* exceeds {@link BooleanQuery#getMaxClauseCount}.
|
||||
*
|
||||
* @see #setRewriteMethod */
|
||||
public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringRewrite<BooleanQuery>() {
|
||||
@Override
|
||||
protected BooleanQuery getTopLevelQuery() {
|
||||
return new BooleanQuery(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost) {
|
||||
final TermQuery tq = new TermQuery(term, docCount);
|
||||
tq.setBoost(boost);
|
||||
topLevel.add(tq, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
|
||||
// Make sure we are still a singleton even after deserializing
|
||||
protected Object readResolve() {
|
||||
return SCORING_BOOLEAN_QUERY_REWRITE;
|
||||
}
|
||||
};
|
||||
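From the caller's side the scoring rewrite is selected the same way; a small sketch (field and term are illustrative), keeping in mind that it can throw BooleanQuery.TooManyClauses for large expansions:

// hypothetical caller: keep the per-term boosts/scores of the expanded terms
WildcardQuery wq = new WildcardQuery(new Term("title", "lucen*"));
wq.setRewriteMethod(ScoringRewrite.SCORING_BOOLEAN_QUERY_REWRITE);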
|
||||
/** Like {@link #SCORING_BOOLEAN_QUERY_REWRITE} except
|
||||
* scores are not computed. Instead, each matching
|
||||
* document receives a constant score equal to the
|
||||
* query's boost.
|
||||
*
|
||||
* <p><b>NOTE</b>: This rewrite method will hit {@link
|
||||
* BooleanQuery.TooManyClauses} if the number of terms
|
||||
* exceeds {@link BooleanQuery#getMaxClauseCount}.
|
||||
*
|
||||
* @see #setRewriteMethod */
|
||||
public final static RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = new RewriteMethod() {
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
|
||||
Query result = SCORING_BOOLEAN_QUERY_REWRITE.rewrite(reader, query);
|
||||
assert result instanceof BooleanQuery;
|
||||
// TODO: if empty boolean query return NullQuery?
|
||||
if (!((BooleanQuery) result).clauses().isEmpty()) {
|
||||
// strip the scores off
|
||||
result = new ConstantScoreQuery(new QueryWrapperFilter(result));
|
||||
result.setBoost(query.getBoost());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Make sure we are still a singleton even after deserializing
|
||||
protected Object readResolve() {
|
||||
return CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
|
||||
}
|
||||
};
|
||||
|
||||
@Override
|
||||
public final Q rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
|
||||
final Q result = getTopLevelQuery();
|
||||
final ParallelArraysTermCollector col = new ParallelArraysTermCollector(result instanceof BooleanQuery);
|
||||
collectTerms(reader, query, col);
|
||||
|
||||
final Term placeholderTerm = new Term(query.field);
|
||||
final int size = col.terms.size();
|
||||
if (size > 0) {
|
||||
final int sort[] = col.terms.sort(col.termsEnum.getComparator());
|
||||
final int[] docFreq = col.array.docFreq;
|
||||
final float[] boost = col.array.boost;
|
||||
for (int i = 0; i < size; i++) {
|
||||
final int pos = sort[i];
|
||||
final Term term = placeholderTerm.createTerm(col.terms.get(pos, new BytesRef()));
|
||||
assert reader.docFreq(term) == docFreq[pos];
|
||||
addClause(result, term, docFreq[pos], query.getBoost() * boost[pos]);
|
||||
}
|
||||
}
|
||||
query.incTotalNumberOfTerms(size);
|
||||
return result;
|
||||
}
|
||||
|
||||
static final class ParallelArraysTermCollector extends TermCollector {
|
||||
private final boolean checkMaxClauseCount;
|
||||
final TermFreqBoostByteStart array = new TermFreqBoostByteStart(16);
|
||||
final BytesRefHash terms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array);
|
||||
TermsEnum termsEnum;
|
||||
|
||||
private BoostAttribute boostAtt;
|
||||
|
||||
public ParallelArraysTermCollector(boolean checkMaxClauseCount) {
|
||||
this.checkMaxClauseCount = checkMaxClauseCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNextEnum(TermsEnum termsEnum) throws IOException {
|
||||
this.termsEnum = termsEnum;
|
||||
this.boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean collect(BytesRef bytes) {
|
||||
final int e = terms.add(bytes);
|
||||
if (e < 0 ) {
|
||||
// duplicate term: update docFreq
|
||||
final int pos = (-e)-1;
|
||||
array.docFreq[pos] += termsEnum.docFreq();
|
||||
assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums";
|
||||
} else {
|
||||
// new entry: we populate the entry initially
|
||||
array.docFreq[e] = termsEnum.docFreq();
|
||||
array.boost[e] = boostAtt.getBoost();
|
||||
}
|
||||
// if the new entry reaches the max clause count, we exit early
|
||||
if (checkMaxClauseCount && e >= BooleanQuery.getMaxClauseCount())
|
||||
throw new BooleanQuery.TooManyClauses();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */
|
||||
static final class TermFreqBoostByteStart extends DirectBytesStartArray {
|
||||
int[] docFreq;
|
||||
float[] boost;
|
||||
|
||||
public TermFreqBoostByteStart(int initSize) {
|
||||
super(initSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] init() {
|
||||
final int[] ord = super.init();
|
||||
boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)];
|
||||
docFreq = new int[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_INT)];
|
||||
assert boost.length >= ord.length && docFreq.length >= ord.length;
|
||||
return ord;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] grow() {
|
||||
final int[] ord = super.grow();
|
||||
docFreq = ArrayUtil.grow(docFreq, ord.length);
|
||||
boost = ArrayUtil.grow(boost, ord.length);
|
||||
assert boost.length >= ord.length && docFreq.length >= ord.length;
|
||||
return ord;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] clear() {
|
||||
boost = null;
|
||||
docFreq = null;
|
||||
return super.clear();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,93 @@
|
||||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.ReaderUtil;
|
||||
|
||||
abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.RewriteMethod {
|
||||
|
||||
/** Return a suitable top-level Query for holding all expanded terms. */
|
||||
protected abstract Q getTopLevelQuery() throws IOException;
|
||||
|
||||
/** Add a MultiTermQuery term to the top-level query */
|
||||
protected abstract void addClause(Q topLevel, Term term, int docCount, float boost) throws IOException;
|
||||
|
||||
protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
|
||||
final List<IndexReader> subReaders = new ArrayList<IndexReader>();
|
||||
ReaderUtil.gatherSubReaders(subReaders, reader);
|
||||
Comparator<BytesRef> lastTermComp = null;
|
||||
|
||||
for (IndexReader r : subReaders) {
|
||||
final Fields fields = r.fields();
|
||||
if (fields == null) {
|
||||
// reader has no fields
|
||||
continue;
|
||||
}
|
||||
|
||||
final Terms terms = fields.terms(query.field);
|
||||
if (terms == null) {
|
||||
// field does not exist
|
||||
continue;
|
||||
}
|
||||
|
||||
final TermsEnum termsEnum = query.getTermsEnum(r, collector.attributes);
|
||||
assert termsEnum != null;
|
||||
|
||||
if (termsEnum == TermsEnum.EMPTY)
|
||||
continue;
|
||||
|
||||
// Check comparator compatibility:
|
||||
final Comparator<BytesRef> newTermComp = termsEnum.getComparator();
|
||||
if (lastTermComp != null && newTermComp != lastTermComp)
|
||||
throw new RuntimeException("term comparator should not change between segments: "+lastTermComp+" != "+newTermComp);
|
||||
lastTermComp = newTermComp;
|
||||
|
||||
collector.setNextEnum(termsEnum);
|
||||
BytesRef bytes;
|
||||
while ((bytes = termsEnum.next()) != null) {
|
||||
termsEnum.cacheCurrentTerm();
|
||||
if (!collector.collect(bytes))
|
||||
return; // interrupt whole term collection, so also don't iterate other subReaders
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected static abstract class TermCollector {
|
||||
/** attributes used for communication with the enum */
|
||||
public final AttributeSource attributes = new AttributeSource();
|
||||
|
||||
/** return false to stop collecting */
|
||||
public abstract boolean collect(BytesRef bytes) throws IOException;
|
||||
|
||||
/** the next segment's {@link TermsEnum} that is used to collect terms */
|
||||
public abstract void setNextEnum(TermsEnum termsEnum) throws IOException;
|
||||
}
|
||||
}
|
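To illustrate the collector contract driven by collectTerms above, a minimal sketch of a TermCollector that only counts expanded terms (an illustrative class, assumed to live in the org.apache.lucene.search package since the API is package-level):

// hypothetical collector: counts how many terms the MultiTermQuery expands to
class CountingTermCollector extends TermCollectingRewrite.TermCollector {
  int termCount;

  @Override
  public void setNextEnum(TermsEnum termsEnum) throws IOException {
    // a fresh TermsEnum is supplied for every sub-reader before collect() is called
  }

  @Override
  public boolean collect(BytesRef bytes) throws IOException {
    termCount++;
    return true; // returning false stops collection for the remaining sub-readers as well
  }
}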
lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java (new file, 182 lines)
@ -0,0 +1,182 @@
|
||||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* Base rewrite method for collecting only the top terms
|
||||
* via a priority queue.
|
||||
* @lucene.internal Only public to be accessible by spans package.
|
||||
*/
|
||||
public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRewrite<Q> {
|
||||
|
||||
private final int size;
|
||||
|
||||
/**
|
||||
* Create a TopTermsRewrite for
|
||||
* at most <code>size</code> terms.
|
||||
* <p>
|
||||
* NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than
|
||||
* <code>size</code>, then it will be used instead.
|
||||
*/
|
||||
public TopTermsRewrite(int size) {
|
||||
this.size = size;
|
||||
}
|
||||
|
||||
/** return the maximum priority queue size */
|
||||
public int getSize() {
|
||||
return size;
|
||||
}
|
||||
|
||||
/** return the maximum size of the priority queue (for boolean rewrites this is BooleanQuery#getMaxClauseCount). */
|
||||
protected abstract int getMaxSize();
|
||||
|
||||
@Override
|
||||
public final Q rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
|
||||
final int maxSize = Math.min(size, getMaxSize());
|
||||
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
|
||||
collectTerms(reader, query, new TermCollector() {
|
||||
private final MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
||||
attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
||||
|
||||
private final Map<BytesRef,ScoreTerm> visitedTerms = new HashMap<BytesRef,ScoreTerm>();
|
||||
|
||||
private TermsEnum termsEnum;
|
||||
private Comparator<BytesRef> termComp;
|
||||
private BoostAttribute boostAtt;
|
||||
private ScoreTerm st;
|
||||
|
||||
@Override
|
||||
public void setNextEnum(TermsEnum termsEnum) throws IOException {
|
||||
this.termsEnum = termsEnum;
|
||||
this.termComp = termsEnum.getComparator();
|
||||
// lazy init the initial ScoreTerm because comparator is not known on ctor:
|
||||
if (st == null)
|
||||
st = new ScoreTerm(this.termComp);
|
||||
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean collect(BytesRef bytes) {
|
||||
final float boost = boostAtt.getBoost();
|
||||
// ignore uncompetitive hits
|
||||
if (stQueue.size() == maxSize) {
|
||||
final ScoreTerm t = stQueue.peek();
|
||||
if (boost < t.boost)
|
||||
return true;
|
||||
if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
|
||||
return true;
|
||||
}
|
||||
ScoreTerm t = visitedTerms.get(bytes);
|
||||
if (t != null) {
|
||||
// if the term is already in the PQ, only update docFreq of term in PQ
|
||||
t.docFreq += termsEnum.docFreq();
|
||||
assert t.boost == boost : "boost should be equal in all segment TermsEnums";
|
||||
} else {
|
||||
// add new entry in PQ, we must clone the term, else it may get overwritten!
|
||||
st.bytes.copy(bytes);
|
||||
st.boost = boost;
|
||||
st.docFreq = termsEnum.docFreq();
|
||||
visitedTerms.put(st.bytes, st);
|
||||
stQueue.offer(st);
|
||||
// possibly drop entries from queue
|
||||
if (stQueue.size() > maxSize) {
|
||||
st = stQueue.poll();
|
||||
visitedTerms.remove(st.bytes);
|
||||
} else {
|
||||
st = new ScoreTerm(termComp);
|
||||
}
|
||||
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
|
||||
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize
|
||||
if (stQueue.size() == maxSize) {
|
||||
t = stQueue.peek();
|
||||
maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
|
||||
maxBoostAtt.setCompetitiveTerm(t.bytes);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
});
|
||||
|
||||
final Term placeholderTerm = new Term(query.field);
|
||||
final Q q = getTopLevelQuery();
|
||||
final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
|
||||
ArrayUtil.quickSort(scoreTerms, scoreTermSortByTermComp);
|
||||
for (final ScoreTerm st : scoreTerms) {
|
||||
final Term term = placeholderTerm.createTerm(st.bytes);
|
||||
assert reader.docFreq(term) == st.docFreq;
|
||||
addClause(q, term, st.docFreq, query.getBoost() * st.boost); // add to query
|
||||
}
|
||||
query.incTotalNumberOfTerms(scoreTerms.length);
|
||||
return q;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return 31 * size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
final TopTermsRewrite other = (TopTermsRewrite) obj;
|
||||
if (size != other.size) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
private static final Comparator<ScoreTerm> scoreTermSortByTermComp =
|
||||
new Comparator<ScoreTerm>() {
|
||||
public int compare(ScoreTerm st1, ScoreTerm st2) {
|
||||
assert st1.termComp == st2.termComp :
|
||||
"term comparator should not change between segments";
|
||||
return st1.termComp.compare(st1.bytes, st2.bytes);
|
||||
}
|
||||
};
|
||||
|
||||
static final class ScoreTerm implements Comparable<ScoreTerm> {
|
||||
public final Comparator<BytesRef> termComp;
|
||||
|
||||
public final BytesRef bytes = new BytesRef();
|
||||
public float boost;
|
||||
public int docFreq;
|
||||
|
||||
public ScoreTerm(Comparator<BytesRef> termComp) {
|
||||
this.termComp = termComp;
|
||||
}
|
||||
|
||||
public int compareTo(ScoreTerm other) {
|
||||
if (this.boost == other.boost)
|
||||
return termComp.compare(other.bytes, this.bytes);
|
||||
else
|
||||
return Float.compare(this.boost, other.boost);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,234 @@
|
||||
package org.apache.lucene.search.spans;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.MultiTermQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TopTermsRewrite;
|
||||
import org.apache.lucene.search.ScoringRewrite;
|
||||
import org.apache.lucene.search.BooleanClause.Occur; // javadocs only
|
||||
|
||||
/**
|
||||
* Wraps any {@link MultiTermQuery} as a {@link SpanQuery},
|
||||
* so it can be nested within other SpanQuery classes.
|
||||
* <p>
|
||||
* The query is rewritten by default to a {@link SpanOrQuery} containing
|
||||
* the expanded terms, but this can be customized.
|
||||
* <p>
|
||||
* Example:
|
||||
* <blockquote><pre>
|
||||
* {@code
|
||||
* WildcardQuery wildcard = new WildcardQuery(new Term("field", "bro?n"));
|
||||
* SpanQuery spanWildcard = new SpanMultiTermQueryWrapper<WildcardQuery>(wildcard);
|
||||
* // do something with spanWildcard, such as use it in a SpanFirstQuery
|
||||
* }
|
||||
* </pre></blockquote>
|
||||
*/
|
||||
public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQuery {
|
||||
protected final Q query;
|
||||
|
||||
/**
|
||||
* Create a new SpanMultiTermQueryWrapper.
|
||||
*
|
||||
* @param query Query to wrap.
|
||||
* <p>
|
||||
* NOTE: This will call {@link MultiTermQuery#setRewriteMethod(MultiTermQuery.RewriteMethod)}
|
||||
* on the wrapped <code>query</code>, changing its rewrite method to a suitable one for spans.
|
||||
* Be sure to not change the rewrite method on the wrapped query afterwards! Doing so will
|
||||
* throw {@link UnsupportedOperationException} on rewriting this query!
|
||||
*/
|
||||
public SpanMultiTermQueryWrapper(Q query) {
|
||||
this.query = query;
|
||||
|
||||
MultiTermQuery.RewriteMethod method = query.getRewriteMethod();
|
||||
if (method instanceof TopTermsRewrite) {
|
||||
final int pqsize = ((TopTermsRewrite) method).getSize();
|
||||
setRewriteMethod(new TopTermsSpanBooleanQueryRewrite(pqsize));
|
||||
} else {
|
||||
setRewriteMethod(SCORING_SPAN_QUERY_REWRITE);
|
||||
}
|
||||
}
|
||||
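Because the wrapper rewrites to a SpanQuery, it nests like any other span clause; a hedged sketch (field, terms, and slop are illustrative, and SpanNearQuery is the existing span class, not part of this patch):

// hypothetical caller: a fuzzy term combined with an exact term inside a SpanNearQuery
SpanQuery fuzzy = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("field", "broan")));
SpanQuery exact = new SpanTermQuery(new Term("field", "dog"));
SpanNearQuery near = new SpanNearQuery(new SpanQuery[] { fuzzy, exact }, 3, true);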
|
||||
/**
|
||||
* Expert: returns the rewriteMethod
|
||||
*/
|
||||
public final SpanRewriteMethod getRewriteMethod() {
|
||||
final MultiTermQuery.RewriteMethod m = query.getRewriteMethod();
|
||||
if (!(m instanceof SpanRewriteMethod))
|
||||
throw new UnsupportedOperationException("You can only use SpanMultiTermQueryWrapper with a suitable SpanRewriteMethod.");
|
||||
return (SpanRewriteMethod) m;
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: sets the rewrite method. This only makes sense
|
||||
* to be a span rewrite method.
|
||||
*/
|
||||
public final void setRewriteMethod(SpanRewriteMethod rewriteMethod) {
|
||||
query.setRewriteMethod(rewriteMethod);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Spans getSpans(IndexReader reader) throws IOException {
|
||||
throw new UnsupportedOperationException("Query should have been rewritten");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getField() {
|
||||
return query.getField();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append("SpanMultiTermQueryWrapper(");
|
||||
builder.append(query.toString(field));
|
||||
builder.append(")");
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
final Query q = query.rewrite(reader);
|
||||
if (!(q instanceof SpanQuery))
|
||||
throw new UnsupportedOperationException("You can only use SpanMultiTermQueryWrapper with a suitable SpanRewriteMethod.");
|
||||
return q;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return 31 * query.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
final SpanMultiTermQueryWrapper other = (SpanMultiTermQueryWrapper) obj;
|
||||
return query.equals(other.query);
|
||||
}
|
||||
|
||||
/** Abstract class that defines how the query is rewritten. */
|
||||
public static abstract class SpanRewriteMethod extends MultiTermQuery.RewriteMethod {
|
||||
@Override
|
||||
public abstract SpanQuery rewrite(IndexReader reader, MultiTermQuery query) throws IOException;
|
||||
}
|
||||
|
||||
/**
|
||||
* A rewrite method that first translates each term into a SpanTermQuery in a
|
||||
* {@link SpanOrQuery} clause, and keeps the
|
||||
* scores as computed by the query.
|
||||
*
|
||||
* @see #setRewriteMethod
|
||||
*/
|
||||
public final static SpanRewriteMethod SCORING_SPAN_QUERY_REWRITE = new SpanRewriteMethod() {
|
||||
private final ScoringRewrite<SpanOrQuery> delegate = new ScoringRewrite<SpanOrQuery>() {
|
||||
@Override
|
||||
protected SpanOrQuery getTopLevelQuery() {
|
||||
return new SpanOrQuery();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost) {
|
||||
final SpanTermQuery q = new SpanTermQuery(term);
|
||||
q.setBoost(boost);
|
||||
topLevel.addClause(q);
|
||||
}
|
||||
};
|
||||
|
||||
@Override
|
||||
public SpanQuery rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
|
||||
return delegate.rewrite(reader, query);
|
||||
}
|
||||
|
||||
// Make sure we are still a singleton even after deserializing
|
||||
protected Object readResolve() {
|
||||
return SCORING_SPAN_QUERY_REWRITE;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* A rewrite method that first translates each term into a SpanTermQuery in a
|
||||
* {@link SpanOrQuery} clause, and keeps the
|
||||
* scores as computed by the query.
|
||||
*
|
||||
* <p>
|
||||
* This rewrite method only uses the top scoring terms so it will not overflow
|
||||
* the boolean max clause count.
|
||||
*
|
||||
* @see #setRewriteMethod
|
||||
*/
|
||||
public static final class TopTermsSpanBooleanQueryRewrite extends SpanRewriteMethod {
|
||||
private final TopTermsRewrite<SpanOrQuery> delegate;
|
||||
|
||||
/**
|
||||
* Create a TopTermsSpanBooleanQueryRewrite for
|
||||
* at most <code>size</code> terms.
|
||||
*/
|
||||
public TopTermsSpanBooleanQueryRewrite(int size) {
|
||||
delegate = new TopTermsRewrite<SpanOrQuery>(size) {
|
||||
@Override
|
||||
protected int getMaxSize() {
|
||||
return Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected SpanOrQuery getTopLevelQuery() {
|
||||
return new SpanOrQuery();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost) {
|
||||
final SpanTermQuery q = new SpanTermQuery(term);
|
||||
q.setBoost(boost);
|
||||
topLevel.addClause(q);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/** return the maximum priority queue size */
|
||||
public int getSize() {
|
||||
return delegate.getSize();
|
||||
}
|
||||
|
||||
@Override
|
||||
public SpanQuery rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
|
||||
return delegate.rewrite(reader, query);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return 31 * delegate.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
final TopTermsSpanBooleanQueryRewrite other = (TopTermsSpanBooleanQueryRewrite) obj;
|
||||
return delegate.equals(other.delegate);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
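If the default scoring rewrite would expand to too many terms, the top-terms span rewrite can be selected explicitly; a minimal sketch (the wildcard and the size of 25 are illustrative):

// hypothetical caller: cap the wrapped wildcard at its 25 best expansions
SpanMultiTermQueryWrapper<WildcardQuery> wrapper =
    new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("field", "bro*")));
wrapper.setRewriteMethod(new SpanMultiTermQueryWrapper.TopTermsSpanBooleanQueryRewrite(25));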
@ -42,16 +42,20 @@ public class SpanOrQuery extends SpanQuery implements Cloneable {
|
||||
// copy clauses array into an ArrayList
|
||||
this.clauses = new ArrayList<SpanQuery>(clauses.length);
|
||||
for (int i = 0; i < clauses.length; i++) {
|
||||
SpanQuery clause = clauses[i];
|
||||
if (i == 0) { // check field
|
||||
field = clause.getField();
|
||||
} else if (!clause.getField().equals(field)) {
|
||||
throw new IllegalArgumentException("Clauses must have same field.");
|
||||
}
|
||||
this.clauses.add(clause);
|
||||
addClause(clauses[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/** Adds a clause to this query */
|
||||
public final void addClause(SpanQuery clause) {
|
||||
if (field == null) {
|
||||
field = clause.getField();
|
||||
} else if (!clause.getField().equals(field)) {
|
||||
throw new IllegalArgumentException("Clauses must have same field.");
|
||||
}
|
||||
this.clauses.add(clause);
|
||||
}
|
||||
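The new addClause method lets callers build the disjunction incrementally instead of passing an array up front; for instance (field and terms illustrative):

SpanOrQuery combined = new SpanOrQuery();
combined.addClause(new SpanTermQuery(new Term("field", "quick")));
combined.addClause(new SpanTermQuery(new Term("field", "fast")));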
|
||||
/** Return the clauses whose spans are matched. */
|
||||
public SpanQuery[] getClauses() {
|
||||
return clauses.toArray(new SpanQuery[clauses.size()]);
|
||||
|
@ -147,8 +147,8 @@ public class TestMultiTermQueryRewrites extends LuceneTestCase {
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
|
||||
return new TermRangeTermsEnum(reader, field, "2", "7", true, true, null) {
|
||||
final MultiTermQuery.BoostAttribute boostAtt =
|
||||
attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
|
||||
final BoostAttribute boostAtt =
|
||||
attributes().addAttribute(BoostAttribute.class);
|
||||
|
||||
@Override
|
||||
protected AcceptStatus accept(BytesRef term) {
|
||||
|
@ -0,0 +1,92 @@
|
||||
package org.apache.lucene.search.spans;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Searcher;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
/**
|
||||
* Tests for {@link SpanMultiTermQueryWrapper}, wrapping a few MultiTermQueries.
|
||||
*/
|
||||
public class TestSpanMultiTermQueryWrapper extends LuceneTestCase {
|
||||
private Directory directory;
|
||||
private IndexReader reader;
|
||||
private Searcher searcher;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
directory = newDirectory();
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random, directory);
|
||||
Document doc = new Document();
|
||||
Field field = newField("field", "", Field.Store.NO, Field.Index.ANALYZED);
|
||||
doc.add(field);
|
||||
|
||||
field.setValue("quick brown fox");
|
||||
iw.addDocument(doc);
|
||||
field.setValue("jumps over lazy broun dog");
|
||||
iw.addDocument(doc);
|
||||
field.setValue("jumps over extremely very lazy broxn dog");
|
||||
iw.addDocument(doc);
|
||||
reader = iw.getReader();
|
||||
iw.close();
|
||||
searcher = new IndexSearcher(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
searcher.close();
|
||||
reader.close();
|
||||
directory.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
public void testWildcard() throws Exception {
|
||||
WildcardQuery wq = new WildcardQuery(new Term("field", "bro?n"));
|
||||
SpanQuery swq = new SpanMultiTermQueryWrapper<WildcardQuery>(wq);
|
||||
// will only match quick brown fox
|
||||
SpanFirstQuery sfq = new SpanFirstQuery(swq, 2);
|
||||
assertEquals(1, searcher.search(sfq, 10).totalHits);
|
||||
}
|
||||
|
||||
public void testFuzzy() throws Exception {
|
||||
FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan"));
|
||||
SpanQuery sfq = new SpanMultiTermQueryWrapper<FuzzyQuery>(fq);
|
||||
// will not match quick brown fox
|
||||
SpanPositionRangeQuery sprq = new SpanPositionRangeQuery(sfq, 3, 6);
|
||||
assertEquals(2, searcher.search(sprq, 10).totalHits);
|
||||
}
|
||||
|
||||
public void testFuzzy2() throws Exception {
|
||||
// maximum of 1 term expansion
|
||||
FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan"), 1f, 0, 1);
|
||||
SpanQuery sfq = new SpanMultiTermQueryWrapper<FuzzyQuery>(fq);
|
||||
// will only match jumps over lazy broun dog
|
||||
SpanPositionRangeQuery sprq = new SpanPositionRangeQuery(sfq, 0, 100);
|
||||
assertEquals(1, searcher.search(sprq, 10).totalHits);
|
||||
}
|
||||
}
|