LUCENE-2754, LUCENE-2757: Added a wrapper around MultiTermQueries to add span support: SpanMultiTermQueryWrapper<Q extends MultiTermQuery>. Using this wrapper it's easy to add fuzzy/wildcard support to e.g. a SpanNearQuery. This patch also refactors all RewriteMethods and Attributes in MTQ (the class had become unmaintainable).

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1035096 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2010-11-14 23:13:46 +00:00
parent 18c317a1e6
commit ac71ebc237
20 changed files with 1376 additions and 762 deletions
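
A minimal usage sketch of the new wrapper (field names, terms, and slop are illustrative, not from this commit): any MultiTermQuery can be wrapped and then used wherever a SpanQuery is expected, for example inside a SpanNearQuery.

SpanQuery wildcard =
    new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("body", "lucen*")));
SpanQuery fuzzy =
    new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("body", "apache"), 0.7f));
// both wrapped queries now take part in span matching, e.g. within 5 positions, in order:
SpanQuery near = new SpanNearQuery(new SpanQuery[] { wildcard, fuzzy }, 5, true);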

View File

@ -716,6 +716,10 @@ New features
* LUCENE-2671: Add SortField.setMissingValue( v ) to enable sorting
behavior for documents that do not include the given field. (ryan)
* LUCENE-2754, LUCENE-2757: Added a wrapper around MultiTermQueries
to add span support: SpanMultiTermQueryWrapper<Q extends MultiTermQuery>.
Using this wrapper it's easy to add fuzzy/wildcard to e.g. a SpanNearQuery.
(Robert Muir, Uwe Schindler)
Optimizations

View File

@ -177,6 +177,10 @@ API Changes
QueryNodeProcessorPipeline now implements the List interface, this is useful
if you want to extend or modify an existing pipeline. (Adriano Crestani via Robert Muir)
* LUCENE-2754, LUCENE-2757: Deprecated SpanRegexQuery. Use
new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery()) instead.
(Robert Muir, Uwe Schindler)
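
A migration sketch for the SpanRegexQuery deprecation noted above (field and pattern are illustrative):

// before (deprecated, to be removed in Lucene 4.0):
SpanQuery old = new SpanRegexQuery(new Term("field", "lucen.*"));
// after:
SpanQuery replacement =
    new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery(new Term("field", "lucen.*")));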
New features
* LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.

View File

@ -201,16 +201,16 @@ public class FuzzyLikeThisQuery extends Query
float minScore=0;
Term startTerm=internSavingTemplateTerm.createTerm(term);
AttributeSource atts = new AttributeSource();
MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class);
MaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
FuzzyTermsEnum fe = new FuzzyTermsEnum(reader, atts, startTerm, f.minSimilarity, f.prefixLength);
//store the df so all variants use same idf
int df = reader.docFreq(startTerm);
int numVariants=0;
int totalVariantDocFreqs=0;
BytesRef possibleMatch;
MultiTermQuery.BoostAttribute boostAtt =
fe.attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
BoostAttribute boostAtt =
fe.attributes().addAttribute(BoostAttribute.class);
while ((possibleMatch = fe.next()) != null) {
if (possibleMatch!=null) {
numVariants++;

View File

@ -18,115 +18,29 @@ package org.apache.lucene.search.regex;
*/
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
import java.util.Collection;
import java.util.ArrayList;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
/**
* A SpanQuery version of {@link RegexQuery} allowing regular expression
* queries to be nested within other SpanQuery subclasses.
* @deprecated Use <code>new SpanMultiTermQueryWrapper&lt;RegexQuery&gt;(new RegexQuery())</code> instead.
* This query will be removed in Lucene 4.0
*/
public class SpanRegexQuery extends SpanQuery implements RegexQueryCapable {
private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
private Term term;
@Deprecated
public class SpanRegexQuery extends SpanMultiTermQueryWrapper<RegexQuery> implements RegexQueryCapable {
private final RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
public SpanRegexQuery(Term term) {
this.term = term;
super(new RegexQuery(term));
}
public Term getTerm() { return term; }
@Override
public Query rewrite(IndexReader reader) throws IOException {
RegexQuery orig = new RegexQuery(term);
orig.setRegexImplementation(regexImpl);
orig.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
BooleanQuery bq = (BooleanQuery) orig.rewrite(reader);
BooleanClause[] clauses = bq.getClauses();
SpanQuery[] sqs = new SpanQuery[clauses.length];
for (int i = 0; i < clauses.length; i++) {
BooleanClause clause = clauses[i];
// Clauses from RegexQuery.rewrite are always TermQuery's
TermQuery tq = (TermQuery) clause.getQuery();
sqs[i] = new SpanTermQuery(tq.getTerm());
sqs[i].setBoost(tq.getBoost());
}
SpanOrQuery query = new SpanOrQuery(sqs);
query.setBoost(orig.getBoost());
return query;
}
@Override
public Spans getSpans(IndexReader reader) throws IOException {
throw new UnsupportedOperationException("Query should have been rewritten");
}
@Override
public String getField() {
return term.field();
}
public Collection<Term> getTerms() {
Collection<Term> terms = new ArrayList<Term>();
terms.add(term);
return terms;
}
/* generated by IntelliJ IDEA */
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
final SpanRegexQuery that = (SpanRegexQuery) o;
if (!regexImpl.equals(that.regexImpl)) return false;
if (!term.equals(that.term)) return false;
return true;
}
/* generated by IntelliJ IDEA */
@Override
public int hashCode() {
int result;
result = regexImpl.hashCode();
result = 29 * result + term.hashCode();
return result;
}
@Override
public String toString(String field) {
StringBuilder buffer = new StringBuilder();
buffer.append("spanRegexQuery(");
buffer.append(term);
buffer.append(")");
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
}
public Term getTerm() { return query.getTerm(); }
public void setRegexImplementation(RegexCapabilities impl) {
this.regexImpl = impl;
query.setRegexImplementation(impl);
}
public RegexCapabilities getRegexImplementation() {
return regexImpl;
return query.getRegexImplementation();
}
}

View File

@ -29,6 +29,7 @@ import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.spans.SpanFirstQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.store.Directory;
@ -73,6 +74,65 @@ public class TestSpanRegexQuery extends LuceneTestCase {
writer.optimize();
writer.close();
IndexSearcher searcher = new IndexSearcher(directory, true);
SpanQuery srq = new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery(new Term("field", "aut.*")));
SpanFirstQuery sfq = new SpanFirstQuery(srq, 1);
// SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6,
// true);
int numHits = searcher.search(sfq, null, 1000).totalHits;
assertEquals(1, numHits);
searcher.close();
directory.close();
}
public void testSpanRegexBug() throws CorruptIndexException, IOException {
createRAMDirectories();
SpanQuery srq = new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery(new Term("field", "a.*")));
SpanQuery stq = new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery(new Term("field", "b.*")));
SpanNearQuery query = new SpanNearQuery(new SpanQuery[] { srq, stq }, 6,
true);
// 1. Search the same store which works
IndexSearcher[] arrSearcher = new IndexSearcher[2];
arrSearcher[0] = new IndexSearcher(indexStoreA, true);
arrSearcher[1] = new IndexSearcher(indexStoreB, true);
MultiSearcher searcher = new MultiSearcher(arrSearcher);
int numHits = searcher.search(query, null, 1000).totalHits;
arrSearcher[0].close();
arrSearcher[1].close();
// Will fail here
// We expect 2 but only one matched
// The rewrite method only writes it once, on the first IndexSearcher
// So it's using term: a1 b1 to search on the second IndexSearcher
// As a result, it won't match the document in the second IndexSearcher
assertEquals(2, numHits);
indexStoreA.close();
indexStoreB.close();
}
/** remove in lucene 4.0 */
@Deprecated
public void testSpanRegexOld() throws Exception {
Directory directory = newDirectory();
IndexWriter writer = new IndexWriter(directory, newIndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer()));
Document doc = new Document();
// doc.add(newField("field", "the quick brown fox jumps over the lazy dog",
// Field.Store.NO, Field.Index.ANALYZED));
// writer.addDocument(doc);
// doc = new Document();
doc.add(newField("field", "auto update", Field.Store.NO,
Field.Index.ANALYZED));
writer.addDocument(doc);
doc = new Document();
doc.add(newField("field", "first auto update", Field.Store.NO,
Field.Index.ANALYZED));
writer.addDocument(doc);
writer.optimize();
writer.close();
IndexSearcher searcher = new IndexSearcher(directory, true);
SpanRegexQuery srq = new SpanRegexQuery(new Term("field", "aut.*"));
SpanFirstQuery sfq = new SpanFirstQuery(srq, 1);
@ -84,7 +144,9 @@ public class TestSpanRegexQuery extends LuceneTestCase {
directory.close();
}
public void testSpanRegexBug() throws CorruptIndexException, IOException {
/** remove in lucene 4.0 */
@Deprecated
public void testSpanRegexBugOld() throws CorruptIndexException, IOException {
createRAMDirectories();
SpanRegexQuery srq = new SpanRegexQuery(new Term("field", "a.*"));

View File

@ -28,7 +28,8 @@ import java.util.PriorityQueue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
@ -389,16 +390,16 @@ public class DirectSpellChecker {
IndexReader ir, int docfreq, int editDistance, float accuracy) throws IOException {
AttributeSource atts = new AttributeSource();
MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class);
MaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
FuzzyTermsEnum e = new FuzzyTermsEnum(ir, atts, term, editDistance, Math.max(minPrefix, editDistance-1));
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
BytesRef queryTerm = new BytesRef(term.text());
BytesRef candidateTerm;
ScoreTerm st = new ScoreTerm();
MultiTermQuery.BoostAttribute boostAtt =
e.attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
BoostAttribute boostAtt =
e.attributes().addAttribute(BoostAttribute.class);
while ((candidateTerm = e.next()) != null) {
final float boost = boostAtt.getBoost();
// ignore uncompetitive hits

View File

@ -0,0 +1,38 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource; // javadocs only
import org.apache.lucene.index.TermsEnum; // javadocs only
/** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link MultiTermQuery#getTermsEnum(IndexReader,AttributeSource)}
* and update the boost on each returned term. This makes it possible to control the boost factor
* for each matching term in {@link MultiTermQuery#SCORING_BOOLEAN_QUERY_REWRITE} or
* {@link TopTermsRewrite} mode.
* {@link FuzzyQuery} is using this to take the edit distance into account.
* <p><b>Please note:</b> This attribute is intended to be added only by the TermsEnum
* to itself in its constructor and consumed by the {@link MultiTermQuery.RewriteMethod}.
* @lucene.internal
*/
public interface BoostAttribute extends Attribute {
/** Sets the boost in this attribute */
public void setBoost(float boost);
/** Retrieves the boost, default is {@code 1.0f}. */
public float getBoost();
}
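
A consumer-side sketch, mirroring the FuzzyLikeThisQuery hunk above (field, term, and fuzzy parameters are illustrative; assumes an open IndexReader named reader): the enum registers BoostAttribute on its own attributes() in its constructor, so a caller iterating the enum can read a per-term boost.

AttributeSource atts = new AttributeSource();
FuzzyTermsEnum fe = new FuzzyTermsEnum(reader, atts, new Term("body", "lucene"), 0.5f, 0);
BoostAttribute boostAtt = fe.attributes().addAttribute(BoostAttribute.class);
BytesRef match;
while ((match = fe.next()) != null) {
  float boost = boostAtt.getBoost(); // reflects the edit distance of this particular term
}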

View File

@ -0,0 +1,60 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
/** Implementation class for {@link BoostAttribute}.
* @lucene.internal
*/
public final class BoostAttributeImpl extends AttributeImpl implements BoostAttribute {
private float boost = 1.0f;
public void setBoost(float boost) {
this.boost = boost;
}
public float getBoost() {
return boost;
}
@Override
public void clear() {
boost = 1.0f;
}
@Override
public boolean equals(Object other) {
if (this == other)
return true;
if (other instanceof BoostAttributeImpl)
return ((BoostAttributeImpl) other).boost == boost;
return false;
}
@Override
public int hashCode() {
return Float.floatToIntBits(boost);
}
@Override
public void copyTo(AttributeImpl target) {
((BoostAttribute) target).setBoost(boost);
}
}

View File

@ -0,0 +1,186 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Comparator;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
// Defaults derived from rough tests with a 20.0 million
// doc Wikipedia index. With more than 350 terms in the
// query, the filter method is fastest:
public static int DEFAULT_TERM_COUNT_CUTOFF = 350;
// If the query will hit more than 1 in 1000 of the docs
// in the index (0.1%), the filter method is fastest:
public static double DEFAULT_DOC_COUNT_PERCENT = 0.1;
private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF;
private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT;
/** If the number of terms in this query is equal to or
* larger than this setting then {@link
* #CONSTANT_SCORE_FILTER_REWRITE} is used. */
public void setTermCountCutoff(int count) {
termCountCutoff = count;
}
/** @see #setTermCountCutoff */
public int getTermCountCutoff() {
return termCountCutoff;
}
/** If the number of documents to be visited in the
* postings exceeds this specified percentage of the
* maxDoc() for the index, then {@link
* #CONSTANT_SCORE_FILTER_REWRITE} is used.
* @param percent 0.0 to 100.0 */
public void setDocCountPercent(double percent) {
docCountPercent = percent;
}
/** @see #setDocCountPercent */
public double getDocCountPercent() {
return docCountPercent;
}
@Override
protected BooleanQuery getTopLevelQuery() {
return new BooleanQuery(true);
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/) {
topLevel.add(new TermQuery(term, docFreq), BooleanClause.Occur.SHOULD);
}
@Override
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
// Get the enum and start visiting terms. If we
// exhaust the enum before hitting either of the
// cutoffs, we use ConstantBooleanQueryRewrite; else,
// ConstantFilterRewrite:
final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
collectTerms(reader, query, col);
final int size = col.pendingTerms.size();
if (col.hasCutOff) {
return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
} else if (size == 0) {
return getTopLevelQuery();
} else {
final BooleanQuery bq = getTopLevelQuery();
final Term placeholderTerm = new Term(query.field);
final BytesRefHash pendingTerms = col.pendingTerms;
final int sort[] = pendingTerms.sort(col.termsEnum.getComparator());
for(int i = 0; i < size; i++) {
// docFreq is not used for constant score here, we pass 1
// to explicitly set a fake value, so it's not calculated
addClause(bq, placeholderTerm.createTerm(pendingTerms.get(sort[i], new BytesRef())), 1, 1.0f);
}
// Strip scores
final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
result.setBoost(query.getBoost());
query.incTotalNumberOfTerms(size);
return result;
}
}
static final class CutOffTermCollector extends TermCollector {
CutOffTermCollector(int docCountCutoff, int termCountLimit) {
this.docCountCutoff = docCountCutoff;
this.termCountLimit = termCountLimit;
}
@Override
public void setNextEnum(TermsEnum termsEnum) throws IOException {
this.termsEnum = termsEnum;
}
@Override
public boolean collect(BytesRef bytes) throws IOException {
pendingTerms.add(bytes);
docVisitCount += termsEnum.docFreq();
if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
hasCutOff = true;
return false;
}
return true;
}
int docVisitCount = 0;
boolean hasCutOff = false;
TermsEnum termsEnum;
final int docCountCutoff, termCountLimit;
final BytesRefHash pendingTerms = new BytesRefHash();
}
@Override
public int hashCode() {
final int prime = 1279;
return (int) (prime * termCountCutoff + Double.doubleToLongBits(docCountPercent));
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj;
if (other.termCountCutoff != termCountCutoff) {
return false;
}
if (Double.doubleToLongBits(other.docCountPercent) != Double.doubleToLongBits(docCountPercent)) {
return false;
}
return true;
}
}
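
A configuration sketch (cutoff values and the prefix are illustrative): this class itself is package-private, but the public subclass MultiTermQuery.ConstantScoreAutoRewrite (see further down in this commit) exposes it for per-query tuning.

MultiTermQuery.ConstantScoreAutoRewrite rewrite = new MultiTermQuery.ConstantScoreAutoRewrite();
rewrite.setTermCountCutoff(100);   // switch to the filter above 100 collected terms
rewrite.setDocCountPercent(0.5);   // ...or above 0.5% of maxDoc() visited postings
PrefixQuery query = new PrefixQuery(new Term("body", "luc"));
query.setRewriteMethod(rewrite);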

View File

@ -49,12 +49,12 @@ import java.util.List;
*/
public final class FuzzyTermsEnum extends TermsEnum {
private TermsEnum actualEnum;
private MultiTermQuery.BoostAttribute actualBoostAtt;
private BoostAttribute actualBoostAtt;
private final MultiTermQuery.BoostAttribute boostAtt =
attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
private final BoostAttribute boostAtt =
attributes().addAttribute(BoostAttribute.class);
private final MultiTermQuery.MaxNonCompetitiveBoostAttribute maxBoostAtt;
private final MaxNonCompetitiveBoostAttribute maxBoostAtt;
private final LevenshteinAutomataAttribute dfaAtt;
private float bottom;
@ -128,7 +128,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
}
this.scale_factor = 1.0f / (1.0f - this.minSimilarity);
this.maxBoostAtt = atts.addAttribute(MultiTermQuery.MaxNonCompetitiveBoostAttribute.class);
this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
bottomTerm = maxBoostAtt.getCompetitiveTerm();
bottomChanged(null, true);
@ -174,8 +174,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
/** swap in a new actual enum to proxy to */
private void setEnum(TermsEnum actualEnum) {
this.actualEnum = actualEnum;
this.actualBoostAtt = actualEnum.attributes().addAttribute(
MultiTermQuery.BoostAttribute.class);
this.actualBoostAtt = actualEnum.attributes().addAttribute(BoostAttribute.class);
}
/**
@ -300,8 +299,8 @@ public final class FuzzyTermsEnum extends TermsEnum {
private final BytesRef termRef;
private final BytesRef lastTerm;
private final MultiTermQuery.BoostAttribute boostAtt =
attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
private final BoostAttribute boostAtt =
attributes().addAttribute(BoostAttribute.class);
public AutomatonFuzzyTermsEnum(ByteRunAutomaton matchers[],
BytesRef lastTerm) throws IOException {
@ -363,8 +362,8 @@ public final class FuzzyTermsEnum extends TermsEnum {
// this is the text, minus the prefix
private final int[] text;
private final MultiTermQuery.BoostAttribute boostAtt =
attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
private final BoostAttribute boostAtt =
attributes().addAttribute(BoostAttribute.class);
/**
* Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of

View File

@ -0,0 +1,45 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource; // javadocs only
import org.apache.lucene.util.BytesRef;
/** Add this {@link Attribute} to a fresh {@link AttributeSource} before calling
* {@link MultiTermQuery#getTermsEnum(IndexReader,AttributeSource)}.
* {@link FuzzyQuery} is using this to control its internal behaviour
* to only return competitive terms.
* <p><b>Please note:</b> This attribute is intended to be added by the {@link MultiTermQuery.RewriteMethod}
* to an empty {@link AttributeSource} that is shared for all segments
* during query rewrite. This attribute source is passed to all segment enums
* on {@link MultiTermQuery#getTermsEnum(IndexReader,AttributeSource)}.
* {@link TopTermsRewrite} uses this attribute to
* inform all enums about the current boost that is not competitive.
* @lucene.internal
*/
public interface MaxNonCompetitiveBoostAttribute extends Attribute {
/** This is the maximum boost that would not be competitive. */
public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost);
/** This is the maximum boost that would not be competitive. Default is negative infinity, which means every term is competitive. */
public float getMaxNonCompetitiveBoost();
/** This is the term (or <code>null</code>) that triggered the boost change. */
public void setCompetitiveTerm(BytesRef competitiveTerm);
/** This is the term (or <code>null</code>) that triggered the boost change. Default is <code>null</code>, which means every term is competitive. */
public BytesRef getCompetitiveTerm();
}
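
A sketch of the rewrite-side protocol (the boost value is illustrative): the RewriteMethod shares one AttributeSource with all segment enums and raises the non-competitive bar as its priority queue fills; an enum such as FuzzyTermsEnum reads the bar to avoid enumerating terms that can no longer make the top N.

AttributeSource atts = new AttributeSource(); // shared across all segment enums during rewrite
MaxNonCompetitiveBoostAttribute maxBoostAtt =
    atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
// rewrite side, once its priority queue is full:
maxBoostAtt.setMaxNonCompetitiveBoost(0.8f);
// enum side: any term that cannot beat this boost is not worth enumerating
float bar = maxBoostAtt.getMaxNonCompetitiveBoost();
BytesRef bottomTerm = maxBoostAtt.getCompetitiveTerm(); // null if no term triggered the change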

View File

@ -0,0 +1,78 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
/** Implementation class for {@link MaxNonCompetitiveBoostAttribute}.
* @lucene.internal
*/
public final class MaxNonCompetitiveBoostAttributeImpl extends AttributeImpl implements MaxNonCompetitiveBoostAttribute {
private float maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
private BytesRef competitiveTerm = null;
public void setMaxNonCompetitiveBoost(final float maxNonCompetitiveBoost) {
this.maxNonCompetitiveBoost = maxNonCompetitiveBoost;
}
public float getMaxNonCompetitiveBoost() {
return maxNonCompetitiveBoost;
}
public void setCompetitiveTerm(final BytesRef competitiveTerm) {
this.competitiveTerm = competitiveTerm;
}
public BytesRef getCompetitiveTerm() {
return competitiveTerm;
}
@Override
public void clear() {
maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
competitiveTerm = null;
}
@Override
public boolean equals(Object other) {
if (this == other)
return true;
if (other instanceof MaxNonCompetitiveBoostAttributeImpl) {
final MaxNonCompetitiveBoostAttributeImpl o = (MaxNonCompetitiveBoostAttributeImpl) other;
return (o.maxNonCompetitiveBoost == maxNonCompetitiveBoost)
&& (o.competitiveTerm == null ? competitiveTerm == null : o.competitiveTerm.equals(competitiveTerm));
}
return false;
}
@Override
public int hashCode() {
int hash = Float.floatToIntBits(maxNonCompetitiveBoost);
if (competitiveTerm != null) hash = 31 * hash + competitiveTerm.hashCode();
return hash;
}
@Override
public void copyTo(AttributeImpl target) {
final MaxNonCompetitiveBoostAttributeImpl t = (MaxNonCompetitiveBoostAttributeImpl) target;
t.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost);
t.setCompetitiveTerm(competitiveTerm);
}
}

View File

@ -19,29 +19,12 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Comparator;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
/**
* An abstract {@link Query} that matches documents
@ -80,156 +63,12 @@ public abstract class MultiTermQuery extends Query {
protected final String field;
protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
transient int numberOfTerms = 0;
/** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum(IndexReader,AttributeSource)}
* and update the boost on each returned term. This enables to control the boost factor
* for each matching term in {@link #SCORING_BOOLEAN_QUERY_REWRITE} or
* {@link TopTermsBooleanQueryRewrite} mode.
* {@link FuzzyQuery} is using this to take the edit distance into account.
* <p><b>Please note:</b> This attribute is intended to be added only by the TermsEnum
* to itsself in its constructor and consumed by the {@link RewriteMethod}.
* @lucene.internal
*/
public static interface BoostAttribute extends Attribute {
/** Sets the boost in this attribute */
public void setBoost(float boost);
/** Retrieves the boost, default is {@code 1.0f}. */
public float getBoost();
}
/** Implementation class for {@link BoostAttribute}. */
public static final class BoostAttributeImpl extends AttributeImpl implements BoostAttribute {
private float boost = 1.0f;
public void setBoost(float boost) {
this.boost = boost;
}
public float getBoost() {
return boost;
}
@Override
public void clear() {
boost = 1.0f;
}
@Override
public boolean equals(Object other) {
if (this == other)
return true;
if (other instanceof BoostAttributeImpl)
return ((BoostAttributeImpl) other).boost == boost;
return false;
}
@Override
public int hashCode() {
return Float.floatToIntBits(boost);
}
@Override
public void copyTo(AttributeImpl target) {
((BoostAttribute) target).setBoost(boost);
}
}
/** Add this {@link Attribute} to a fresh {@link AttributeSource} before calling
* {@link #getTermsEnum(IndexReader,AttributeSource)}.
* {@link FuzzyQuery} is using this to control its internal behaviour
* to only return competitive terms.
* <p><b>Please note:</b> This attribute is intended to be added by the {@link RewriteMethod}
* to an empty {@link AttributeSource} that is shared for all segments
* during query rewrite. This attribute source is passed to all segment enums
* on {@link #getTermsEnum(IndexReader,AttributeSource)}.
* {@link TopTermsBooleanQueryRewrite} uses this attribute to
* inform all enums about the current boost, that is not competitive.
* @lucene.internal
*/
public static interface MaxNonCompetitiveBoostAttribute extends Attribute {
/** This is the maximum boost that would not be competitive. */
public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost);
/** This is the maximum boost that would not be competitive. Default is negative infinity, which means every term is competitive. */
public float getMaxNonCompetitiveBoost();
/** This is the term or <code>null<code> of the term that triggered the boost change. */
public void setCompetitiveTerm(BytesRef competitiveTerm);
/** This is the term or <code>null<code> of the term that triggered the boost change. Default is <code>null</code>, which means every term is competitoive. */
public BytesRef getCompetitiveTerm();
}
/** Implementation class for {@link MaxNonCompetitiveBoostAttribute}. */
public static final class MaxNonCompetitiveBoostAttributeImpl extends AttributeImpl implements MaxNonCompetitiveBoostAttribute {
private float maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
private BytesRef competitiveTerm = null;
public void setMaxNonCompetitiveBoost(final float maxNonCompetitiveBoost) {
this.maxNonCompetitiveBoost = maxNonCompetitiveBoost;
}
public float getMaxNonCompetitiveBoost() {
return maxNonCompetitiveBoost;
}
public void setCompetitiveTerm(final BytesRef competitiveTerm) {
this.competitiveTerm = competitiveTerm;
}
public BytesRef getCompetitiveTerm() {
return competitiveTerm;
}
@Override
public void clear() {
maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY;
competitiveTerm = null;
}
@Override
public boolean equals(Object other) {
if (this == other)
return true;
if (other instanceof MaxNonCompetitiveBoostAttributeImpl) {
final MaxNonCompetitiveBoostAttributeImpl o = (MaxNonCompetitiveBoostAttributeImpl) other;
return (o.maxNonCompetitiveBoost == maxNonCompetitiveBoost)
&& (o.competitiveTerm == null ? competitiveTerm == null : o.competitiveTerm.equals(competitiveTerm));
}
return false;
}
@Override
public int hashCode() {
int hash = Float.floatToIntBits(maxNonCompetitiveBoost);
if (competitiveTerm != null) hash = 31 * hash + competitiveTerm.hashCode();
return hash;
}
@Override
public void copyTo(AttributeImpl target) {
final MaxNonCompetitiveBoostAttributeImpl t = (MaxNonCompetitiveBoostAttributeImpl) target;
t.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost);
t.setCompetitiveTerm(competitiveTerm);
}
}
/** Abstract class that defines how the query is rewritten. */
public static abstract class RewriteMethod implements Serializable {
public abstract Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException;
}
private static final class ConstantScoreFilterRewrite extends RewriteMethod {
@Override
public Query rewrite(IndexReader reader, MultiTermQuery query) {
Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter<MultiTermQuery>(query));
result.setBoost(query.getBoost());
return result;
}
// Make sure we are still a singleton even after deserializing
protected Object readResolve() {
return CONSTANT_SCORE_FILTER_REWRITE;
}
}
/** A rewrite method that first creates a private Filter,
* by visiting each term in sequence and marking all docs
* for that term. Matching documents are assigned a
@ -242,162 +81,19 @@ public abstract class MultiTermQuery extends Query {
* exception.
*
* @see #setRewriteMethod */
public final static RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new ConstantScoreFilterRewrite();
private abstract static class BooleanQueryRewrite extends RewriteMethod {
protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
final List<IndexReader> subReaders = new ArrayList<IndexReader>();
ReaderUtil.gatherSubReaders(subReaders, reader);
Comparator<BytesRef> lastTermComp = null;
for (IndexReader r : subReaders) {
final Fields fields = r.fields();
if (fields == null) {
// reader has no fields
continue;
}
final Terms terms = fields.terms(query.field);
if (terms == null) {
// field does not exist
continue;
}
final TermsEnum termsEnum = query.getTermsEnum(r, collector.attributes);
assert termsEnum != null;
if (termsEnum == TermsEnum.EMPTY)
continue;
// Check comparator compatibility:
final Comparator<BytesRef> newTermComp = termsEnum.getComparator();
if (lastTermComp != null && newTermComp != lastTermComp)
throw new RuntimeException("term comparator should not change between segments: "+lastTermComp+" != "+newTermComp);
lastTermComp = newTermComp;
collector.setNextEnum(termsEnum);
BytesRef bytes;
while ((bytes = termsEnum.next()) != null) {
termsEnum.cacheCurrentTerm();
if (!collector.collect(bytes))
return; // interrupt whole term collection, so also don't iterate other subReaders
}
}
}
protected static abstract class TermCollector {
/** attributes used for communication with the enum */
public final AttributeSource attributes = new AttributeSource();
/** return false to stop collecting */
public abstract boolean collect(BytesRef bytes) throws IOException;
/** the next segment's {@link TermsEnum} that is used to collect terms */
public abstract void setNextEnum(TermsEnum termsEnum) throws IOException;
}
}
private static class ScoringBooleanQueryRewrite extends BooleanQueryRewrite {
public static final RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new RewriteMethod() {
@Override
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
final ParallelArraysTermCollector col = new ParallelArraysTermCollector();
collectTerms(reader, query, col);
final Term placeholderTerm = new Term(query.field);
final BooleanQuery result = new BooleanQuery(true);
final int size = col.terms.size();
if (size > 0) {
final int sort[] = col.terms.sort(col.termsEnum.getComparator());
final int[] docFreq = col.array.docFreq;
final float[] boost = col.array.boost;
for (int i = 0; i < size; i++) {
final int pos = sort[i];
final Term term = placeholderTerm.createTerm(col.terms.get(pos, new BytesRef()));
assert reader.docFreq(term) == docFreq[pos];
final TermQuery tq = new TermQuery(term, docFreq[pos]);
tq.setBoost(query.getBoost() * boost[pos]);
result.add(tq, BooleanClause.Occur.SHOULD);
}
}
query.incTotalNumberOfTerms(size);
public Query rewrite(IndexReader reader, MultiTermQuery query) {
Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter<MultiTermQuery>(query));
result.setBoost(query.getBoost());
return result;
}
// Make sure we are still a singleton even after deserializing
protected Object readResolve() {
return SCORING_BOOLEAN_QUERY_REWRITE;
return CONSTANT_SCORE_FILTER_REWRITE;
}
static final class ParallelArraysTermCollector extends TermCollector {
final TermFreqBoostByteStart array = new TermFreqBoostByteStart(16);
final BytesRefHash terms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array);
TermsEnum termsEnum;
private BoostAttribute boostAtt;
@Override
public void setNextEnum(TermsEnum termsEnum) throws IOException {
this.termsEnum = termsEnum;
this.boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
}
@Override
public boolean collect(BytesRef bytes) {
final int e = terms.add(bytes);
if (e < 0 ) {
// duplicate term: update docFreq
final int pos = (-e)-1;
array.docFreq[pos] += termsEnum.docFreq();
assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums";
} else {
// new entry: we populate the entry initially
array.docFreq[e] = termsEnum.docFreq();
array.boost[e] = boostAtt.getBoost();
}
// if the new entry reaches the max clause count, we exit early
if (e >= BooleanQuery.getMaxClauseCount())
throw new BooleanQuery.TooManyClauses();
return true;
}
}
/** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */
static final class TermFreqBoostByteStart extends DirectBytesStartArray {
int[] docFreq;
float[] boost;
public TermFreqBoostByteStart(int initSize) {
super(initSize);
}
@Override
public int[] init() {
final int[] ord = super.init();
boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)];
docFreq = new int[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_INT)];
assert boost.length >= ord.length && docFreq.length >= ord.length;
return ord;
}
@Override
public int[] grow() {
final int[] ord = super.grow();
docFreq = ArrayUtil.grow(docFreq, ord.length);
boost = ArrayUtil.grow(boost, ord.length);
assert boost.length >= ord.length && docFreq.length >= ord.length;
return ord;
}
@Override
public int[] clear() {
boost = null;
docFreq = null;
return super.clear();
}
}
}
};
/** A rewrite method that first translates each term into
* {@link BooleanClause.Occur#SHOULD} clause in a
@ -412,155 +108,19 @@ public abstract class MultiTermQuery extends Query {
* exceeds {@link BooleanQuery#getMaxClauseCount}.
*
* @see #setRewriteMethod */
public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite();
/**
* Base rewrite method for collecting only the top terms
* via a priority queue.
*/
public static abstract class TopTermsBooleanQueryRewrite extends BooleanQueryRewrite {
private final int size;
/**
* Create a TopTermsBooleanQueryRewrite for
* at most <code>size</code> terms.
* <p>
* NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than
* <code>size</code>, then it will be used instead.
*/
public TopTermsBooleanQueryRewrite(int size) {
this.size = size;
}
/** Return a suitable Query for a MultiTermQuery term. */
protected abstract Query getQuery(Term term, int docCount);
@Override
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
collectTerms(reader, query, new TermCollector() {
private final MaxNonCompetitiveBoostAttribute maxBoostAtt =
attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);
private final Map<BytesRef,ScoreTerm> visitedTerms = new HashMap<BytesRef,ScoreTerm>();
private TermsEnum termsEnum;
private Comparator<BytesRef> termComp;
private BoostAttribute boostAtt;
private ScoreTerm st;
@Override
public void setNextEnum(TermsEnum termsEnum) throws IOException {
this.termsEnum = termsEnum;
this.termComp = termsEnum.getComparator();
// lazy init the initial ScoreTerm because comparator is not known on ctor:
if (st == null)
st = new ScoreTerm(this.termComp);
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
}
@Override
public boolean collect(BytesRef bytes) {
final float boost = boostAtt.getBoost();
// ignore uncompetetive hits
if (stQueue.size() == maxSize) {
final ScoreTerm t = stQueue.peek();
if (boost < t.boost)
return true;
if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
return true;
}
ScoreTerm t = visitedTerms.get(bytes);
if (t != null) {
// if the term is already in the PQ, only update docFreq of term in PQ
t.docFreq += termsEnum.docFreq();
assert t.boost == boost : "boost should be equal in all segment TermsEnums";
} else {
// add new entry in PQ, we must clone the term, else it may get overwritten!
st.bytes.copy(bytes);
st.boost = boost;
st.docFreq = termsEnum.docFreq();
visitedTerms.put(st.bytes, st);
stQueue.offer(st);
// possibly drop entries from queue
if (stQueue.size() > maxSize) {
st = stQueue.poll();
visitedTerms.remove(st.bytes);
} else {
st = new ScoreTerm(termComp);
}
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize
if (stQueue.size() == maxSize) {
t = stQueue.peek();
maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
maxBoostAtt.setCompetitiveTerm(t.bytes);
}
}
return true;
}
});
final Term placeholderTerm = new Term(query.field);
final BooleanQuery bq = new BooleanQuery(true);
final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
ArrayUtil.quickSort(scoreTerms, scoreTermSortByTermComp);
for (final ScoreTerm st : scoreTerms) {
final Term term = placeholderTerm.createTerm(st.bytes);
assert reader.docFreq(term) == st.docFreq;
Query tq = getQuery(term, st.docFreq);
tq.setBoost(query.getBoost() * st.boost); // set the boost
bq.add(tq, BooleanClause.Occur.SHOULD); // add to query
}
query.incTotalNumberOfTerms(scoreTerms.length);
return bq;
}
public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = ScoringRewrite.SCORING_BOOLEAN_QUERY_REWRITE;
@Override
public int hashCode() {
return 31 * size;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
TopTermsBooleanQueryRewrite other = (TopTermsBooleanQueryRewrite) obj;
if (size != other.size) return false;
return true;
}
private static final Comparator<ScoreTerm> scoreTermSortByTermComp =
new Comparator<ScoreTerm>() {
public int compare(ScoreTerm st1, ScoreTerm st2) {
assert st1.termComp == st2.termComp :
"term comparator should not change between segments";
return st1.termComp.compare(st1.bytes, st2.bytes);
}
};
static final class ScoreTerm implements Comparable<ScoreTerm> {
public final Comparator<BytesRef> termComp;
public final BytesRef bytes = new BytesRef();
public float boost;
public int docFreq;
public ScoreTerm(Comparator<BytesRef> termComp) {
this.termComp = termComp;
}
public int compareTo(ScoreTerm other) {
if (this.boost == other.boost)
return termComp.compare(other.bytes, this.bytes);
else
return Float.compare(this.boost, other.boost);
}
}
}
/** Like {@link #SCORING_BOOLEAN_QUERY_REWRITE} except
* scores are not computed. Instead, each matching
* document receives a constant score equal to the
* query's boost.
*
* <p><b>NOTE</b>: This rewrite method will hit {@link
* BooleanQuery.TooManyClauses} if the number of terms
* exceeds {@link BooleanQuery#getMaxClauseCount}.
*
* @see #setRewriteMethod */
public final static RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = ScoringRewrite.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
/**
* A rewrite method that first translates each term into
@ -574,8 +134,7 @@ public abstract class MultiTermQuery extends Query {
*
* @see #setRewriteMethod
*/
public static final class TopTermsScoringBooleanQueryRewrite extends
TopTermsBooleanQueryRewrite {
public static final class TopTermsScoringBooleanQueryRewrite extends TopTermsRewrite<BooleanQuery> {
/**
* Create a TopTermsScoringBooleanQueryRewrite for
@ -589,8 +148,20 @@ public abstract class MultiTermQuery extends Query {
}
@Override
protected Query getQuery(Term term, int docFreq) {
return new TermQuery(term, docFreq);
protected int getMaxSize() {
return BooleanQuery.getMaxClauseCount();
}
@Override
protected BooleanQuery getTopLevelQuery() {
return new BooleanQuery(true);
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost) {
final TermQuery tq = new TermQuery(term, docCount);
tq.setBoost(boost);
topLevel.add(tq, BooleanClause.Occur.SHOULD);
}
}
@ -604,8 +175,7 @@ public abstract class MultiTermQuery extends Query {
*
* @see #setRewriteMethod
*/
public static final class TopTermsBoostOnlyBooleanQueryRewrite extends
TopTermsBooleanQueryRewrite {
public static final class TopTermsBoostOnlyBooleanQueryRewrite extends TopTermsRewrite<BooleanQuery> {
/**
* Create a TopTermsBoostOnlyBooleanQueryRewrite for
@ -619,45 +189,23 @@ public abstract class MultiTermQuery extends Query {
}
@Override
protected Query getQuery(Term term, int docFreq) {
return new ConstantScoreQuery(new QueryWrapperFilter(new TermQuery(term, docFreq)));
protected int getMaxSize() {
return BooleanQuery.getMaxClauseCount();
}
@Override
protected BooleanQuery getTopLevelQuery() {
return new BooleanQuery(true);
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost) {
final Query q = new ConstantScoreQuery(new QueryWrapperFilter(new TermQuery(term, docFreq)));
q.setBoost(boost);
topLevel.add(q, BooleanClause.Occur.SHOULD);
}
}
private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable {
@Override
public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
Query result = super.rewrite(reader, query);
assert result instanceof BooleanQuery;
// TODO: if empty boolean query return NullQuery?
if (!((BooleanQuery) result).clauses().isEmpty()) {
// strip the scores off
result = new ConstantScoreQuery(new QueryWrapperFilter(result));
result.setBoost(query.getBoost());
}
return result;
}
// Make sure we are still a singleton even after deserializing
@Override
protected Object readResolve() {
return CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
}
}
/** Like {@link #SCORING_BOOLEAN_QUERY_REWRITE} except
* scores are not computed. Instead, each matching
* document receives a constant score equal to the
* query's boost.
*
* <p><b>NOTE</b>: This rewrite method will hit {@link
* BooleanQuery.TooManyClauses} if the number of terms
* exceeds {@link BooleanQuery#getMaxClauseCount}.
*
* @see #setRewriteMethod */
public final static RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = new ConstantScoreBooleanQueryRewrite();
/** A rewrite method that tries to pick the best
* constant-score rewrite method based on term and
* document counts from the query. If both the number of
@ -666,140 +214,7 @@ public abstract class MultiTermQuery extends Query {
* Otherwise, {@link #CONSTANT_SCORE_FILTER_REWRITE} is
* used.
*/
public static class ConstantScoreAutoRewrite extends BooleanQueryRewrite {
// Defaults derived from rough tests with a 20.0 million
// doc Wikipedia index. With more than 350 terms in the
// query, the filter method is fastest:
public static int DEFAULT_TERM_COUNT_CUTOFF = 350;
// If the query will hit more than 1 in 1000 of the docs
// in the index (0.1%), the filter method is fastest:
public static double DEFAULT_DOC_COUNT_PERCENT = 0.1;
private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF;
private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT;
/** If the number of terms in this query is equal to or
* larger than this setting then {@link
* #CONSTANT_SCORE_FILTER_REWRITE} is used. */
public void setTermCountCutoff(int count) {
termCountCutoff = count;
}
/** @see #setTermCountCutoff */
public int getTermCountCutoff() {
return termCountCutoff;
}
/** If the number of documents to be visited in the
* postings exceeds this specified percentage of the
* maxDoc() for the index, then {@link
* #CONSTANT_SCORE_FILTER_REWRITE} is used.
* @param percent 0.0 to 100.0 */
public void setDocCountPercent(double percent) {
docCountPercent = percent;
}
/** @see #setDocCountPercent */
public double getDocCountPercent() {
return docCountPercent;
}
@Override
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
// Get the enum and start visiting terms. If we
// exhaust the enum before hitting either of the
// cutoffs, we use ConstantBooleanQueryRewrite; else,
// ConstantFilterRewrite:
final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
collectTerms(reader, query, col);
final int size = col.pendingTerms.size();
if (col.hasCutOff) {
return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
} else if (size == 0) {
return new BooleanQuery(true);
} else {
final BooleanQuery bq = new BooleanQuery(true);
final Term placeholderTerm = new Term(query.field);
final BytesRefHash pendingTerms = col.pendingTerms;
final int sort[] = pendingTerms.sort(col.termsEnum.getComparator());
for(int i = 0; i < size; i++) {
// docFreq is not used for constant score here, we pass 1
// to explicitely set a fake value, so it's not calculated
bq.add(new TermQuery(
placeholderTerm.createTerm(pendingTerms.get(sort[i], new BytesRef())), 1
), BooleanClause.Occur.SHOULD);
}
// Strip scores
final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
result.setBoost(query.getBoost());
query.incTotalNumberOfTerms(size);
return result;
}
}
static final class CutOffTermCollector extends TermCollector {
CutOffTermCollector(int docCountCutoff, int termCountLimit) {
this.docCountCutoff = docCountCutoff;
this.termCountLimit = termCountLimit;
}
@Override
public void setNextEnum(TermsEnum termsEnum) throws IOException {
this.termsEnum = termsEnum;
}
@Override
public boolean collect(BytesRef bytes) throws IOException {
pendingTerms.add(bytes);
docVisitCount += termsEnum.docFreq();
if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
hasCutOff = true;
return false;
}
return true;
}
int docVisitCount = 0;
boolean hasCutOff = false;
TermsEnum termsEnum;
final int docCountCutoff, termCountLimit;
final BytesRefHash pendingTerms = new BytesRefHash();
}
@Override
public int hashCode() {
final int prime = 1279;
return (int) (prime * termCountCutoff + Double.doubleToLongBits(docCountPercent));
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj;
if (other.termCountCutoff != termCountCutoff) {
return false;
}
if (Double.doubleToLongBits(other.docCountPercent) != Double.doubleToLongBits(docCountPercent)) {
return false;
}
return true;
}
}
public static class ConstantScoreAutoRewrite extends org.apache.lucene.search.ConstantScoreAutoRewrite {}
/** Read-only default instance of {@link
* ConstantScoreAutoRewrite}, with {@link
@ -851,7 +266,7 @@ public abstract class MultiTermQuery extends Query {
* positioned to the first matching term.
* The given {@link AttributeSource} is passed by the {@link RewriteMethod} to
* provide attributes, the rewrite method uses to inform about e.g. maximum competitive boosts.
* This is currently only used by {@link TopTermsBooleanQueryRewrite}
* This is currently only used by {@link TopTermsRewrite}
*/
protected abstract TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException;

View File

@ -0,0 +1,203 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Comparator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
/** @lucene.internal Only public to be accessible by the spans package. */
public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewrite<Q> {
/** A rewrite method that first translates each term into
* {@link BooleanClause.Occur#SHOULD} clause in a
* BooleanQuery, and keeps the scores as computed by the
* query. Note that typically such scores are
* meaningless to the user, and require non-trivial CPU
* to compute, so it's almost always better to use {@link
* MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead.
*
* <p><b>NOTE</b>: This rewrite method will hit {@link
* BooleanQuery.TooManyClauses} if the number of terms
* exceeds {@link BooleanQuery#getMaxClauseCount}.
*
* @see #setRewriteMethod */
public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringRewrite<BooleanQuery>() {
@Override
protected BooleanQuery getTopLevelQuery() {
return new BooleanQuery(true);
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost) {
final TermQuery tq = new TermQuery(term, docCount);
tq.setBoost(boost);
topLevel.add(tq, BooleanClause.Occur.SHOULD);
}
// Make sure we are still a singleton even after deserializing
protected Object readResolve() {
return SCORING_BOOLEAN_QUERY_REWRITE;
}
};
/** Like {@link #SCORING_BOOLEAN_QUERY_REWRITE} except
* scores are not computed. Instead, each matching
* document receives a constant score equal to the
* query's boost.
*
* <p><b>NOTE</b>: This rewrite method will hit {@link
* BooleanQuery.TooManyClauses} if the number of terms
* exceeds {@link BooleanQuery#getMaxClauseCount}.
*
* @see #setRewriteMethod */
public final static RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = new RewriteMethod() {
@Override
public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
Query result = SCORING_BOOLEAN_QUERY_REWRITE.rewrite(reader, query);
assert result instanceof BooleanQuery;
// TODO: if empty boolean query return NullQuery?
if (!((BooleanQuery) result).clauses().isEmpty()) {
// strip the scores off
result = new ConstantScoreQuery(new QueryWrapperFilter(result));
result.setBoost(query.getBoost());
}
return result;
}
// Make sure we are still a singleton even after deserializing
protected Object readResolve() {
return CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
}
};
@Override
public final Q rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
final Q result = getTopLevelQuery();
final ParallelArraysTermCollector col = new ParallelArraysTermCollector(result instanceof BooleanQuery);
collectTerms(reader, query, col);
final Term placeholderTerm = new Term(query.field);
final int size = col.terms.size();
if (size > 0) {
final int sort[] = col.terms.sort(col.termsEnum.getComparator());
final int[] docFreq = col.array.docFreq;
final float[] boost = col.array.boost;
for (int i = 0; i < size; i++) {
final int pos = sort[i];
final Term term = placeholderTerm.createTerm(col.terms.get(pos, new BytesRef()));
assert reader.docFreq(term) == docFreq[pos];
addClause(result, term, docFreq[pos], query.getBoost() * boost[pos]);
}
}
query.incTotalNumberOfTerms(size);
return result;
}
static final class ParallelArraysTermCollector extends TermCollector {
private final boolean checkMaxClauseCount;
final TermFreqBoostByteStart array = new TermFreqBoostByteStart(16);
final BytesRefHash terms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array);
TermsEnum termsEnum;
private BoostAttribute boostAtt;
public ParallelArraysTermCollector(boolean checkMaxClauseCount) {
this.checkMaxClauseCount = checkMaxClauseCount;
}
@Override
public void setNextEnum(TermsEnum termsEnum) throws IOException {
this.termsEnum = termsEnum;
this.boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
}
@Override
public boolean collect(BytesRef bytes) {
final int e = terms.add(bytes);
if (e < 0) {
// duplicate term: update docFreq
final int pos = (-e)-1;
array.docFreq[pos] += termsEnum.docFreq();
assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums";
} else {
// new entry: we populate the entry initially
array.docFreq[e] = termsEnum.docFreq();
array.boost[e] = boostAtt.getBoost();
}
// bail out with TooManyClauses once the number of unique terms reaches the max clause count
if (checkMaxClauseCount && e >= BooleanQuery.getMaxClauseCount())
throw new BooleanQuery.TooManyClauses();
return true;
}
}
/** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */
static final class TermFreqBoostByteStart extends DirectBytesStartArray {
int[] docFreq;
float[] boost;
public TermFreqBoostByteStart(int initSize) {
super(initSize);
}
@Override
public int[] init() {
final int[] ord = super.init();
boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)];
docFreq = new int[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_INT)];
assert boost.length >= ord.length && docFreq.length >= ord.length;
return ord;
}
@Override
public int[] grow() {
final int[] ord = super.grow();
docFreq = ArrayUtil.grow(docFreq, ord.length);
boost = ArrayUtil.grow(boost, ord.length);
assert boost.length >= ord.length && docFreq.length >= ord.length;
return ord;
}
@Override
public int[] clear() {
boost = null;
docFreq = null;
return super.clear();
}
}
}
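For context, a minimal usage sketch (not part of this patch) that selects the scoring boolean rewrite on a wildcard query; the field name, the term, and the surrounding IndexSearcher are assumptions:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoringRewrite;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;

public class ScoringRewriteExample {
  public static TopDocs search(IndexSearcher searcher) throws Exception {
    WildcardQuery wq = new WildcardQuery(new Term("body", "lucen*"));
    // expand matching terms into SHOULD clauses of a BooleanQuery and keep
    // the boosts/scores computed during term collection
    wq.setRewriteMethod(ScoringRewrite.SCORING_BOOLEAN_QUERY_REWRITE);
    return searcher.search(wq, 10);
  }
}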

View File

@ -0,0 +1,93 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Comparator;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.ReaderUtil;
abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.RewriteMethod {
/** Return a suitable top-level Query for holding all expanded terms. */
protected abstract Q getTopLevelQuery() throws IOException;
/** Add a MultiTermQuery term to the top-level query */
protected abstract void addClause(Q topLevel, Term term, int docCount, float boost) throws IOException;
protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
final List<IndexReader> subReaders = new ArrayList<IndexReader>();
ReaderUtil.gatherSubReaders(subReaders, reader);
Comparator<BytesRef> lastTermComp = null;
for (IndexReader r : subReaders) {
final Fields fields = r.fields();
if (fields == null) {
// reader has no fields
continue;
}
final Terms terms = fields.terms(query.field);
if (terms == null) {
// field does not exist
continue;
}
final TermsEnum termsEnum = query.getTermsEnum(r, collector.attributes);
assert termsEnum != null;
if (termsEnum == TermsEnum.EMPTY)
continue;
// Check comparator compatibility:
final Comparator<BytesRef> newTermComp = termsEnum.getComparator();
if (lastTermComp != null && newTermComp != lastTermComp)
throw new RuntimeException("term comparator should not change between segments: "+lastTermComp+" != "+newTermComp);
lastTermComp = newTermComp;
collector.setNextEnum(termsEnum);
BytesRef bytes;
while ((bytes = termsEnum.next()) != null) {
termsEnum.cacheCurrentTerm();
if (!collector.collect(bytes))
return; // interrupt whole term collection, so also don't iterate other subReaders
}
}
}
protected static abstract class TermCollector {
/** attributes used for communication with the enum */
public final AttributeSource attributes = new AttributeSource();
/** return false to stop collecting */
public abstract boolean collect(BytesRef bytes) throws IOException;
/** Sets the next segment's {@link TermsEnum} that is used to collect terms */
public abstract void setNextEnum(TermsEnum termsEnum) throws IOException;
}
}
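To illustrate the TermCollector contract (setNextEnum() is called once per segment, collect() once per matching term, and returning false aborts collection across all sub-readers), here is a hypothetical, uncommitted sketch of a rewrite that merely counts terms; CountingRewrite and its 1024 cut-off are made up for illustration:

package org.apache.lucene.search;

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

final class CountingRewrite extends TermCollectingRewrite<BooleanQuery> {
  @Override
  protected BooleanQuery getTopLevelQuery() {
    return new BooleanQuery(true);
  }

  @Override
  protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost) {
    // not used by this sketch
  }

  @Override
  public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
    final int[] count = new int[1];
    collectTerms(reader, query, new TermCollector() {
      @Override
      public void setNextEnum(TermsEnum termsEnum) {
        // nothing to prepare per segment in this sketch
      }

      @Override
      public boolean collect(BytesRef bytes) {
        // returning false stops collection for all remaining sub-readers
        return ++count[0] < 1024;
      }
    });
    return query; // a real rewrite would build and return the top-level query
  }
}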

View File

@ -0,0 +1,182 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Comparator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
/**
* Base rewrite method for collecting only the top terms
* via a priority queue.
* @lucene.internal Only public to be accessible by the spans package.
*/
public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRewrite<Q> {
private final int size;
/**
* Create a TopTermsRewrite for
* at most <code>size</code> terms.
* <p>
* NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than
* <code>size</code>, then it will be used instead.
*/
public TopTermsRewrite(int size) {
this.size = size;
}
/** return the maximum priority queue size */
public int getSize() {
return size;
}
/** return the maximum size of the priority queue (for boolean rewrites this is BooleanQuery#getMaxClauseCount). */
protected abstract int getMaxSize();
@Override
public final Q rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
final int maxSize = Math.min(size, getMaxSize());
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
collectTerms(reader, query, new TermCollector() {
private final MaxNonCompetitiveBoostAttribute maxBoostAtt =
attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);
private final Map<BytesRef,ScoreTerm> visitedTerms = new HashMap<BytesRef,ScoreTerm>();
private TermsEnum termsEnum;
private Comparator<BytesRef> termComp;
private BoostAttribute boostAtt;
private ScoreTerm st;
@Override
public void setNextEnum(TermsEnum termsEnum) throws IOException {
this.termsEnum = termsEnum;
this.termComp = termsEnum.getComparator();
// lazily initialize the first ScoreTerm, because the comparator is not known in the constructor:
if (st == null)
st = new ScoreTerm(this.termComp);
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
}
@Override
public boolean collect(BytesRef bytes) {
final float boost = boostAtt.getBoost();
// ignore uncompetitive hits
if (stQueue.size() == maxSize) {
final ScoreTerm t = stQueue.peek();
if (boost < t.boost)
return true;
if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
return true;
}
ScoreTerm t = visitedTerms.get(bytes);
if (t != null) {
// if the term is already in the PQ, only update docFreq of term in PQ
t.docFreq += termsEnum.docFreq();
assert t.boost == boost : "boost should be equal in all segment TermsEnums";
} else {
// add new entry in PQ, we must clone the term, else it may get overwritten!
st.bytes.copy(bytes);
st.boost = boost;
st.docFreq = termsEnum.docFreq();
visitedTerms.put(st.bytes, st);
stQueue.offer(st);
// possibly drop entries from queue
if (stQueue.size() > maxSize) {
st = stQueue.poll();
visitedTerms.remove(st.bytes);
} else {
st = new ScoreTerm(termComp);
}
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize
if (stQueue.size() == maxSize) {
t = stQueue.peek();
maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
maxBoostAtt.setCompetitiveTerm(t.bytes);
}
}
return true;
}
});
final Term placeholderTerm = new Term(query.field);
final Q q = getTopLevelQuery();
final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
ArrayUtil.quickSort(scoreTerms, scoreTermSortByTermComp);
for (final ScoreTerm st : scoreTerms) {
final Term term = placeholderTerm.createTerm(st.bytes);
assert reader.docFreq(term) == st.docFreq;
addClause(q, term, st.docFreq, query.getBoost() * st.boost); // add to query
}
query.incTotalNumberOfTerms(scoreTerms.length);
return q;
}
@Override
public int hashCode() {
return 31 * size;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
final TopTermsRewrite other = (TopTermsRewrite) obj;
if (size != other.size) return false;
return true;
}
private static final Comparator<ScoreTerm> scoreTermSortByTermComp =
new Comparator<ScoreTerm>() {
public int compare(ScoreTerm st1, ScoreTerm st2) {
assert st1.termComp == st2.termComp :
"term comparator should not change between segments";
return st1.termComp.compare(st1.bytes, st2.bytes);
}
};
static final class ScoreTerm implements Comparable<ScoreTerm> {
public final Comparator<BytesRef> termComp;
public final BytesRef bytes = new BytesRef();
public float boost;
public int docFreq;
public ScoreTerm(Comparator<BytesRef> termComp) {
this.termComp = termComp;
}
public int compareTo(ScoreTerm other) {
if (this.boost == other.boost)
return termComp.compare(other.bytes, this.bytes);
else
return Float.compare(this.boost, other.boost);
}
}
}
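A brief, uncommitted usage sketch of capping a fuzzy expansion through a top-terms rewrite; it assumes the pre-existing MultiTermQuery.TopTermsScoringBooleanQueryRewrite subclass and an IndexSearcher, and the field/term values are made up:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.TopDocs;

public class TopTermsRewriteExample {
  public static TopDocs search(IndexSearcher searcher) throws Exception {
    FuzzyQuery fq = new FuzzyQuery(new Term("body", "lucene"));
    // keep only the 50 highest-boosted fuzzy terms in the rewritten query
    fq.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(50));
    return searcher.search(fq, 10);
  }
}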

View File

@ -0,0 +1,234 @@
package org.apache.lucene.search.spans;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopTermsRewrite;
import org.apache.lucene.search.ScoringRewrite;
import org.apache.lucene.search.BooleanClause.Occur; // javadocs only
/**
* Wraps any {@link MultiTermQuery} as a {@link SpanQuery},
* so it can be nested within other SpanQuery classes.
* <p>
* The query is rewritten by default to a {@link SpanOrQuery} containing
* the expanded terms, but this can be customized.
* <p>
* Example:
* <blockquote><pre>
* {@code
* WildcardQuery wildcard = new WildcardQuery(new Term("field", "bro?n"));
* SpanQuery spanWildcard = new SpanMultiTermQueryWrapper<WildcardQuery>(wildcard);
* // do something with spanWildcard, such as use it in a SpanFirstQuery
* }
* </pre></blockquote>
*/
public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQuery {
protected final Q query;
/**
* Create a new SpanMultiTermQueryWrapper.
*
* @param query Query to wrap.
* <p>
* NOTE: This will call {@link MultiTermQuery#setRewriteMethod(MultiTermQuery.RewriteMethod)}
* on the wrapped <code>query</code>, changing its rewrite method to a suitable one for spans.
* Be sure not to change the rewrite method on the wrapped query afterwards; doing so will
* throw an {@link UnsupportedOperationException} when this query is rewritten!
*/
public SpanMultiTermQueryWrapper(Q query) {
this.query = query;
MultiTermQuery.RewriteMethod method = query.getRewriteMethod();
if (method instanceof TopTermsRewrite) {
final int pqsize = ((TopTermsRewrite) method).getSize();
setRewriteMethod(new TopTermsSpanBooleanQueryRewrite(pqsize));
} else {
setRewriteMethod(SCORING_SPAN_QUERY_REWRITE);
}
}
/**
* Expert: returns the rewriteMethod
*/
public final SpanRewriteMethod getRewriteMethod() {
final MultiTermQuery.RewriteMethod m = query.getRewriteMethod();
if (!(m instanceof SpanRewriteMethod))
throw new UnsupportedOperationException("You can only use SpanMultiTermQueryWrapper with a suitable SpanRewriteMethod.");
return (SpanRewriteMethod) m;
}
/**
* Expert: sets the rewrite method. Only a span rewrite method
* makes sense here.
*/
public final void setRewriteMethod(SpanRewriteMethod rewriteMethod) {
query.setRewriteMethod(rewriteMethod);
}
@Override
public Spans getSpans(IndexReader reader) throws IOException {
throw new UnsupportedOperationException("Query should have been rewritten");
}
@Override
public String getField() {
return query.getField();
}
@Override
public String toString(String field) {
StringBuilder builder = new StringBuilder();
builder.append("SpanMultiTermQueryWrapper(");
builder.append(query.toString(field));
builder.append(")");
return builder.toString();
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
final Query q = query.rewrite(reader);
if (!(q instanceof SpanQuery))
throw new UnsupportedOperationException("You can only use SpanMultiTermQueryWrapper with a suitable SpanRewriteMethod.");
return q;
}
@Override
public int hashCode() {
return 31 * query.hashCode();
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
final SpanMultiTermQueryWrapper other = (SpanMultiTermQueryWrapper) obj;
return query.equals(other.query);
}
/** Abstract class that defines how the query is rewritten. */
public static abstract class SpanRewriteMethod extends MultiTermQuery.RewriteMethod {
@Override
public abstract SpanQuery rewrite(IndexReader reader, MultiTermQuery query) throws IOException;
}
/**
* A rewrite method that first translates each term into a {@link SpanTermQuery} in a
* {@link SpanOrQuery}, and keeps the
* scores as computed by the query.
*
* @see #setRewriteMethod
*/
public final static SpanRewriteMethod SCORING_SPAN_QUERY_REWRITE = new SpanRewriteMethod() {
private final ScoringRewrite<SpanOrQuery> delegate = new ScoringRewrite<SpanOrQuery>() {
@Override
protected SpanOrQuery getTopLevelQuery() {
return new SpanOrQuery();
}
@Override
protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost) {
final SpanTermQuery q = new SpanTermQuery(term);
q.setBoost(boost);
topLevel.addClause(q);
}
};
@Override
public SpanQuery rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
return delegate.rewrite(reader, query);
}
// Make sure we are still a singleton even after deserializing
protected Object readResolve() {
return SCORING_SPAN_QUERY_REWRITE;
}
};
/**
* A rewrite method that first translates each term into a {@link SpanTermQuery} in a
* {@link SpanOrQuery}, and keeps the
* scores as computed by the query.
*
* <p>
* This rewrite method only uses the top scoring terms so it will not overflow
* the boolean max clause count.
*
* @see #setRewriteMethod
*/
public static final class TopTermsSpanBooleanQueryRewrite extends SpanRewriteMethod {
private final TopTermsRewrite<SpanOrQuery> delegate;
/**
* Create a TopTermsSpanBooleanQueryRewrite for
* at most <code>size</code> terms.
*/
public TopTermsSpanBooleanQueryRewrite(int size) {
delegate = new TopTermsRewrite<SpanOrQuery>(size) {
@Override
protected int getMaxSize() {
return Integer.MAX_VALUE;
}
@Override
protected SpanOrQuery getTopLevelQuery() {
return new SpanOrQuery();
}
@Override
protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost) {
final SpanTermQuery q = new SpanTermQuery(term);
q.setBoost(boost);
topLevel.addClause(q);
}
};
}
/** return the maximum priority queue size */
public int getSize() {
return delegate.getSize();
}
@Override
public SpanQuery rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
return delegate.rewrite(reader, query);
}
@Override
public int hashCode() {
return 31 * delegate.hashCode();
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
final TopTermsSpanBooleanQueryRewrite other = (TopTermsSpanBooleanQueryRewrite) obj;
return delegate.equals(other.delegate);
}
}
}
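A short, uncommitted sketch of the use case this wrapper enables per the CHANGES entry: nesting wildcard and fuzzy expansions inside a SpanNearQuery. Field/term values and the IndexSearcher are assumptions:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;

public class SpanWrapperExample {
  public static TopDocs search(IndexSearcher searcher) throws Exception {
    SpanQuery wild = new SpanMultiTermQueryWrapper<WildcardQuery>(
        new WildcardQuery(new Term("body", "quick*")));
    SpanQuery fuzzy = new SpanMultiTermQueryWrapper<FuzzyQuery>(
        new FuzzyQuery(new Term("body", "brwon")));
    // both clauses must match within 3 positions of each other, in order
    SpanQuery near = new SpanNearQuery(new SpanQuery[] { wild, fuzzy }, 3, true);
    return searcher.search(near, 10);
  }
}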

View File

@ -42,16 +42,20 @@ public class SpanOrQuery extends SpanQuery implements Cloneable {
// copy clauses array into an ArrayList
this.clauses = new ArrayList<SpanQuery>(clauses.length);
for (int i = 0; i < clauses.length; i++) {
SpanQuery clause = clauses[i];
if (i == 0) { // check field
field = clause.getField();
} else if (!clause.getField().equals(field)) {
throw new IllegalArgumentException("Clauses must have same field.");
}
this.clauses.add(clause);
addClause(clauses[i]);
}
}
/** Adds a clause to this query */
public final void addClause(SpanQuery clause) {
if (field == null) {
field = clause.getField();
} else if (!clause.getField().equals(field)) {
throw new IllegalArgumentException("Clauses must have same field.");
}
this.clauses.add(clause);
}
/** Return the clauses whose spans are matched. */
public SpanQuery[] getClauses() {
return clauses.toArray(new SpanQuery[clauses.size()]);
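A small, uncommitted sketch of the new incremental construction path that the span rewrite methods rely on; the field and terms are made up:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

public class SpanOrAddClauseExample {
  public static SpanQuery synonyms() {
    // build the disjunction one clause at a time via the new addClause method
    SpanOrQuery or = new SpanOrQuery();
    or.addClause(new SpanTermQuery(new Term("body", "quick")));
    or.addClause(new SpanTermQuery(new Term("body", "fast")));
    // a clause on a different field would throw IllegalArgumentException
    return or;
  }
}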

View File

@ -147,8 +147,8 @@ public class TestMultiTermQueryRewrites extends LuceneTestCase {
@Override
protected TermsEnum getTermsEnum(IndexReader reader, AttributeSource atts) throws IOException {
return new TermRangeTermsEnum(reader, field, "2", "7", true, true, null) {
final MultiTermQuery.BoostAttribute boostAtt =
attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
final BoostAttribute boostAtt =
attributes().addAttribute(BoostAttribute.class);
@Override
protected AcceptStatus accept(BytesRef term) {

View File

@ -0,0 +1,92 @@
package org.apache.lucene.search.spans;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
/**
* Tests for {@link SpanMultiTermQueryWrapper}, wrapping a few MultiTermQueries.
*/
public class TestSpanMultiTermQueryWrapper extends LuceneTestCase {
private Directory directory;
private IndexReader reader;
private Searcher searcher;
@Override
public void setUp() throws Exception {
super.setUp();
directory = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random, directory);
Document doc = new Document();
Field field = newField("field", "", Field.Store.NO, Field.Index.ANALYZED);
doc.add(field);
field.setValue("quick brown fox");
iw.addDocument(doc);
field.setValue("jumps over lazy broun dog");
iw.addDocument(doc);
field.setValue("jumps over extremely very lazy broxn dog");
iw.addDocument(doc);
reader = iw.getReader();
iw.close();
searcher = new IndexSearcher(reader);
}
@Override
public void tearDown() throws Exception {
searcher.close();
reader.close();
directory.close();
super.tearDown();
}
public void testWildcard() throws Exception {
WildcardQuery wq = new WildcardQuery(new Term("field", "bro?n"));
SpanQuery swq = new SpanMultiTermQueryWrapper<WildcardQuery>(wq);
// will only match quick brown fox
SpanFirstQuery sfq = new SpanFirstQuery(swq, 2);
assertEquals(1, searcher.search(sfq, 10).totalHits);
}
public void testFuzzy() throws Exception {
FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan"));
SpanQuery sfq = new SpanMultiTermQueryWrapper<FuzzyQuery>(fq);
// will not match quick brown fox
SpanPositionRangeQuery sprq = new SpanPositionRangeQuery(sfq, 3, 6);
assertEquals(2, searcher.search(sprq, 10).totalHits);
}
public void testFuzzy2() throws Exception {
// maximum of 1 term expansion
FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan"), 1f, 0, 1);
SpanQuery sfq = new SpanMultiTermQueryWrapper<FuzzyQuery>(fq);
// will only match jumps over lazy broun dog
SpanPositionRangeQuery sprq = new SpanPositionRangeQuery(sfq, 0, 100);
assertEquals(1, searcher.search(sprq, 10).totalHits);
}
}
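Finally, an uncommitted sketch complementing testFuzzy2: instead of limiting expansions on the FuzzyQuery itself, the wrapper's rewrite method can cap the span expansion at its best-scoring terms; the searcher and the limit of 10 are assumptions:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;

public class TopTermsSpanExample {
  public static TopDocs search(IndexSearcher searcher) throws Exception {
    SpanMultiTermQueryWrapper<FuzzyQuery> wrapper =
        new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("field", "broan")));
    // expand to at most the 10 highest-boosted terms, as a SpanOrQuery
    wrapper.setRewriteMethod(
        new SpanMultiTermQueryWrapper.TopTermsSpanBooleanQueryRewrite(10));
    return searcher.search(wrapper, 10);
  }
}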