diff --git a/src/main/java/org/apache/lucene/index/memory/ReusableMemoryIndex.java b/src/main/java/org/apache/lucene/index/memory/ReusableMemoryIndex.java index 3563331ac0e..5fa1c8eed3d 100644 --- a/src/main/java/org/apache/lucene/index/memory/ReusableMemoryIndex.java +++ b/src/main/java/org/apache/lucene/index/memory/ReusableMemoryIndex.java @@ -1,6 +1,5 @@ package org.apache.lucene.index.memory; -import org.apache.lucene.search.IndexSearcher; /* * Licensed to ElasticSearch and Shay Banon under one * or more contributor license agreements. See the NOTICE file diff --git a/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java b/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java new file mode 100644 index 00000000000..7e1e5eadd30 --- /dev/null +++ b/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java @@ -0,0 +1,52 @@ +package org.apache.lucene.queries; +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +import org.apache.lucene.search.BooleanClause.Occur; +import org.elasticsearch.common.lucene.search.Queries; + +/** + * Extended version of {@link CommonTermsQuery} that allows to pass in a + * minimumNumberShouldMatch specification that uses the actual num of high frequent terms + * to calculate the minimum matching terms. + */ +public class ExtendedCommonTermsQuery extends XCommonTermsQuery { + + public ExtendedCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency, boolean disableCoord) { + super(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoord); + } + + public ExtendedCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency) { + super(highFreqOccur, lowFreqOccur, maxTermFrequency); + } + + private String minNumShouldMatchSpec; + + @Override + protected int getMinimumNumberShouldMatch(int numOptional) { + if (minNumShouldMatchSpec == null) { + return 0; + } + return Queries.calculateMinShouldMatch(numOptional, minNumShouldMatchSpec); + } + + public void setMinimumNumberShouldMatch(String spec) { + this.minNumShouldMatchSpec = spec; + } + +} diff --git a/src/main/java/org/apache/lucene/queries/XCommonTermsQuery.java b/src/main/java/org/apache/lucene/queries/XCommonTermsQuery.java new file mode 100644 index 00000000000..f83649d8021 --- /dev/null +++ b/src/main/java/org/apache/lucene/queries/XCommonTermsQuery.java @@ -0,0 +1,381 @@ +package org.apache.lucene.queries; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.util.ToStringUtils; + +/** + * A query that executes high-frequency terms in a optional sub-query to prevent + * slow queries due to "common" terms like stopwords. This query basically + * builds 2 queries off the {@link #add(Term) added} terms where low-frequency + * terms are added to a required boolean clause and high-frequency terms are + * added to an optional boolean clause. The optional clause is only executed if + * the required "low-frequency' clause matches. Scores produced by this query + * will be slightly different to plain {@link BooleanQuery} scorer mainly due to + * differences in the {@link Similarity#coord(int,int) number of leave queries} + * in the required boolean clause. 
In most cases high-frequency terms are + * unlikely to significantly contribute to the document score unless at least + * one of the low-frequency terms is matched, so this query can improve + * query execution times significantly if applicable. + *

+ * {@link XCommonTermsQuery} has several advantages over stopword filtering at + * index or query time since a term can be "classified" based on the actual + * document frequency in the index and can prevent slow queries even across + * domains without specialized stopword files. + *

+ *

+ * Note: if the query only contains high-frequency terms the query is + * rewritten into a plain conjunction query, i.e. all high-frequency terms need to + * match in order to match a document. + *

+ */ +//LUCENE MONITOR - Copied from CommonTermsQuery changes are tracked with //CHANGE +public class XCommonTermsQuery extends Query { + /* + * TODO maybe it would make sense to abstract this even further and allow to + * rewrite to dismax rather than boolean. Yet, this can already be subclassed + * to do so. + */ + protected final List terms = new ArrayList(); + protected final boolean disableCoord; + protected final float maxTermFrequency; + protected final Occur lowFreqOccur; + protected final Occur highFreqOccur; + protected float lowFreqBoost = 1.0f; + protected float highFreqBoost = 1.0f; + //CHANGE made minNr... a float for fractions + protected float minNrShouldMatch = 0; + + /** + * Creates a new {@link XCommonTermsQuery} + * + * @param highFreqOccur + * {@link Occur} used for high frequency terms + * @param lowFreqOccur + * {@link Occur} used for low frequency terms + * @param maxTermFrequency + * a value in [0..1] (or absolute number >=1) representing the + * maximum threshold of a terms document frequency to be considered a + * low frequency term. + * @throws IllegalArgumentException + * if {@link Occur#MUST_NOT} is pass as lowFreqOccur or + * highFreqOccur + */ + public XCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, + float maxTermFrequency) { + this(highFreqOccur, lowFreqOccur, maxTermFrequency, false); + } + + /** + * Creates a new {@link XCommonTermsQuery} + * + * @param highFreqOccur + * {@link Occur} used for high frequency terms + * @param lowFreqOccur + * {@link Occur} used for low frequency terms + * @param maxTermFrequency + * a value in [0..1] (or absolute number >=1) representing the + * maximum threshold of a terms document frequency to be considered a + * low frequency term. 
+ * @param disableCoord + * disables {@link Similarity#coord(int,int)} in scoring for the low + * / high frequency sub-queries + * @throws IllegalArgumentException + * if {@link Occur#MUST_NOT} is pass as lowFreqOccur or + * highFreqOccur + */ + public XCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, + float maxTermFrequency, boolean disableCoord) { + if (highFreqOccur == Occur.MUST_NOT) { + throw new IllegalArgumentException( + "highFreqOccur should be MUST or SHOULD but was MUST_NOT"); + } + if (lowFreqOccur == Occur.MUST_NOT) { + throw new IllegalArgumentException( + "lowFreqOccur should be MUST or SHOULD but was MUST_NOT"); + } + this.disableCoord = disableCoord; + this.highFreqOccur = highFreqOccur; + this.lowFreqOccur = lowFreqOccur; + this.maxTermFrequency = maxTermFrequency; + } + + /** + * Adds a term to the {@link CommonTermsQuery} + * + * @param term + * the term to add + */ + public void add(Term term) { + if (term == null) { + throw new IllegalArgumentException("Term must not be null"); + } + this.terms.add(term); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + if (this.terms.isEmpty()) { + return new BooleanQuery(); + } else if (this.terms.size() == 1) { + final TermQuery tq = new TermQuery(this.terms.get(0)); + tq.setBoost(getBoost()); + return tq; + } + final List leaves = reader.leaves(); + final int maxDoc = reader.maxDoc(); + final TermContext[] contextArray = new TermContext[terms.size()]; + final Term[] queryTerms = this.terms.toArray(new Term[0]); + collectTermContext(reader, leaves, contextArray, queryTerms); + return buildQuery(maxDoc, contextArray, queryTerms); + } + + //CHANGE added to get num optional + protected int getMinimumNumberShouldMatch(int numOptional) { + if (minNrShouldMatch >= 1.0f) { + return (int) minNrShouldMatch; + } + return (int) (minNrShouldMatch * numOptional); + } + + protected Query buildQuery(final int maxDoc, + final TermContext[] contextArray, final Term[] queryTerms) 
{ + BooleanQuery lowFreq = new BooleanQuery(disableCoord); + BooleanQuery highFreq = new BooleanQuery(disableCoord); + highFreq.setBoost(highFreqBoost); + lowFreq.setBoost(lowFreqBoost); + + BooleanQuery query = new BooleanQuery(true); + + for (int i = 0; i < queryTerms.length; i++) { + TermContext termContext = contextArray[i]; + if (termContext == null) { + lowFreq.add(new TermQuery(queryTerms[i]), lowFreqOccur); + } else { + if ((maxTermFrequency >= 1f && termContext.docFreq() > maxTermFrequency) + || (termContext.docFreq() > (int) Math.ceil(maxTermFrequency + * (float) maxDoc))) { + highFreq + .add(new TermQuery(queryTerms[i], termContext), highFreqOccur); + } else { + lowFreq.add(new TermQuery(queryTerms[i], termContext), lowFreqOccur); + } + } + + } + if (lowFreqOccur == Occur.SHOULD) { + lowFreq.setMinimumNumberShouldMatch(getMinimumNumberShouldMatch(lowFreq.clauses().size())); + } + if (lowFreq.clauses().isEmpty()) { + /* + * if lowFreq is empty we rewrite the high freq terms in a conjunction to + * prevent slow queries. 
+ */ + if (highFreqOccur == Occur.MUST) { + highFreq.setBoost(getBoost()); + return highFreq; + } else { + BooleanQuery highFreqConjunction = new BooleanQuery(); + for (BooleanClause booleanClause : highFreq) { + highFreqConjunction.add(booleanClause.getQuery(), Occur.MUST); + } + highFreqConjunction.setBoost(getBoost()); + return highFreqConjunction; + + } + } else if (highFreq.clauses().isEmpty()) { + // only do low freq terms - we don't have high freq terms + lowFreq.setBoost(getBoost()); + return lowFreq; + } else { + query.add(highFreq, Occur.SHOULD); + query.add(lowFreq, Occur.MUST); + query.setBoost(getBoost()); + return query; + } + } + + public void collectTermContext(IndexReader reader, + List leaves, TermContext[] contextArray, + Term[] queryTerms) throws IOException { + TermsEnum termsEnum = null; + for (AtomicReaderContext context : leaves) { + final Fields fields = context.reader().fields(); + if (fields == null) { + // reader has no fields + continue; + } + for (int i = 0; i < queryTerms.length; i++) { + Term term = queryTerms[i]; + TermContext termContext = contextArray[i]; + final Terms terms = fields.terms(term.field()); + if (terms == null) { + // field does not exist + continue; + } + termsEnum = terms.iterator(termsEnum); + assert termsEnum != null; + + if (termsEnum == TermsEnum.EMPTY) continue; + if (termsEnum.seekExact(term.bytes(), false)) { + if (termContext == null) { + contextArray[i] = new TermContext(reader.getContext(), + termsEnum.termState(), context.ord, termsEnum.docFreq(), + termsEnum.totalTermFreq()); + } else { + termContext.register(termsEnum.termState(), context.ord, + termsEnum.docFreq(), termsEnum.totalTermFreq()); + } + + } + + } + } + } + + /** + * Returns true iff {@link Similarity#coord(int,int)} is disabled in scoring + * for the high and low frequency query instance. The top level query will + * always disable coords. 
+ */ + public boolean isCoordDisabled() { + return disableCoord; + } + + /** + * Specifies a minimum number of the optional BooleanClauses which must be + * satisfied in order to produce a match on the low frequency terms query + * part. + * + *

+ * By default no optional clauses are necessary for a match (unless there are + * no required clauses). If this method is used, then the specified number of + * clauses is required. + *

+ * + * @param min + * the number of optional clauses that must match + */ + //CHANGE accepts now a float + public void setMinimumNumberShouldMatch(float min) { + this.minNrShouldMatch = min; + } + + /** + * Gets the minimum number of the optional BooleanClauses which must be + * satisfied. + */ + //CHANGE returns now a float + public float getMinimumNumberShouldMatch() { + return minNrShouldMatch; + } + + @Override + public void extractTerms(Set terms) { + terms.addAll(this.terms); + } + + @Override + public String toString(String field) { + StringBuilder buffer = new StringBuilder(); + boolean needParens = (getBoost() != 1.0) + || (getMinimumNumberShouldMatch() > 0); + if (needParens) { + buffer.append("("); + } + for (int i = 0; i < terms.size(); i++) { + Term t = terms.get(i); + buffer.append(new TermQuery(t).toString()); + + if (i != terms.size() - 1) buffer.append(", "); + } + if (needParens) { + buffer.append(")"); + } + if (getMinimumNumberShouldMatch() > 0) { + buffer.append('~'); + buffer.append(getMinimumNumberShouldMatch()); + } + if (getBoost() != 1.0f) { + buffer.append(ToStringUtils.boost(getBoost())); + } + return buffer.toString(); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + (disableCoord ? 1231 : 1237); + result = prime * result + Float.floatToIntBits(highFreqBoost); + result = prime * result + + ((highFreqOccur == null) ? 0 : highFreqOccur.hashCode()); + result = prime * result + Float.floatToIntBits(lowFreqBoost); + result = prime * result + + ((lowFreqOccur == null) ? 0 : lowFreqOccur.hashCode()); + result = prime * result + Float.floatToIntBits(maxTermFrequency); + result = prime * result + Float.floatToIntBits(minNrShouldMatch); + result = prime * result + ((terms == null) ? 
0 : terms.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (!super.equals(obj)) return false; + if (getClass() != obj.getClass()) return false; + XCommonTermsQuery other = (XCommonTermsQuery) obj; + if (disableCoord != other.disableCoord) return false; + if (Float.floatToIntBits(highFreqBoost) != Float + .floatToIntBits(other.highFreqBoost)) return false; + if (highFreqOccur != other.highFreqOccur) return false; + if (Float.floatToIntBits(lowFreqBoost) != Float + .floatToIntBits(other.lowFreqBoost)) return false; + if (lowFreqOccur != other.lowFreqOccur) return false; + if (Float.floatToIntBits(maxTermFrequency) != Float + .floatToIntBits(other.maxTermFrequency)) return false; + if (minNrShouldMatch != other.minNrShouldMatch) return false; + if (terms == null) { + if (other.terms != null) return false; + } else if (!terms.equals(other.terms)) return false; + return true; + } + + //CHANGE added + public List terms() { + return this.terms; + } + +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java b/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java index d263897347c..941f1a8f45b 100644 --- a/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java @@ -20,6 +20,7 @@ package org.apache.lucene.search.vectorhighlight; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queries.ExtendedCommonTermsQuery; import org.apache.lucene.queries.FilterClause; import org.apache.lucene.search.*; import org.apache.lucene.search.spans.SpanTermQuery; @@ -97,9 +98,11 @@ public class CustomFieldQuery extends FieldQuery { } } else if (sourceQuery instanceof FiltersFunctionScoreQuery) { flatten(((FiltersFunctionScoreQuery) sourceQuery).getSubQuery(), reader, flatQueries); + } else if (sourceQuery instanceof ExtendedCommonTermsQuery) 
{ + flatten(((ExtendedCommonTermsQuery)sourceQuery).rewrite(reader), reader, flatQueries); } else { super.flatten(sourceQuery, reader, flatQueries); - } + } } void flatten(Filter sourceFilter, IndexReader reader, Collection flatQueries) throws IOException { diff --git a/src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java b/src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java index efc1ffa1231..565d161d297 100644 --- a/src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java +++ b/src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java @@ -19,6 +19,8 @@ package org.elasticsearch.index.query; +import static org.elasticsearch.index.query.support.QueryParsers.wrapSmartNameQuery; + import java.io.IOException; import org.apache.lucene.analysis.Analyzer; @@ -26,6 +28,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.index.Term; import org.apache.lucene.queries.CommonTermsQuery; +import org.apache.lucene.queries.ExtendedCommonTermsQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.Query; @@ -143,20 +146,15 @@ public class CommonTermsQueryParser implements QueryParser { if (value == null) { throw new QueryParsingException(parseContext.index(), "No text specified for text query"); } - CommonTermsQuery query = new CommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords); - int numTerms = parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer); - if (numTerms == 0) { - return null; - } - if (minimumShouldMatch != null) { - query.setMinimumNumberShouldMatch(Queries.calculateMinShouldMatch(numTerms, minimumShouldMatch)); - } + ExtendedCommonTermsQuery query = new ExtendedCommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords); query.setBoost(boost); - return 
query; + return parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer, minimumShouldMatch); } + - private final int parseQueryString(CommonTermsQuery query, String queryString, String fieldName, QueryParseContext parseContext, - String queryAnalyzer) throws IOException { + private final Query parseQueryString(ExtendedCommonTermsQuery query, String queryString, String fieldName, QueryParseContext parseContext, + String queryAnalyzer, String minimumShouldMatch) throws IOException { + FieldMapper mapper = null; String field; MapperService.SmartNameFieldMappers smartNameFieldMappers = parseContext.smartFieldMappers(fieldName); @@ -197,7 +195,11 @@ public class CommonTermsQueryParser implements QueryParser { query.add(new Term(field, ref)); count++; } - return count; - + + if (count == 0) { + return null; + } + query.setMinimumNumberShouldMatch(minimumShouldMatch); + return wrapSmartNameQuery(query, smartNameFieldMappers, parseContext); } } \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/index/query/MatchQueryBuilder.java b/src/main/java/org/elasticsearch/index/query/MatchQueryBuilder.java index c46cb880c61..809044d6221 100644 --- a/src/main/java/org/elasticsearch/index/query/MatchQueryBuilder.java +++ b/src/main/java/org/elasticsearch/index/query/MatchQueryBuilder.java @@ -86,6 +86,8 @@ public class MatchQueryBuilder extends BaseQueryBuilder implements BoostableQuer private Boolean fuzzyTranspositions = null; private ZeroTermsQuery zeroTermsQuery; + + private Float cutoff_Frequency = null; /** * Constructs a new text query. @@ -157,6 +159,16 @@ public class MatchQueryBuilder extends BaseQueryBuilder implements BoostableQuer this.maxExpansions = maxExpansions; return this; } + + /** + * Set a cutoff value in [0..1] (or absolute number >=1) representing the + * maximum threshold of a terms document frequency to be considered a low + * frequency term. 
+ */ + public MatchQueryBuilder cutoffFrequency(float cutoff) { + this.cutoff_Frequency = cutoff; + return this; + } public MatchQueryBuilder minimumShouldMatch(String minimumShouldMatch) { this.minimumShouldMatch = minimumShouldMatch; @@ -241,6 +253,10 @@ public class MatchQueryBuilder extends BaseQueryBuilder implements BoostableQuer if (zeroTermsQuery != null) { builder.field("zero_terms_query", zeroTermsQuery.toString()); } + if (cutoff_Frequency != null) { + builder.field("cutoff_frequency", cutoff_Frequency); + } + builder.endObject(); builder.endObject(); diff --git a/src/main/java/org/elasticsearch/index/query/MatchQueryParser.java b/src/main/java/org/elasticsearch/index/query/MatchQueryParser.java index 6af62596fea..4cf53028233 100644 --- a/src/main/java/org/elasticsearch/index/query/MatchQueryParser.java +++ b/src/main/java/org/elasticsearch/index/query/MatchQueryParser.java @@ -19,6 +19,8 @@ package org.elasticsearch.index.query; +import org.apache.lucene.queries.CommonTermsQuery; +import org.apache.lucene.queries.ExtendedCommonTermsQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; @@ -126,6 +128,8 @@ public class MatchQueryParser implements QueryParser { matchQuery.setTranspositions(parser.booleanValue()); } else if ("lenient".equals(currentFieldName)) { matchQuery.setLenient(parser.booleanValue()); + } else if ("cutoff_frequency".equals(currentFieldName)) { + matchQuery.setCommonTermsCutoff(parser.floatValue()); } else if ("zero_terms_query".equals(currentFieldName)) { String zeroTermsDocs = parser.text(); if ("none".equalsIgnoreCase(zeroTermsDocs)) { @@ -161,8 +165,9 @@ public class MatchQueryParser implements QueryParser { if (query instanceof BooleanQuery) { Queries.applyMinimumShouldMatch((BooleanQuery) query, minimumShouldMatch); + } else if (query instanceof ExtendedCommonTermsQuery) { + 
((ExtendedCommonTermsQuery)query).setMinimumNumberShouldMatch(minimumShouldMatch); } - query.setBoost(boost); return query; } diff --git a/src/main/java/org/elasticsearch/index/query/MultiMatchQueryBuilder.java b/src/main/java/org/elasticsearch/index/query/MultiMatchQueryBuilder.java index 67bf93d9888..289bda22dcb 100644 --- a/src/main/java/org/elasticsearch/index/query/MultiMatchQueryBuilder.java +++ b/src/main/java/org/elasticsearch/index/query/MultiMatchQueryBuilder.java @@ -67,6 +67,8 @@ public class MultiMatchQueryBuilder extends BaseQueryBuilder implements Boostabl private Boolean lenient; + private Float cutoffFrequency = null; + /** * Constructs a new text query. */ @@ -191,6 +193,17 @@ public class MultiMatchQueryBuilder extends BaseQueryBuilder implements Boostabl this.lenient = lenient; return this; } + + + /** + * Set a cutoff value in [0..1] (or absolute number >=1) representing the + * maximum threshold of a terms document frequency to be considered a low + * frequency term. 
+ */ + public MultiMatchQueryBuilder cutoffFrequency(float cutoff) { + this.cutoffFrequency = cutoff; + return this; + } @Override public void doXContent(XContentBuilder builder, Params params) throws IOException { @@ -255,6 +268,10 @@ public class MultiMatchQueryBuilder extends BaseQueryBuilder implements Boostabl if (lenient != null) { builder.field("lenient", lenient); } + + if (cutoffFrequency != null) { + builder.field("cutoff_frequency", cutoffFrequency); + } builder.endObject(); } diff --git a/src/main/java/org/elasticsearch/index/query/MultiMatchQueryParser.java b/src/main/java/org/elasticsearch/index/query/MultiMatchQueryParser.java index ca8466fc41a..4a1ba050958 100644 --- a/src/main/java/org/elasticsearch/index/query/MultiMatchQueryParser.java +++ b/src/main/java/org/elasticsearch/index/query/MultiMatchQueryParser.java @@ -145,6 +145,8 @@ public class MultiMatchQueryParser implements QueryParser { multiMatchQuery.setUseDisMax(parser.booleanValue()); } else if ("tie_breaker".equals(currentFieldName) || "tieBreaker".equals(currentFieldName)) { multiMatchQuery.setTieBreaker(parser.floatValue()); + } else if ("cutoff_frequency".equals(currentFieldName)) { + multiMatchQuery.setCommonTermsCutoff(parser.floatValue()); } else if ("lenient".equals(currentFieldName)) { multiMatchQuery.setLenient(parser.booleanValue()); } else { diff --git a/src/main/java/org/elasticsearch/index/search/MatchQuery.java b/src/main/java/org/elasticsearch/index/search/MatchQuery.java index d057a95deb4..efa7eab3f3a 100644 --- a/src/main/java/org/elasticsearch/index/search/MatchQuery.java +++ b/src/main/java/org/elasticsearch/index/search/MatchQuery.java @@ -25,6 +25,8 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.index.Term; +import org.apache.lucene.queries.CommonTermsQuery; +import 
org.apache.lucene.queries.ExtendedCommonTermsQuery; import org.apache.lucene.search.*; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; @@ -70,19 +72,24 @@ public class MatchQuery { protected int phraseSlop = 0; protected String fuzziness = null; + protected int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength; + protected int maxExpansions = FuzzyQuery.defaultMaxExpansions; + //LUCENE 4 UPGRADE we need a default value for this! protected boolean transpositions = false; - protected MultiTermQuery.RewriteMethod rewriteMethod; + protected MultiTermQuery.RewriteMethod fuzzyRewriteMethod; protected boolean lenient; protected ZeroTermsQuery zeroTermsQuery = ZeroTermsQuery.NONE; - + + protected Float commonTermsCutoff = null; + public MatchQuery(QueryParseContext parseContext) { this.parseContext = parseContext; } @@ -94,6 +101,10 @@ public class MatchQuery { public void setOccur(BooleanClause.Occur occur) { this.occur = occur; } + + public void setCommonTermsCutoff(float cutoff) { + this.commonTermsCutoff = Float.valueOf(cutoff); + } public void setEnablePositionIncrements(boolean enablePositionIncrements) { this.enablePositionIncrements = enablePositionIncrements; @@ -221,19 +232,27 @@ public class MatchQuery { if (numTokens == 1) { boolean hasNext = buffer.incrementToken(); assert hasNext == true; - //LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8 - final Query q = newTermQuery(mapper, new Term(field, termToByteRef(termAtt, new BytesRef()))); + final Query q = newTermQuery(mapper, new Term(field, termToByteRef(termAtt))); return wrapSmartNameQuery(q, smartNameFieldMappers, parseContext); } - BooleanQuery q = new BooleanQuery(positionCount == 1); - for (int i = 0; i < numTokens; i++) { - boolean hasNext = buffer.incrementToken(); - assert hasNext == true; - //LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8 - final Query currentQuery = newTermQuery(mapper, new 
Term(field, termToByteRef(termAtt, new BytesRef()))); - q.add(currentQuery, occur); + if (commonTermsCutoff != null) { + ExtendedCommonTermsQuery q = new ExtendedCommonTermsQuery(occur, occur, commonTermsCutoff, positionCount == 1); + for (int i = 0; i < numTokens; i++) { + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + q.add(new Term(field, termToByteRef(termAtt))); + } + return wrapSmartNameQuery(q, smartNameFieldMappers, parseContext); + } else { + BooleanQuery q = new BooleanQuery(positionCount == 1); + for (int i = 0; i < numTokens; i++) { + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + final Query currentQuery = newTermQuery(mapper, new Term(field, termToByteRef(termAtt))); + q.add(currentQuery, occur); + } + return wrapSmartNameQuery(q, smartNameFieldMappers, parseContext); } - return wrapSmartNameQuery(q, smartNameFieldMappers, parseContext); } else if (type == Type.PHRASE) { if (severalTokensAtSamePosition) { final MultiPhraseQuery mpq = new MultiPhraseQuery(); @@ -256,7 +275,7 @@ public class MatchQuery { } position += positionIncrement; //LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8 - multiTerms.add(new Term(field, termToByteRef(termAtt, new BytesRef()))); + multiTerms.add(new Term(field, termToByteRef(termAtt))); } if (enablePositionIncrements) { mpq.add(multiTerms.toArray(new Term[multiTerms.size()]), position); @@ -277,9 +296,9 @@ public class MatchQuery { if (enablePositionIncrements) { position += positionIncrement; //LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8 - pq.add(new Term(field, termToByteRef(termAtt, new BytesRef())), position); + pq.add(new Term(field, termToByteRef(termAtt)), position); } else { - pq.add(new Term(field, termToByteRef(termAtt, new BytesRef()))); + pq.add(new Term(field, termToByteRef(termAtt))); } } return wrapSmartNameQuery(pq, smartNameFieldMappers, parseContext); @@ -305,8 +324,7 @@ public 
class MatchQuery { multiTerms.clear(); } position += positionIncrement; - //LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8 - multiTerms.add(new Term(field, termToByteRef(termAtt, new BytesRef()))); + multiTerms.add(new Term(field, termToByteRef(termAtt))); } if (enablePositionIncrements) { mpq.add(multiTerms.toArray(new Term[multiTerms.size()]), position); @@ -343,8 +361,9 @@ public class MatchQuery { } return new TermQuery(term); } - - private static BytesRef termToByteRef(CharTermAttribute attr, BytesRef ref) { + + private static BytesRef termToByteRef(CharTermAttribute attr) { + final BytesRef ref = new BytesRef(); UnicodeUtil.UTF16toUTF8(attr.buffer(), 0, attr.length(), ref); return ref; } diff --git a/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java b/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java index 4a9edd6a5d2..c118893ec5a 100644 --- a/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java +++ b/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java @@ -344,7 +344,7 @@ public class SearchServiceTransportAction extends AbstractComponent { try { FetchSearchResult result = searchService.executeFetchPhase(request); listener.onResult(result); - } catch (Exception e) { + } catch (Throwable e) { listener.onFailure(e); } } else { @@ -378,7 +378,7 @@ public class SearchServiceTransportAction extends AbstractComponent { try { QuerySearchResult result = searchService.executeScan(request); listener.onResult(result); - } catch (Exception e) { + } catch (Throwable e) { listener.onFailure(e); } } else { diff --git a/src/main/java/org/elasticsearch/search/highlight/CustomQueryScorer.java b/src/main/java/org/elasticsearch/search/highlight/CustomQueryScorer.java index 24b71a692bb..44a7ce11097 100644 --- a/src/main/java/org/elasticsearch/search/highlight/CustomQueryScorer.java +++ 
b/src/main/java/org/elasticsearch/search/highlight/CustomQueryScorer.java @@ -20,9 +20,14 @@ package org.elasticsearch.search.highlight; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.XCommonTermsQuery; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.FilteredQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.WeightedSpanTerm; import org.apache.lucene.search.highlight.WeightedSpanTermExtractor; @@ -31,6 +36,7 @@ import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery; import java.io.IOException; +import java.util.List; import java.util.Map; public final class CustomQueryScorer extends QueryScorer { @@ -97,6 +103,14 @@ public final class CustomQueryScorer extends QueryScorer { } else if (query instanceof XFilteredQuery) { query = ((XFilteredQuery) query).getQuery(); extract(query, terms); + } else if (query instanceof XCommonTermsQuery) { /* re-express the query as a disjunction of its terms and extract from that BooleanQuery instead */ + XCommonTermsQuery ctq = ((XCommonTermsQuery)query); + List<Term> ctqTerms = ctq.terms(); + BooleanQuery bq = new BooleanQuery(); + for (Term term : ctqTerms) { + bq.add(new TermQuery(term), Occur.SHOULD); + } + extract(bq, terms); + } } diff --git a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java index a06a43649e1..24edaef385b 100644 --- a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java +++ b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java @@ -947,6 +947,59 @@ public class
HighlighterSearchTests extends AbstractNodesTests { assertThat(response.hits().hits()[0].highlightFields().get("tags").fragments()[0].string(), equalTo("this is a really long tag i would like to highlight")); assertThat(response.hits().hits()[0].highlightFields().get("tags").fragments()[1].string(), equalTo("here is another one that is very long and has the tag token near the end")); } + + @Test + public void testCommonTermsQuery() { /* highlights a common-terms query on field2 in an index created without an explicit mapping */ + try { + client.admin().indices().prepareDelete("test").execute().actionGet(); + } catch (IndexMissingException e) { + // it's ok + } + client.admin().indices().prepareCreate("test").execute().actionGet(); + client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().execute().actionGet(); + + client.prepareIndex("test", "type1") + .setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog") + .setRefresh(true).execute().actionGet(); + + logger.info("--> highlighting and searching on field2"); + SearchSourceBuilder source = searchSource() + .query(commonTerms("field2", "quick brown").cutoffFrequency(100)) + .from(0).size(60).explain(true) + .highlight(highlight().field("field2").order("score").preTags("").postTags("")); + + SearchResponse searchResponse = client.search(searchRequest("test").source(source).searchType(QUERY_THEN_FETCH).scroll(timeValueMinutes(10))).actionGet(); + assertThat("Failures " + Arrays.toString(searchResponse.shardFailures()), searchResponse.shardFailures().length, equalTo(0)); + assertThat(searchResponse.hits().totalHits(), equalTo(1l)); + + assertThat(searchResponse.hits().getAt(0).highlightFields().get("field2").fragments()[0].string(), equalTo("The quick brown fox jumps over the lazy dog")); + } + + @Test + public void testCommonTermsTermVector() throws ElasticSearchException, IOException { /* same scenario as testCommonTermsQuery, but the index is created with type1TermVectorMapping() */ + try { + client.admin().indices().prepareDelete("test").execute().actionGet(); + } catch (Exception e) { + // ignore + }
+ client.admin().indices().prepareCreate("test").addMapping("type1", type1TermVectorMapping()).execute().actionGet(); + client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().execute().actionGet(); + + client.prepareIndex("test", "type1").setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog") + .setRefresh(true).execute().actionGet(); + + logger.info("--> highlighting and searching on field2"); + SearchSourceBuilder source = searchSource().query(commonTerms("field2", "quick brown").cutoffFrequency(100)).from(0).size(60) + .explain(true).highlight(highlight().field("field2").order("score").preTags("").postTags("")); + + SearchResponse searchResponse = client.search( + searchRequest("test").source(source).searchType(QUERY_THEN_FETCH).scroll(timeValueMinutes(10))).actionGet(); + assertThat("Failures " + Arrays.toString(searchResponse.shardFailures()), searchResponse.shardFailures().length, equalTo(0)); + assertThat(searchResponse.hits().totalHits(), equalTo(1l)); + + assertThat(searchResponse.hits().getAt(0).highlightFields().get("field2").fragments()[0].string(), + equalTo("The quick brown fox jumps over the lazy dog")); + } @Test public void testPlainHighlightDifferentFragmenter() throws Exception { diff --git a/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java b/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java index f0fe9fde574..26aecb35756 100644 --- a/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java +++ b/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java @@ -120,7 +120,7 @@ public class SimpleQueryTests extends AbstractNodesTests { client.prepareIndex("test", "type1", "1").setSource("field1", "the quick brown fox").execute().actionGet(); client.prepareIndex("test", "type1", "2").setSource("field1", "the quick lazy huge brown fox jumps over the tree").execute().actionGet(); -
client.prepareIndex("test", "type1", "3").setSource("field1", "quick lazy huge brown").setRefresh(true).execute().actionGet(); + client.prepareIndex("test", "type1", "3").setSource("field1", "quick lazy huge brown", "field2", "the quick lazy huge brown fox jumps over the tree").setRefresh(true).execute().actionGet(); SearchResponse searchResponse = client.prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3)).execute().actionGet(); assertThat(searchResponse.hits().totalHits(), equalTo(2l)); @@ -140,6 +140,32 @@ public class SimpleQueryTests extends AbstractNodesTests { assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1")); assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("3")); assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("2")); + + // try the same with match query + searchResponse = client.prepareSearch().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND)).execute().actionGet(); + assertThat(searchResponse.hits().totalHits(), equalTo(2l)); + assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1")); + assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2")); + + searchResponse = client.prepareSearch().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.OR)).execute().actionGet(); + assertThat(searchResponse.hits().totalHits(), equalTo(3l)); + assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1")); + assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2")); + assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("3")); + + searchResponse = client.prepareSearch().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND).analyzer("standard")).execute().actionGet(); + 
assertThat(searchResponse.hits().totalHits(), equalTo(3l)); + // standard drops "the" since its a stopword + assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1")); + assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("3")); + assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("2")); + + // try the same with multi match query + searchResponse = client.prepareSearch().setQuery(QueryBuilders.multiMatchQuery("the quick brown", "field1", "field2").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND)).execute().actionGet(); + assertThat(searchResponse.hits().totalHits(), equalTo(3l)); + assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("3")); // better score due to different query stats + assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("1")); + assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("2")); } @Test