diff --git a/src/main/java/org/apache/lucene/index/memory/ReusableMemoryIndex.java b/src/main/java/org/apache/lucene/index/memory/ReusableMemoryIndex.java
index 3563331ac0e..5fa1c8eed3d 100644
--- a/src/main/java/org/apache/lucene/index/memory/ReusableMemoryIndex.java
+++ b/src/main/java/org/apache/lucene/index/memory/ReusableMemoryIndex.java
@@ -1,6 +1,5 @@
package org.apache.lucene.index.memory;
-import org.apache.lucene.search.IndexSearcher;
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
diff --git a/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java b/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java
new file mode 100644
index 00000000000..7e1e5eadd30
--- /dev/null
+++ b/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java
@@ -0,0 +1,52 @@
+package org.apache.lucene.queries;
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.elasticsearch.common.lucene.search.Queries;
+
+/**
+ * Extended version of {@link XCommonTermsQuery} that takes the
+ * minimum-should-match requirement as an unparsed specification string. The
+ * specification is resolved lazily in {@link #getMinimumNumberShouldMatch(int)},
+ * once the actual number of optional clauses is known.
+ * {@link XCommonTermsQuery#buildQuery} passes the number of low-frequency
+ * clauses to that method.
+ * <p>
+ * The float based setting inherited from {@link XCommonTermsQuery} is not
+ * consulted by this class; only the specification string is.
+ */
+public class ExtendedCommonTermsQuery extends XCommonTermsQuery {
+
+    /** Raw minimum-should-match specification; {@code null} means no minimum is enforced. */
+    private String minNumShouldMatchSpec;
+
+    public ExtendedCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency, boolean disableCoord) {
+        super(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoord);
+    }
+
+    public ExtendedCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency) {
+        super(highFreqOccur, lowFreqOccur, maxTermFrequency);
+    }
+
+    /**
+     * Resolves the stored specification against the given clause count.
+     *
+     * @param numOptional number of optional clauses the specification is applied to
+     * @return {@code 0} when no specification was set, otherwise the value
+     *         computed by {@link Queries#calculateMinShouldMatch}
+     */
+    @Override
+    protected int getMinimumNumberShouldMatch(int numOptional) {
+        if (minNumShouldMatchSpec == null) {
+            return 0;
+        }
+        return Queries.calculateMinShouldMatch(numOptional, minNumShouldMatchSpec);
+    }
+
+    /**
+     * Sets the minimum-should-match specification.
+     *
+     * @param spec the specification string, or {@code null} to enforce no minimum
+     */
+    public void setMinimumNumberShouldMatch(String spec) {
+        this.minNumShouldMatchSpec = spec;
+    }
+
+    /**
+     * Returns the specification set via {@link #setMinimumNumberShouldMatch(String)}.
+     *
+     * @return the raw specification, or {@code null} if none was set
+     */
+    public String getMinimumNumberShouldMatchSpec() {
+        return minNumShouldMatchSpec;
+    }
+
+    /**
+     * Extends the inherited hash with the specification string so that it
+     * stays consistent with {@link #equals(Object)}.
+     */
+    @Override
+    public int hashCode() {
+        final int specHash = minNumShouldMatchSpec == null ? 0 : minNumShouldMatchSpec.hashCode();
+        return 31 * super.hashCode() + specHash;
+    }
+
+    /**
+     * Two instances are equal when the superclass considers them equal and
+     * both carry the same specification string. Without this check, queries
+     * that differ only in their minimum-should-match specification would be
+     * treated as interchangeable.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (obj == null || getClass() != obj.getClass() || !super.equals(obj)) {
+            return false;
+        }
+        final ExtendedCommonTermsQuery other = (ExtendedCommonTermsQuery) obj;
+        if (minNumShouldMatchSpec == null) {
+            return other.minNumShouldMatchSpec == null;
+        }
+        return minNumShouldMatchSpec.equals(other.minNumShouldMatchSpec);
+    }
+
+}
diff --git a/src/main/java/org/apache/lucene/queries/XCommonTermsQuery.java b/src/main/java/org/apache/lucene/queries/XCommonTermsQuery.java
new file mode 100644
index 00000000000..f83649d8021
--- /dev/null
+++ b/src/main/java/org/apache/lucene/queries/XCommonTermsQuery.java
@@ -0,0 +1,381 @@
+package org.apache.lucene.queries;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.util.ToStringUtils;
+
+/**
+ * A query that executes high-frequency terms in an optional sub-query to prevent
+ * slow queries due to "common" terms like stopwords. This query basically
+ * builds 2 queries off the {@link #add(Term) added} terms where low-frequency
+ * terms are added to a required boolean clause and high-frequency terms are
+ * added to an optional boolean clause. The optional clause is only executed if
+ * the required "low-frequency" clause matches. Scores produced by this query
+ * will be slightly different to plain {@link BooleanQuery} scorer mainly due to
+ * differences in the {@link Similarity#coord(int,int) number of leaf queries}
+ * in the required boolean clause. In most cases high-frequency terms are
+ * unlikely to significantly contribute to the document score unless at least
+ * one of the low-frequency terms is matched such that this query can improve
+ * query execution times significantly if applicable.
+ *
+ * {@link XCommonTermsQuery} has several advantages over stopword filtering at
+ * index or query time since a term can be "classified" based on the actual
+ * document frequency in the index and can prevent slow queries even across
+ * domains without specialized stopword files.
+ *
+ *
+ * Note: if the query only contains high-frequency terms the query is
+ * rewritten into a plain conjunction query ie. all high-frequency terms need to
+ * match in order to match a document.
+ *
+ */
+//LUCENE MONITOR - Copied from CommonTermsQuery changes are tracked with //CHANGE
+public class XCommonTermsQuery extends Query {
+ /*
+ * TODO maybe it would make sense to abstract this even further and allow to
+ * rewrite to dismax rather than boolean. Yet, this can already be subclassed
+ * to do so.
+ */
+  /** Terms in the order they were {@link #add(Term) added}. */
+  protected final List<Term> terms = new ArrayList<Term>();
+  /** Passed to the low- and high-frequency sub-queries to disable coord scoring. */
+  protected final boolean disableCoord;
+  /**
+   * Document-frequency cutoff. Values of 1 or more are compared against the
+   * document frequency directly; smaller values are scaled by maxDoc first.
+   */
+  protected final float maxTermFrequency;
+  /** Occurrence used for every low-frequency term clause. */
+  protected final Occur lowFreqOccur;
+  /** Occurrence used for every high-frequency term clause. */
+  protected final Occur highFreqOccur;
+  /** Boost applied to the low-frequency sub-query. */
+  protected float lowFreqBoost = 1.0f;
+  /** Boost applied to the high-frequency sub-query. */
+  protected float highFreqBoost = 1.0f;
+  //CHANGE made minNr... a float for fractions
+  /** Values of 1 or more are absolute clause counts; smaller values are fractions. */
+  protected float minNrShouldMatch = 0;
+
+  /**
+   * Creates a new {@link XCommonTermsQuery} that leaves
+   * {@link Similarity#coord(int,int)} enabled for its sub-queries.
+   *
+   * @param highFreqOccur
+   *          {@link Occur} applied to high frequency terms
+   * @param lowFreqOccur
+   *          {@link Occur} applied to low frequency terms
+   * @param maxTermFrequency
+   *          a value in [0..1] (or absolute number >=1) giving the highest
+   *          document frequency a term may have and still be treated as a
+   *          low frequency term
+   * @throws IllegalArgumentException
+   *           if {@link Occur#MUST_NOT} is passed as lowFreqOccur or
+   *           highFreqOccur
+   */
+  public XCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency) {
+    this(highFreqOccur, lowFreqOccur, maxTermFrequency, false);
+  }
+
+  /**
+   * Creates a new {@link XCommonTermsQuery}.
+   *
+   * @param highFreqOccur
+   *          {@link Occur} applied to high frequency terms
+   * @param lowFreqOccur
+   *          {@link Occur} applied to low frequency terms
+   * @param maxTermFrequency
+   *          a value in [0..1] (or absolute number >=1) giving the highest
+   *          document frequency a term may have and still be treated as a
+   *          low frequency term
+   * @param disableCoord
+   *          turns off {@link Similarity#coord(int,int)} when scoring the low
+   *          and high frequency sub-queries
+   * @throws IllegalArgumentException
+   *           if {@link Occur#MUST_NOT} is passed as lowFreqOccur or
+   *           highFreqOccur
+   */
+  public XCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur,
+      float maxTermFrequency, boolean disableCoord) {
+    // Prohibited clauses are rejected up front; the high-frequency argument is checked first.
+    if (Occur.MUST_NOT == highFreqOccur) {
+      throw new IllegalArgumentException("highFreqOccur should be MUST or SHOULD but was MUST_NOT");
+    }
+    if (Occur.MUST_NOT == lowFreqOccur) {
+      throw new IllegalArgumentException("lowFreqOccur should be MUST or SHOULD but was MUST_NOT");
+    }
+    this.highFreqOccur = highFreqOccur;
+    this.lowFreqOccur = lowFreqOccur;
+    this.maxTermFrequency = maxTermFrequency;
+    this.disableCoord = disableCoord;
+  }
+
+  /**
+   * Appends a term to this {@link XCommonTermsQuery}.
+   *
+   * @param term
+   *          the term to append
+   * @throws IllegalArgumentException
+   *           if {@code term} is {@code null}
+   */
+  public void add(Term term) {
+    if (null == term) {
+      throw new IllegalArgumentException("Term must not be null");
+    }
+    terms.add(term);
+  }
+
+  /**
+   * Rewrites this query into primitive queries.
+   * <ul>
+   * <li>no terms: an empty {@link BooleanQuery};</li>
+   * <li>exactly one term: a {@link TermQuery} carrying this query's boost;</li>
+   * <li>otherwise: the term statistics are collected over all leaves of the
+   * reader and handed to {@link #buildQuery}.</li>
+   * </ul>
+   *
+   * @param reader the reader whose leaves and maxDoc are used for the rewrite
+   * @return the rewritten query
+   * @throws IOException if collecting the term statistics fails
+   */
+  @Override
+  public Query rewrite(IndexReader reader) throws IOException {
+    if (this.terms.isEmpty()) {
+      return new BooleanQuery();
+    } else if (this.terms.size() == 1) {
+      final TermQuery tq = new TermQuery(this.terms.get(0));
+      tq.setBoost(getBoost());
+      return tq;
+    }
+    final List<AtomicReaderContext> leaves = reader.leaves();
+    final int maxDoc = reader.maxDoc();
+    // contextArray[i] holds the statistics collected for queryTerms[i]
+    final TermContext[] contextArray = new TermContext[terms.size()];
+    final Term[] queryTerms = this.terms.toArray(new Term[0]);
+    collectTermContext(reader, leaves, contextArray, queryTerms);
+    return buildQuery(maxDoc, contextArray, queryTerms);
+  }
+
+  /**
+   * Turns the configured minimum-should-match value into a clause count.
+   * A configured value of 1 or more is used as is; a smaller value is
+   * multiplied by {@code numOptional}. The result is truncated to an int.
+   *
+   * @param numOptional number of optional clauses the fraction is applied to
+   * @return the number of optional clauses that have to match
+   */
+  //CHANGE added to get num optional
+  protected int getMinimumNumberShouldMatch(int numOptional) {
+    final float required = minNrShouldMatch >= 1.0f
+        ? minNrShouldMatch
+        : minNrShouldMatch * numOptional;
+    return (int) required;
+  }
+
+  /**
+   * Splits the query terms into a low-frequency and a high-frequency boolean
+   * query and combines them.
+   * <p>
+   * A term without collected statistics goes to the low-frequency query. A
+   * term is high-frequency when its document frequency exceeds the cutoff,
+   * taken either as an absolute count (cutoff of 1 or more) or as a fraction
+   * of {@code maxDoc}.
+   *
+   * @param maxDoc maxDoc of the reader the statistics were collected from
+   * @param contextArray statistics per term, {@code null} entries allowed
+   * @param queryTerms the terms, index-aligned with {@code contextArray}
+   * @return the combined query, carrying this query's boost
+   */
+  protected Query buildQuery(final int maxDoc,
+      final TermContext[] contextArray, final Term[] queryTerms) {
+    final BooleanQuery lowFreq = new BooleanQuery(disableCoord);
+    lowFreq.setBoost(lowFreqBoost);
+    final BooleanQuery highFreq = new BooleanQuery(disableCoord);
+    highFreq.setBoost(highFreqBoost);
+
+    // Loop invariant: the cutoff interpreted as a fraction of maxDoc.
+    final int relativeCutoff = (int) Math.ceil(maxTermFrequency * (float) maxDoc);
+    for (int i = 0; i < queryTerms.length; i++) {
+      final Term term = queryTerms[i];
+      final TermContext termContext = contextArray[i];
+      if (termContext == null) {
+        lowFreq.add(new TermQuery(term), lowFreqOccur);
+        continue;
+      }
+      final int docFreq = termContext.docFreq();
+      final boolean overAbsolute = maxTermFrequency >= 1f && docFreq > maxTermFrequency;
+      final boolean overRelative = docFreq > relativeCutoff;
+      final TermQuery termQuery = new TermQuery(term, termContext);
+      if (overAbsolute || overRelative) {
+        highFreq.add(termQuery, highFreqOccur);
+      } else {
+        lowFreq.add(termQuery, lowFreqOccur);
+      }
+    }
+
+    if (lowFreqOccur == Occur.SHOULD) {
+      final int numLowFreq = lowFreq.clauses().size();
+      lowFreq.setMinimumNumberShouldMatch(getMinimumNumberShouldMatch(numLowFreq));
+    }
+
+    if (lowFreq.clauses().isEmpty()) {
+      // Only high-frequency terms: run them as a conjunction to prevent slow queries.
+      if (highFreqOccur == Occur.MUST) {
+        highFreq.setBoost(getBoost());
+        return highFreq;
+      }
+      final BooleanQuery conjunction = new BooleanQuery();
+      for (BooleanClause clause : highFreq) {
+        conjunction.add(clause.getQuery(), Occur.MUST);
+      }
+      conjunction.setBoost(getBoost());
+      return conjunction;
+    }
+
+    if (highFreq.clauses().isEmpty()) {
+      // No high-frequency terms: the low-frequency query is the whole query.
+      lowFreq.setBoost(getBoost());
+      return lowFreq;
+    }
+
+    // Both parts present: low-frequency part is required, high-frequency part only scores.
+    final BooleanQuery combined = new BooleanQuery(true);
+    combined.add(highFreq, Occur.SHOULD);
+    combined.add(lowFreq, Occur.MUST);
+    combined.setBoost(getBoost());
+    return combined;
+  }
+
+  /**
+   * Looks up every query term in every leaf and records the per-leaf term
+   * state and statistics in {@code contextArray}.
+   * <p>
+   * The slot for a term stays untouched (normally {@code null}) when the term
+   * is found in no leaf.
+   *
+   * @param reader the top-level reader the {@link TermContext}s are created for
+   * @param leaves the leaves to inspect
+   * @param contextArray output array, index-aligned with {@code queryTerms}
+   * @param queryTerms the terms to look up
+   * @throws IOException if reading the terms of a leaf fails
+   */
+  public void collectTermContext(IndexReader reader,
+      List<AtomicReaderContext> leaves, TermContext[] contextArray,
+      Term[] queryTerms) throws IOException {
+    // Handed back to Terms#iterator on every lookup so the enum can be reused.
+    TermsEnum termsEnum = null;
+    for (AtomicReaderContext context : leaves) {
+      final Fields fields = context.reader().fields();
+      if (fields == null) {
+        // reader has no fields
+        continue;
+      }
+      for (int i = 0; i < queryTerms.length; i++) {
+        final Term term = queryTerms[i];
+        final Terms fieldTerms = fields.terms(term.field());
+        if (fieldTerms == null) {
+          // field does not exist
+          continue;
+        }
+        termsEnum = fieldTerms.iterator(termsEnum);
+        assert termsEnum != null;
+
+        if (termsEnum == TermsEnum.EMPTY) {
+          continue;
+        }
+        if (termsEnum.seekExact(term.bytes(), false)) {
+          final TermContext termContext = contextArray[i];
+          if (termContext == null) {
+            // first leaf containing the term: create the context
+            contextArray[i] = new TermContext(reader.getContext(),
+                termsEnum.termState(), context.ord, termsEnum.docFreq(),
+                termsEnum.totalTermFreq());
+          } else {
+            // later leaf: add this leaf's state and statistics to the existing context
+            termContext.register(termsEnum.termState(), context.ord,
+                termsEnum.docFreq(), termsEnum.totalTermFreq());
+          }
+        }
+      }
+    }
+  }
+
+ /**
+ * Reports whether {@link Similarity#coord(int,int)} is disabled in scoring
+ * for the high and low frequency sub-queries. The top level query that
+ * combines both sub-queries is always created with coords disabled,
+ * regardless of this flag.
+ *
+ * @return the {@code disableCoord} value this query was constructed with
+ */
+ public boolean isCoordDisabled() {
+ return disableCoord;
+ }
+
+ /**
+ * Specifies how many of the optional BooleanClauses must be satisfied in
+ * order to produce a match on the low frequency terms query part.
+ * <p>
+ * A value of 1 or more is taken as an absolute number of clauses. A value
+ * below 1 is taken as a fraction of the number of optional low frequency
+ * clauses. In both cases the result is truncated to an int. The value is
+ * only applied when the low frequency terms are added as
+ * {@link Occur#SHOULD} clauses.
+ * <p>
+ * By default (0) no optional clauses are necessary for a match (unless
+ * there are no required clauses).
+ *
+ * @param min
+ * the absolute number (>= 1) or fraction (< 1) of optional clauses
+ * that must match
+ */
+ //CHANGE accepts now a float
+ public void setMinimumNumberShouldMatch(float min) {
+ this.minNrShouldMatch = min;
+ }
+
+ /**
+ * Gets the configured minimum for the optional BooleanClauses which must be
+ * satisfied, exactly as passed to {@link #setMinimumNumberShouldMatch(float)}.
+ *
+ * @return the raw value; an absolute number when 1 or more, otherwise a
+ * fraction of the optional clauses
+ */
+ //CHANGE returns now a float
+ public float getMinimumNumberShouldMatch() {
+ return minNrShouldMatch;
+ }
+
+  /**
+   * Adds every term of this query to the given set.
+   *
+   * @param terms the set receiving the terms
+   */
+  @Override
+  public void extractTerms(Set<Term> terms) {
+    // "terms" is the parameter; "this.terms" is the list of terms of this query
+    terms.addAll(this.terms);
+  }
+
+  /**
+   * Renders the terms as a comma separated list. The list is wrapped in
+   * parentheses when a boost other than 1 or a positive minimum-should-match
+   * value is set; those are appended as {@code ~min} and the boost suffix.
+   *
+   * @param field not used by this implementation
+   */
+  @Override
+  public String toString(String field) {
+    final float boost = getBoost();
+    final float minMatch = getMinimumNumberShouldMatch();
+    final boolean boosted = boost != 1.0f;
+    final boolean hasMinMatch = minMatch > 0;
+    final boolean wrap = boosted || hasMinMatch;
+
+    final StringBuilder out = new StringBuilder();
+    if (wrap) {
+      out.append("(");
+    }
+    String separator = "";
+    for (Term term : terms) {
+      out.append(separator).append(new TermQuery(term).toString());
+      separator = ", ";
+    }
+    if (wrap) {
+      out.append(")");
+    }
+    if (hasMinMatch) {
+      out.append('~').append(minMatch);
+    }
+    if (boosted) {
+      out.append(ToStringUtils.boost(boost));
+    }
+    return out.toString();
+  }
+
+  /**
+   * Combines the superclass hash with all configuration fields and the term
+   * list, using the usual multiply-by-31 scheme. Float fields contribute
+   * their {@link Float#floatToIntBits(float)} value.
+   */
+  @Override
+  public int hashCode() {
+    int result = super.hashCode();
+    // Order matters: it defines the resulting hash value.
+    final int[] components = {
+        disableCoord ? 1231 : 1237,
+        Float.floatToIntBits(highFreqBoost),
+        highFreqOccur == null ? 0 : highFreqOccur.hashCode(),
+        Float.floatToIntBits(lowFreqBoost),
+        lowFreqOccur == null ? 0 : lowFreqOccur.hashCode(),
+        Float.floatToIntBits(maxTermFrequency),
+        Float.floatToIntBits(minNrShouldMatch),
+        terms == null ? 0 : terms.hashCode()
+    };
+    for (int component : components) {
+      result = 31 * result + component;
+    }
+    return result;
+  }
+
+  /**
+   * Compares this query with another object. Both are equal when the
+   * superclass considers them equal, they have the same runtime class, and
+   * all configuration fields and the term lists match.
+   * <p>
+   * Every float field, including {@code minNrShouldMatch}, is compared via
+   * {@link Float#floatToIntBits(float)}. {@link #hashCode()} hashes the same
+   * bit patterns, so equal queries always have equal hash codes (a plain
+   * {@code ==} would treat 0.0f and -0.0f as equal although they hash
+   * differently).
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!super.equals(obj)) {
+      return false;
+    }
+    if (getClass() != obj.getClass()) {
+      return false;
+    }
+    XCommonTermsQuery other = (XCommonTermsQuery) obj;
+    if (disableCoord != other.disableCoord) {
+      return false;
+    }
+    if (Float.floatToIntBits(highFreqBoost) != Float.floatToIntBits(other.highFreqBoost)) {
+      return false;
+    }
+    if (highFreqOccur != other.highFreqOccur) {
+      return false;
+    }
+    if (Float.floatToIntBits(lowFreqBoost) != Float.floatToIntBits(other.lowFreqBoost)) {
+      return false;
+    }
+    if (lowFreqOccur != other.lowFreqOccur) {
+      return false;
+    }
+    if (Float.floatToIntBits(maxTermFrequency) != Float.floatToIntBits(other.maxTermFrequency)) {
+      return false;
+    }
+    if (Float.floatToIntBits(minNrShouldMatch) != Float.floatToIntBits(other.minNrShouldMatch)) {
+      return false;
+    }
+    if (terms == null) {
+      return other.terms == null;
+    }
+    return terms.equals(other.terms);
+  }
+
+  /**
+   * Returns the terms of this query in the order they were added.
+   *
+   * @return the internal term list itself, not a copy; changes made to it
+   *         change this query
+   */
+  //CHANGE added
+  public List<Term> terms() {
+    return this.terms;
+  }
+
+}
diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java b/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java
index d263897347c..941f1a8f45b 100644
--- a/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java
+++ b/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java
@@ -20,6 +20,7 @@
package org.apache.lucene.search.vectorhighlight;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.queries.ExtendedCommonTermsQuery;
import org.apache.lucene.queries.FilterClause;
import org.apache.lucene.search.*;
import org.apache.lucene.search.spans.SpanTermQuery;
@@ -97,9 +98,11 @@ public class CustomFieldQuery extends FieldQuery {
}
} else if (sourceQuery instanceof FiltersFunctionScoreQuery) {
flatten(((FiltersFunctionScoreQuery) sourceQuery).getSubQuery(), reader, flatQueries);
+ } else if (sourceQuery instanceof ExtendedCommonTermsQuery) {
+ flatten(((ExtendedCommonTermsQuery)sourceQuery).rewrite(reader), reader, flatQueries);
} else {
super.flatten(sourceQuery, reader, flatQueries);
- }
+ }
}
void flatten(Filter sourceFilter, IndexReader reader, Collection flatQueries) throws IOException {
diff --git a/src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java b/src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java
index efc1ffa1231..565d161d297 100644
--- a/src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java
+++ b/src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java
@@ -19,6 +19,8 @@
package org.elasticsearch.index.query;
+import static org.elasticsearch.index.query.support.QueryParsers.wrapSmartNameQuery;
+
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
@@ -26,6 +28,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
+import org.apache.lucene.queries.ExtendedCommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.Query;
@@ -143,20 +146,15 @@ public class CommonTermsQueryParser implements QueryParser {
if (value == null) {
throw new QueryParsingException(parseContext.index(), "No text specified for text query");
}
- CommonTermsQuery query = new CommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords);
- int numTerms = parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer);
- if (numTerms == 0) {
- return null;
- }
- if (minimumShouldMatch != null) {
- query.setMinimumNumberShouldMatch(Queries.calculateMinShouldMatch(numTerms, minimumShouldMatch));
- }
+ ExtendedCommonTermsQuery query = new ExtendedCommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords);
query.setBoost(boost);
- return query;
+ return parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer, minimumShouldMatch);
}
+
- private final int parseQueryString(CommonTermsQuery query, String queryString, String fieldName, QueryParseContext parseContext,
- String queryAnalyzer) throws IOException {
+ private final Query parseQueryString(ExtendedCommonTermsQuery query, String queryString, String fieldName, QueryParseContext parseContext,
+ String queryAnalyzer, String minimumShouldMatch) throws IOException {
+
FieldMapper> mapper = null;
String field;
MapperService.SmartNameFieldMappers smartNameFieldMappers = parseContext.smartFieldMappers(fieldName);
@@ -197,7 +195,11 @@ public class CommonTermsQueryParser implements QueryParser {
query.add(new Term(field, ref));
count++;
}
- return count;
-
+
+ if (count == 0) {
+ return null;
+ }
+ query.setMinimumNumberShouldMatch(minimumShouldMatch);
+ return wrapSmartNameQuery(query, smartNameFieldMappers, parseContext);
}
}
\ No newline at end of file
diff --git a/src/main/java/org/elasticsearch/index/query/MatchQueryBuilder.java b/src/main/java/org/elasticsearch/index/query/MatchQueryBuilder.java
index c46cb880c61..809044d6221 100644
--- a/src/main/java/org/elasticsearch/index/query/MatchQueryBuilder.java
+++ b/src/main/java/org/elasticsearch/index/query/MatchQueryBuilder.java
@@ -86,6 +86,8 @@ public class MatchQueryBuilder extends BaseQueryBuilder implements BoostableQuer
private Boolean fuzzyTranspositions = null;
private ZeroTermsQuery zeroTermsQuery;
+
+ private Float cutoff_Frequency = null;
/**
* Constructs a new text query.
@@ -157,6 +159,16 @@ public class MatchQueryBuilder extends BaseQueryBuilder implements BoostableQuer
this.maxExpansions = maxExpansions;
return this;
}
+
+ /**
+ * Set a cutoff value in [0..1] (or absolute number >=1) representing the
+ * maximum threshold of a term's document frequency to be considered a low
+ * frequency term. When set, the value is written out as the
+ * {@code cutoff_frequency} field of the query.
+ *
+ * @param cutoff the cutoff frequency
+ * @return this builder
+ */
+ public MatchQueryBuilder cutoffFrequency(float cutoff) {
+ this.cutoff_Frequency = cutoff;
+ return this;
+ }
public MatchQueryBuilder minimumShouldMatch(String minimumShouldMatch) {
this.minimumShouldMatch = minimumShouldMatch;
@@ -241,6 +253,10 @@ public class MatchQueryBuilder extends BaseQueryBuilder implements BoostableQuer
if (zeroTermsQuery != null) {
builder.field("zero_terms_query", zeroTermsQuery.toString());
}
+ if (cutoff_Frequency != null) {
+ builder.field("cutoff_frequency", cutoff_Frequency);
+ }
+
builder.endObject();
builder.endObject();
diff --git a/src/main/java/org/elasticsearch/index/query/MatchQueryParser.java b/src/main/java/org/elasticsearch/index/query/MatchQueryParser.java
index 6af62596fea..4cf53028233 100644
--- a/src/main/java/org/elasticsearch/index/query/MatchQueryParser.java
+++ b/src/main/java/org/elasticsearch/index/query/MatchQueryParser.java
@@ -19,6 +19,8 @@
package org.elasticsearch.index.query;
+import org.apache.lucene.queries.CommonTermsQuery;
+import org.apache.lucene.queries.ExtendedCommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
@@ -126,6 +128,8 @@ public class MatchQueryParser implements QueryParser {
matchQuery.setTranspositions(parser.booleanValue());
} else if ("lenient".equals(currentFieldName)) {
matchQuery.setLenient(parser.booleanValue());
+ } else if ("cutoff_frequency".equals(currentFieldName)) {
+ matchQuery.setCommonTermsCutoff(parser.floatValue());
} else if ("zero_terms_query".equals(currentFieldName)) {
String zeroTermsDocs = parser.text();
if ("none".equalsIgnoreCase(zeroTermsDocs)) {
@@ -161,8 +165,9 @@ public class MatchQueryParser implements QueryParser {
if (query instanceof BooleanQuery) {
Queries.applyMinimumShouldMatch((BooleanQuery) query, minimumShouldMatch);
+ } else if (query instanceof ExtendedCommonTermsQuery) {
+ ((ExtendedCommonTermsQuery)query).setMinimumNumberShouldMatch(minimumShouldMatch);
}
-
query.setBoost(boost);
return query;
}
diff --git a/src/main/java/org/elasticsearch/index/query/MultiMatchQueryBuilder.java b/src/main/java/org/elasticsearch/index/query/MultiMatchQueryBuilder.java
index 67bf93d9888..289bda22dcb 100644
--- a/src/main/java/org/elasticsearch/index/query/MultiMatchQueryBuilder.java
+++ b/src/main/java/org/elasticsearch/index/query/MultiMatchQueryBuilder.java
@@ -67,6 +67,8 @@ public class MultiMatchQueryBuilder extends BaseQueryBuilder implements Boostabl
private Boolean lenient;
+ private Float cutoffFrequency = null;
+
/**
* Constructs a new text query.
*/
@@ -191,6 +193,17 @@ public class MultiMatchQueryBuilder extends BaseQueryBuilder implements Boostabl
this.lenient = lenient;
return this;
}
+
+
+ /**
+ * Set a cutoff value in [0..1] (or absolute number >=1) representing the
+ * maximum threshold of a term's document frequency to be considered a low
+ * frequency term. When set, the value is written out as the
+ * {@code cutoff_frequency} field of the query.
+ *
+ * @param cutoff the cutoff frequency
+ * @return this builder
+ */
+ public MultiMatchQueryBuilder cutoffFrequency(float cutoff) {
+ this.cutoffFrequency = cutoff;
+ return this;
+ }
@Override
public void doXContent(XContentBuilder builder, Params params) throws IOException {
@@ -255,6 +268,10 @@ public class MultiMatchQueryBuilder extends BaseQueryBuilder implements Boostabl
if (lenient != null) {
builder.field("lenient", lenient);
}
+
+ if (cutoffFrequency != null) {
+ builder.field("cutoff_frequency", cutoffFrequency);
+ }
builder.endObject();
}
diff --git a/src/main/java/org/elasticsearch/index/query/MultiMatchQueryParser.java b/src/main/java/org/elasticsearch/index/query/MultiMatchQueryParser.java
index ca8466fc41a..4a1ba050958 100644
--- a/src/main/java/org/elasticsearch/index/query/MultiMatchQueryParser.java
+++ b/src/main/java/org/elasticsearch/index/query/MultiMatchQueryParser.java
@@ -145,6 +145,8 @@ public class MultiMatchQueryParser implements QueryParser {
multiMatchQuery.setUseDisMax(parser.booleanValue());
} else if ("tie_breaker".equals(currentFieldName) || "tieBreaker".equals(currentFieldName)) {
multiMatchQuery.setTieBreaker(parser.floatValue());
+ } else if ("cutoff_frequency".equals(currentFieldName)) {
+ multiMatchQuery.setCommonTermsCutoff(parser.floatValue());
} else if ("lenient".equals(currentFieldName)) {
multiMatchQuery.setLenient(parser.booleanValue());
} else {
diff --git a/src/main/java/org/elasticsearch/index/search/MatchQuery.java b/src/main/java/org/elasticsearch/index/search/MatchQuery.java
index d057a95deb4..efa7eab3f3a 100644
--- a/src/main/java/org/elasticsearch/index/search/MatchQuery.java
+++ b/src/main/java/org/elasticsearch/index/search/MatchQuery.java
@@ -25,6 +25,8 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.CommonTermsQuery;
+import org.apache.lucene.queries.ExtendedCommonTermsQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
@@ -70,19 +72,24 @@ public class MatchQuery {
protected int phraseSlop = 0;
protected String fuzziness = null;
+
protected int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
+
protected int maxExpansions = FuzzyQuery.defaultMaxExpansions;
+
//LUCENE 4 UPGRADE we need a default value for this!
protected boolean transpositions = false;
-
protected MultiTermQuery.RewriteMethod rewriteMethod;
+
protected MultiTermQuery.RewriteMethod fuzzyRewriteMethod;
protected boolean lenient;
protected ZeroTermsQuery zeroTermsQuery = ZeroTermsQuery.NONE;
-
+
+ protected Float commonTermsCutoff = null;
+
public MatchQuery(QueryParseContext parseContext) {
this.parseContext = parseContext;
}
@@ -94,6 +101,10 @@ public class MatchQuery {
public void setOccur(BooleanClause.Occur occur) {
this.occur = occur;
}
+
+    /**
+     * Sets the cutoff frequency for common terms. Once a cutoff is set, the
+     * multi-token term branch builds an {@link ExtendedCommonTermsQuery}
+     * instead of a {@link BooleanQuery}.
+     *
+     * @param cutoff the maximum term frequency passed to the common terms query
+     */
+    public void setCommonTermsCutoff(float cutoff) {
+        // boxed on assignment; the field stays null until a cutoff is set
+        this.commonTermsCutoff = cutoff;
+    }
public void setEnablePositionIncrements(boolean enablePositionIncrements) {
this.enablePositionIncrements = enablePositionIncrements;
@@ -221,19 +232,27 @@ public class MatchQuery {
if (numTokens == 1) {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
- //LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8
- final Query q = newTermQuery(mapper, new Term(field, termToByteRef(termAtt, new BytesRef())));
+ final Query q = newTermQuery(mapper, new Term(field, termToByteRef(termAtt)));
return wrapSmartNameQuery(q, smartNameFieldMappers, parseContext);
}
- BooleanQuery q = new BooleanQuery(positionCount == 1);
- for (int i = 0; i < numTokens; i++) {
- boolean hasNext = buffer.incrementToken();
- assert hasNext == true;
- //LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8
- final Query currentQuery = newTermQuery(mapper, new Term(field, termToByteRef(termAtt, new BytesRef())));
- q.add(currentQuery, occur);
+ if (commonTermsCutoff != null) {
+ ExtendedCommonTermsQuery q = new ExtendedCommonTermsQuery(occur, occur, commonTermsCutoff, positionCount == 1);
+ for (int i = 0; i < numTokens; i++) {
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ q.add(new Term(field, termToByteRef(termAtt)));
+ }
+ return wrapSmartNameQuery(q, smartNameFieldMappers, parseContext);
+ } else {
+ BooleanQuery q = new BooleanQuery(positionCount == 1);
+ for (int i = 0; i < numTokens; i++) {
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ final Query currentQuery = newTermQuery(mapper, new Term(field, termToByteRef(termAtt)));
+ q.add(currentQuery, occur);
+ }
+ return wrapSmartNameQuery(q, smartNameFieldMappers, parseContext);
}
- return wrapSmartNameQuery(q, smartNameFieldMappers, parseContext);
} else if (type == Type.PHRASE) {
if (severalTokensAtSamePosition) {
final MultiPhraseQuery mpq = new MultiPhraseQuery();
@@ -256,7 +275,7 @@ public class MatchQuery {
}
position += positionIncrement;
//LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8
- multiTerms.add(new Term(field, termToByteRef(termAtt, new BytesRef())));
+ multiTerms.add(new Term(field, termToByteRef(termAtt)));
}
if (enablePositionIncrements) {
mpq.add(multiTerms.toArray(new Term[multiTerms.size()]), position);
@@ -277,9 +296,9 @@ public class MatchQuery {
if (enablePositionIncrements) {
position += positionIncrement;
//LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8
- pq.add(new Term(field, termToByteRef(termAtt, new BytesRef())), position);
+ pq.add(new Term(field, termToByteRef(termAtt)), position);
} else {
- pq.add(new Term(field, termToByteRef(termAtt, new BytesRef())));
+ pq.add(new Term(field, termToByteRef(termAtt)));
}
}
return wrapSmartNameQuery(pq, smartNameFieldMappers, parseContext);
@@ -305,8 +324,7 @@ public class MatchQuery {
multiTerms.clear();
}
position += positionIncrement;
- //LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8
- multiTerms.add(new Term(field, termToByteRef(termAtt, new BytesRef())));
+ multiTerms.add(new Term(field, termToByteRef(termAtt)));
}
if (enablePositionIncrements) {
mpq.add(multiTerms.toArray(new Term[multiTerms.size()]), position);
@@ -343,8 +361,9 @@ public class MatchQuery {
}
return new TermQuery(term);
}
-
- private static BytesRef termToByteRef(CharTermAttribute attr, BytesRef ref) {
+
+    /**
+     * Converts the attribute's current term text from UTF-16 to UTF-8.
+     *
+     * @param attr the term attribute to read
+     * @return a newly allocated {@link BytesRef} holding the UTF-8 bytes
+     */
+    private static BytesRef termToByteRef(CharTermAttribute attr) {
+        final BytesRef utf8 = new BytesRef();
+        UnicodeUtil.UTF16toUTF8(attr.buffer(), 0, attr.length(), utf8);
+        return utf8;
+    }
diff --git a/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java b/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java
index 4a9edd6a5d2..c118893ec5a 100644
--- a/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java
+++ b/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java
@@ -344,7 +344,7 @@ public class SearchServiceTransportAction extends AbstractComponent {
try {
FetchSearchResult result = searchService.executeFetchPhase(request);
listener.onResult(result);
- } catch (Exception e) {
+ } catch (Throwable e) {
listener.onFailure(e);
}
} else {
@@ -378,7 +378,7 @@ public class SearchServiceTransportAction extends AbstractComponent {
try {
QuerySearchResult result = searchService.executeScan(request);
listener.onResult(result);
- } catch (Exception e) {
+ } catch (Throwable e) {
listener.onFailure(e);
}
} else {
diff --git a/src/main/java/org/elasticsearch/search/highlight/CustomQueryScorer.java b/src/main/java/org/elasticsearch/search/highlight/CustomQueryScorer.java
index 24b71a692bb..44a7ce11097 100644
--- a/src/main/java/org/elasticsearch/search/highlight/CustomQueryScorer.java
+++ b/src/main/java/org/elasticsearch/search/highlight/CustomQueryScorer.java
@@ -20,9 +20,14 @@
package org.elasticsearch.search.highlight;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.XCommonTermsQuery;
+import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.WeightedSpanTerm;
import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
@@ -31,6 +36,7 @@ import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery
import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
import java.io.IOException;
+import java.util.List;
import java.util.Map;
public final class CustomQueryScorer extends QueryScorer {
@@ -97,6 +103,14 @@ public final class CustomQueryScorer extends QueryScorer {
} else if (query instanceof XFilteredQuery) {
query = ((XFilteredQuery) query).getQuery();
extract(query, terms);
+ } else if (query instanceof XCommonTermsQuery) {
+     // Re-express the common-terms query as a SHOULD-only BooleanQuery and extract from that instead.
+     final List<Term> ctqTerms = ((XCommonTermsQuery) query).terms();
+     final BooleanQuery bq = new BooleanQuery();
+     for (Term term : ctqTerms) {
+         bq.add(new TermQuery(term), Occur.SHOULD);
+     }
+     extract(bq, terms);
}
}
diff --git a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java
index a06a43649e1..24edaef385b 100644
--- a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java
+++ b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java
@@ -947,6 +947,59 @@ public class HighlighterSearchTests extends AbstractNodesTests {
assertThat(response.hits().hits()[0].highlightFields().get("tags").fragments()[0].string(), equalTo("this is a really long tag i would like to highlight"));
assertThat(response.hits().hits()[0].highlightFields().get("tags").fragments()[1].string(), equalTo("here is another one that is very long and has the tag token near the end"));
}
+
+ /** Highlights a common-terms query against an index created without an explicit mapping. */
+ @Test
+ public void testCommonTermsQuery() {
+     try {
+         client.admin().indices().prepareDelete("test").execute().actionGet();
+     } catch (IndexMissingException ignored) {
+         // it's ok: there was no leftover index to delete
+     }
+     client.admin().indices().prepareCreate("test").execute().actionGet();
+     client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().execute().actionGet();
+     // Index a single document and refresh so the search below can see it.
+     client.prepareIndex("test", "type1").setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog")
+             .setRefresh(true).execute().actionGet();
+
+     logger.info("--> highlighting and searching on field2");
+     SearchSourceBuilder source = searchSource()
+             .query(commonTerms("field2", "quick brown").cutoffFrequency(100))
+             .from(0).size(60).explain(true)
+             .highlight(highlight().field("field2").order("score").preTags("<x>").postTags("</x>"));
+
+     SearchResponse searchResponse = client.search(searchRequest("test").source(source).searchType(QUERY_THEN_FETCH).scroll(timeValueMinutes(10))).actionGet();
+     assertThat("Failures " + Arrays.toString(searchResponse.shardFailures()), searchResponse.shardFailures().length, equalTo(0));
+     assertThat(searchResponse.hits().totalHits(), equalTo(1L));
+     // Visible tags let the assertion show which terms were highlighted; empty tags would match the raw text.
+     assertThat(searchResponse.hits().getAt(0).highlightFields().get("field2").fragments()[0].string(), equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));
+ }
+
+ /** Highlights a common-terms query on an index whose type1 mapping comes from type1TermVectorMapping(). */
+ @Test
+ public void testCommonTermsTermVector() throws ElasticSearchException, IOException {
+     try {
+         client.admin().indices().prepareDelete("test").execute().actionGet();
+     } catch (Exception ignored) {
+         // ignore: best-effort cleanup of a leftover index
+     }
+     client.admin().indices().prepareCreate("test").addMapping("type1", type1TermVectorMapping()).execute().actionGet();
+     client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().execute().actionGet();
+     client.prepareIndex("test", "type1").setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog")
+             .setRefresh(true).execute().actionGet();
+
+     logger.info("--> highlighting and searching on field2");
+     SearchSourceBuilder source = searchSource().query(commonTerms("field2", "quick brown").cutoffFrequency(100)).from(0).size(60)
+             .explain(true).highlight(highlight().field("field2").order("score").preTags("<x>").postTags("</x>"));
+
+     SearchResponse searchResponse = client.search(
+             searchRequest("test").source(source).searchType(QUERY_THEN_FETCH).scroll(timeValueMinutes(10))).actionGet();
+     assertThat("Failures " + Arrays.toString(searchResponse.shardFailures()), searchResponse.shardFailures().length, equalTo(0));
+     assertThat(searchResponse.hits().totalHits(), equalTo(1L));
+     // Visible tags let the assertion show which terms were highlighted; empty tags would match the raw text.
+     assertThat(searchResponse.hits().getAt(0).highlightFields().get("field2").fragments()[0].string(),
+             equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));
+ }
@Test
public void testPlainHighlightDifferentFragmenter() throws Exception {
diff --git a/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java b/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java
index f0fe9fde574..26aecb35756 100644
--- a/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java
+++ b/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java
@@ -120,7 +120,7 @@ public class SimpleQueryTests extends AbstractNodesTests {
client.prepareIndex("test", "type1", "1").setSource("field1", "the quick brown fox").execute().actionGet();
client.prepareIndex("test", "type1", "2").setSource("field1", "the quick lazy huge brown fox jumps over the tree").execute().actionGet();
- client.prepareIndex("test", "type1", "3").setSource("field1", "quick lazy huge brown").setRefresh(true).execute().actionGet();
+ client.prepareIndex("test", "type1", "3").setSource("field1", "quick lazy huge brown", "field2", "the quick lazy huge brown fox jumps over the tree").setRefresh(true).execute().actionGet();
SearchResponse searchResponse = client.prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3)).execute().actionGet();
assertThat(searchResponse.hits().totalHits(), equalTo(2l));
@@ -140,6 +140,32 @@ public class SimpleQueryTests extends AbstractNodesTests {
assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("3"));
assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("2"));
+
+ // try the same with match query
+ searchResponse = client.prepareSearch().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND)).execute().actionGet();
+ assertThat(searchResponse.hits().totalHits(), equalTo(2l));
+ assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
+ assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
+
+ searchResponse = client.prepareSearch().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.OR)).execute().actionGet();
+ assertThat(searchResponse.hits().totalHits(), equalTo(3l));
+ assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
+ assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
+ assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("3"));
+
+ searchResponse = client.prepareSearch().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND).analyzer("standard")).execute().actionGet();
+ assertThat(searchResponse.hits().totalHits(), equalTo(3l));
+ // standard drops "the" since it's a stopword
+ assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
+ assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("3"));
+ assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("2"));
+
+ // try the same with multi match query
+ searchResponse = client.prepareSearch().setQuery(QueryBuilders.multiMatchQuery("the quick brown", "field1", "field2").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND)).execute().actionGet();
+ assertThat(searchResponse.hits().totalHits(), equalTo(3l));
+ assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("3")); // better score due to different query stats
+ assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("1"));
+ assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("2"));
}
@Test