Expose CommonTermsQuery in Match & MultiMatch and enable highlighting

Closes #2591
2013-01-25 17:44:38 +01:00 · 2013-01-25 17:44:38 +01:00 · 48488f707f
parent bfdf8fe590
commit 48488f707f
14 changed files with 627 additions and 38 deletions
--- a/src/main/java/org/apache/lucene/index/memory/ReusableMemoryIndex.java
+++ b/src/main/java/org/apache/lucene/index/memory/ReusableMemoryIndex.java
@ -1,6 +1,5 @@
 package org.apache.lucene.index.memory;

-import org.apache.lucene.search.IndexSearcher;
 /*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
--- a/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java
+++ b/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java
@ -0,0 +1,52 @@
+package org.apache.lucene.queries;
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.elasticsearch.common.lucene.search.Queries;
+
+/**
+ * Extended version of {@link CommonTermsQuery} that allows to pass in a 
+ * <tt>minimumNumberShouldMatch</tt> specification that uses the actual num of high frequent terms
+ * to calculate the minimum matching terms.
+ */
+public class ExtendedCommonTermsQuery extends XCommonTermsQuery {
+
+    public ExtendedCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency, boolean disableCoord) {
+        super(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoord);
+    }
+
+    public ExtendedCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency) {
+        super(highFreqOccur, lowFreqOccur, maxTermFrequency);
+    }
+    
+    private String minNumShouldMatchSpec;
+
+    @Override
+    protected int getMinimumNumberShouldMatch(int numOptional) {
+        if (minNumShouldMatchSpec == null) {
+            return 0;
+        }
+        return Queries.calculateMinShouldMatch(numOptional, minNumShouldMatchSpec);
+    }
+    
+    public void setMinimumNumberShouldMatch(String spec) {
+        this.minNumShouldMatchSpec = spec;
+    }
+
+}
--- a/src/main/java/org/apache/lucene/queries/XCommonTermsQuery.java
+++ b/src/main/java/org/apache/lucene/queries/XCommonTermsQuery.java
@ -0,0 +1,381 @@
+package org.apache.lucene.queries;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.util.ToStringUtils;
+
+/**
+ * A query that executes high-frequency terms in a optional sub-query to prevent
+ * slow queries due to "common" terms like stopwords. This query basically
+ * builds 2 queries off the {@link #add(Term) added} terms where low-frequency
+ * terms are added to a required boolean clause and high-frequency terms are
+ * added to an optional boolean clause. The optional clause is only executed if
+ * the required "low-frequency' clause matches. Scores produced by this query
+ * will be slightly different to plain {@link BooleanQuery} scorer mainly due to
+ * differences in the {@link Similarity#coord(int,int) number of leave queries}
+ * in the required boolean clause. In the most cases high-frequency terms are
+ * unlikely to significantly contribute to the document score unless at least
+ * one of the low-frequency terms are matched such that this query can improve
+ * query execution times significantly if applicable.
+ * <p>
+ * {@link XCommonTermsQuery} has several advantages over stopword filtering at
+ * index or query time since a term can be "classified" based on the actual
+ * document frequency in the index and can prevent slow queries even across
+ * domains without specialized stopword files.
+ * </p>
+ * <p>
+ * <b>Note:</b> if the query only contains high-frequency terms the query is
+ * rewritten into a plain conjunction query ie. all high-frequency terms need to
+ * match in order to match a document.
+ * </p>
+ */
+//LUCENE MONITOR - Copied from CommonTermsQuery changes are tracked with //CHANGE
+public class XCommonTermsQuery extends Query {
+  /*
+   * TODO maybe it would make sense to abstract this even further and allow to
+   * rewrite to dismax rather than boolean. Yet, this can already be subclassed
+   * to do so.
+   */
+  protected final List<Term> terms = new ArrayList<Term>();
+  protected final boolean disableCoord;
+  protected final float maxTermFrequency;
+  protected final Occur lowFreqOccur;
+  protected final Occur highFreqOccur;
+  protected float lowFreqBoost = 1.0f;
+  protected float highFreqBoost = 1.0f;
+  //CHANGE made minNr... a float for fractions
+  protected float minNrShouldMatch = 0;
+  
+  /**
+   * Creates a new {@link XCommonTermsQuery}
+   * 
+   * @param highFreqOccur
+   *          {@link Occur} used for high frequency terms
+   * @param lowFreqOccur
+   *          {@link Occur} used for low frequency terms
+   * @param maxTermFrequency
+   *          a value in [0..1] (or absolute number >=1) representing the
+   *          maximum threshold of a terms document frequency to be considered a
+   *          low frequency term.
+   * @throws IllegalArgumentException
+   *           if {@link Occur#MUST_NOT} is pass as lowFreqOccur or
+   *           highFreqOccur
+   */
+  public XCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur,
+      float maxTermFrequency) {
+    this(highFreqOccur, lowFreqOccur, maxTermFrequency, false);
+  }
+  
+  /**
+   * Creates a new {@link XCommonTermsQuery}
+   * 
+   * @param highFreqOccur
+   *          {@link Occur} used for high frequency terms
+   * @param lowFreqOccur
+   *          {@link Occur} used for low frequency terms
+   * @param maxTermFrequency
+   *          a value in [0..1] (or absolute number >=1) representing the
+   *          maximum threshold of a terms document frequency to be considered a
+   *          low frequency term.
+   * @param disableCoord
+   *          disables {@link Similarity#coord(int,int)} in scoring for the low
+   *          / high frequency sub-queries
+   * @throws IllegalArgumentException
+   *           if {@link Occur#MUST_NOT} is pass as lowFreqOccur or
+   *           highFreqOccur
+   */
+  public XCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur,
+      float maxTermFrequency, boolean disableCoord) {
+    if (highFreqOccur == Occur.MUST_NOT) {
+      throw new IllegalArgumentException(
+          "highFreqOccur should be MUST or SHOULD but was MUST_NOT");
+    }
+    if (lowFreqOccur == Occur.MUST_NOT) {
+      throw new IllegalArgumentException(
+          "lowFreqOccur should be MUST or SHOULD but was MUST_NOT");
+    }
+    this.disableCoord = disableCoord;
+    this.highFreqOccur = highFreqOccur;
+    this.lowFreqOccur = lowFreqOccur;
+    this.maxTermFrequency = maxTermFrequency;
+  }
+  
+  /**
+   * Adds a term to the {@link CommonTermsQuery}
+   * 
+   * @param term
+   *          the term to add
+   */
+  public void add(Term term) {
+    if (term == null) {
+      throw new IllegalArgumentException("Term must not be null");
+    }
+    this.terms.add(term);
+  }
+  
+  @Override
+  public Query rewrite(IndexReader reader) throws IOException {
+    if (this.terms.isEmpty()) {
+      return new BooleanQuery();
+    } else if (this.terms.size() == 1) {
+      final TermQuery tq = new TermQuery(this.terms.get(0));
+      tq.setBoost(getBoost());
+      return tq;
+    }
+    final List<AtomicReaderContext> leaves = reader.leaves();
+    final int maxDoc = reader.maxDoc();
+    final TermContext[] contextArray = new TermContext[terms.size()];
+    final Term[] queryTerms = this.terms.toArray(new Term[0]);
+    collectTermContext(reader, leaves, contextArray, queryTerms);
+    return buildQuery(maxDoc, contextArray, queryTerms);
+  }
+  
+  //CHANGE added to get num optional
+  protected int getMinimumNumberShouldMatch(int numOptional) {
+      if (minNrShouldMatch >= 1.0f) {
+          return (int) minNrShouldMatch;
+      }
+      return (int) (minNrShouldMatch * numOptional);
+  }
+  
+  protected Query buildQuery(final int maxDoc,
+      final TermContext[] contextArray, final Term[] queryTerms) {
+    BooleanQuery lowFreq = new BooleanQuery(disableCoord);
+    BooleanQuery highFreq = new BooleanQuery(disableCoord);
+    highFreq.setBoost(highFreqBoost);
+    lowFreq.setBoost(lowFreqBoost);
+    
+    BooleanQuery query = new BooleanQuery(true);
+    
+    for (int i = 0; i < queryTerms.length; i++) {
+      TermContext termContext = contextArray[i];
+      if (termContext == null) {
+        lowFreq.add(new TermQuery(queryTerms[i]), lowFreqOccur);
+      } else {
+        if ((maxTermFrequency >= 1f && termContext.docFreq() > maxTermFrequency)
+            || (termContext.docFreq() > (int) Math.ceil(maxTermFrequency
+                * (float) maxDoc))) {
+          highFreq
+              .add(new TermQuery(queryTerms[i], termContext), highFreqOccur);
+        } else {
+          lowFreq.add(new TermQuery(queryTerms[i], termContext), lowFreqOccur);
+        }
+      }
+      
+    }
+    if (lowFreqOccur == Occur.SHOULD) {
+        lowFreq.setMinimumNumberShouldMatch(getMinimumNumberShouldMatch(lowFreq.clauses().size()));
+    }
+    if (lowFreq.clauses().isEmpty()) {
+      /*
+       * if lowFreq is empty we rewrite the high freq terms in a conjunction to
+       * prevent slow queries.
+       */
+      if (highFreqOccur == Occur.MUST) {
+        highFreq.setBoost(getBoost());
+        return highFreq;
+      } else {
+        BooleanQuery highFreqConjunction = new BooleanQuery();
+        for (BooleanClause booleanClause : highFreq) {
+          highFreqConjunction.add(booleanClause.getQuery(), Occur.MUST);
+        }
+        highFreqConjunction.setBoost(getBoost());
+        return highFreqConjunction;
+        
+      }
+    } else if (highFreq.clauses().isEmpty()) {
+      // only do low freq terms - we don't have high freq terms
+      lowFreq.setBoost(getBoost());
+      return lowFreq;
+    } else {
+      query.add(highFreq, Occur.SHOULD);
+      query.add(lowFreq, Occur.MUST);
+      query.setBoost(getBoost());
+      return query;
+    }
+  }
+  
+  public void collectTermContext(IndexReader reader,
+      List<AtomicReaderContext> leaves, TermContext[] contextArray,
+      Term[] queryTerms) throws IOException {
+    TermsEnum termsEnum = null;
+    for (AtomicReaderContext context : leaves) {
+      final Fields fields = context.reader().fields();
+      if (fields == null) {
+        // reader has no fields
+        continue;
+      }
+      for (int i = 0; i < queryTerms.length; i++) {
+        Term term = queryTerms[i];
+        TermContext termContext = contextArray[i];
+        final Terms terms = fields.terms(term.field());
+        if (terms == null) {
+          // field does not exist
+          continue;
+        }
+        termsEnum = terms.iterator(termsEnum);
+        assert termsEnum != null;
+        
+        if (termsEnum == TermsEnum.EMPTY) continue;
+        if (termsEnum.seekExact(term.bytes(), false)) {
+          if (termContext == null) {
+            contextArray[i] = new TermContext(reader.getContext(),
+                termsEnum.termState(), context.ord, termsEnum.docFreq(),
+                termsEnum.totalTermFreq());
+          } else {
+            termContext.register(termsEnum.termState(), context.ord,
+                termsEnum.docFreq(), termsEnum.totalTermFreq());
+          }
+          
+        }
+        
+      }
+    }
+  }
+  
+  /**
+   * Returns true iff {@link Similarity#coord(int,int)} is disabled in scoring
+   * for the high and low frequency query instance. The top level query will
+   * always disable coords.
+   */
+  public boolean isCoordDisabled() {
+    return disableCoord;
+  }
+  
+  /**
+   * Specifies a minimum number of the optional BooleanClauses which must be
+   * satisfied in order to produce a match on the low frequency terms query
+   * part.
+   * 
+   * <p>
+   * By default no optional clauses are necessary for a match (unless there are
+   * no required clauses). If this method is used, then the specified number of
+   * clauses is required.
+   * </p>
+   * 
+   * @param min
+   *          the number of optional clauses that must match
+   */
+  //CHANGE accepts now a float
+  public void setMinimumNumberShouldMatch(float min) {
+    this.minNrShouldMatch = min;
+  }
+  
+  /**
+   * Gets the minimum number of the optional BooleanClauses which must be
+   * satisfied.
+   */
+  //CHANGE returns now a float
+  public float getMinimumNumberShouldMatch() {
+    return minNrShouldMatch;
+  }
+  
+  @Override
+  public void extractTerms(Set<Term> terms) {
+    terms.addAll(this.terms);
+  }
+  
+  @Override
+  public String toString(String field) {
+    StringBuilder buffer = new StringBuilder();
+    boolean needParens = (getBoost() != 1.0)
+        || (getMinimumNumberShouldMatch() > 0);
+    if (needParens) {
+      buffer.append("(");
+    }
+    for (int i = 0; i < terms.size(); i++) {
+      Term t = terms.get(i);
+      buffer.append(new TermQuery(t).toString());
+      
+      if (i != terms.size() - 1) buffer.append(", ");
+    }
+    if (needParens) {
+      buffer.append(")");
+    }
+    if (getMinimumNumberShouldMatch() > 0) {
+      buffer.append('~');
+      buffer.append(getMinimumNumberShouldMatch());
+    }
+    if (getBoost() != 1.0f) {
+      buffer.append(ToStringUtils.boost(getBoost()));
+    }
+    return buffer.toString();
+  }
+  
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = super.hashCode();
+    result = prime * result + (disableCoord ? 1231 : 1237);
+    result = prime * result + Float.floatToIntBits(highFreqBoost);
+    result = prime * result
+        + ((highFreqOccur == null) ? 0 : highFreqOccur.hashCode());
+    result = prime * result + Float.floatToIntBits(lowFreqBoost);
+    result = prime * result
+        + ((lowFreqOccur == null) ? 0 : lowFreqOccur.hashCode());
+    result = prime * result + Float.floatToIntBits(maxTermFrequency);
+    result = prime * result + Float.floatToIntBits(minNrShouldMatch);
+    result = prime * result + ((terms == null) ? 0 : terms.hashCode());
+    return result;
+  }
+  
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) return true;
+    if (!super.equals(obj)) return false;
+    if (getClass() != obj.getClass()) return false;
+    XCommonTermsQuery other = (XCommonTermsQuery) obj;
+    if (disableCoord != other.disableCoord) return false;
+    if (Float.floatToIntBits(highFreqBoost) != Float
+        .floatToIntBits(other.highFreqBoost)) return false;
+    if (highFreqOccur != other.highFreqOccur) return false;
+    if (Float.floatToIntBits(lowFreqBoost) != Float
+        .floatToIntBits(other.lowFreqBoost)) return false;
+    if (lowFreqOccur != other.lowFreqOccur) return false;
+    if (Float.floatToIntBits(maxTermFrequency) != Float
+        .floatToIntBits(other.maxTermFrequency)) return false;
+    if (minNrShouldMatch != other.minNrShouldMatch) return false;
+    if (terms == null) {
+      if (other.terms != null) return false;
+    } else if (!terms.equals(other.terms)) return false;
+    return true;
+  }
+  
+  //CHANGE added
+  public List<Term> terms() {
+      return this.terms;
+  }
+  
+}
--- a/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java
+++ b/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java
@ -20,6 +20,7 @@
 package org.apache.lucene.search.vectorhighlight;

 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.queries.ExtendedCommonTermsQuery;
 import org.apache.lucene.queries.FilterClause;
 import org.apache.lucene.search.*;
 import org.apache.lucene.search.spans.SpanTermQuery;
@ -97,6 +98,8 @@ public class CustomFieldQuery extends FieldQuery {
            }
        } else if (sourceQuery instanceof FiltersFunctionScoreQuery) {
            flatten(((FiltersFunctionScoreQuery) sourceQuery).getSubQuery(), reader, flatQueries);
+        } else if (sourceQuery instanceof ExtendedCommonTermsQuery) {
+            flatten(((ExtendedCommonTermsQuery)sourceQuery).rewrite(reader), reader, flatQueries);
        } else {
            super.flatten(sourceQuery, reader, flatQueries);
        } 
--- a/src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java
+++ b/src/main/java/org/elasticsearch/index/query/CommonTermsQueryParser.java
@ -19,6 +19,8 @@

 package org.elasticsearch.index.query;

+import static org.elasticsearch.index.query.support.QueryParsers.wrapSmartNameQuery;
+
 import java.io.IOException;

 import org.apache.lucene.analysis.Analyzer;
@ -26,6 +28,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.CommonTermsQuery;
+import org.apache.lucene.queries.ExtendedCommonTermsQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.Query;
@ -143,20 +146,15 @@ public class CommonTermsQueryParser implements QueryParser {
        if (value == null) {
            throw new QueryParsingException(parseContext.index(), "No text specified for text query");
        }
-        CommonTermsQuery query = new CommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords);
-        int numTerms = parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer);
-        if (numTerms == 0) {
-            return null;
-        }
-        if (minimumShouldMatch != null) {
-            query.setMinimumNumberShouldMatch(Queries.calculateMinShouldMatch(numTerms, minimumShouldMatch));
-        }
+        ExtendedCommonTermsQuery query = new ExtendedCommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords);
        query.setBoost(boost);
-        return query;
+        return parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer, minimumShouldMatch);
    }
        
-    private final int parseQueryString(CommonTermsQuery query, String queryString, String fieldName, QueryParseContext parseContext,
-            String queryAnalyzer) throws IOException {
+
+    private final Query parseQueryString(ExtendedCommonTermsQuery query, String queryString, String fieldName, QueryParseContext parseContext,
+            String queryAnalyzer, String minimumShouldMatch) throws IOException {
+    
        FieldMapper<?> mapper = null;
        String field;
        MapperService.SmartNameFieldMappers smartNameFieldMappers = parseContext.smartFieldMappers(fieldName);
@ -197,7 +195,11 @@ public class CommonTermsQueryParser implements QueryParser {
            query.add(new Term(field, ref));
            count++;
        }
-        return count;
        
+        if (count == 0) {
+            return null;
+        }
+        query.setMinimumNumberShouldMatch(minimumShouldMatch);
+        return wrapSmartNameQuery(query, smartNameFieldMappers, parseContext);
    }
 }
--- a/src/main/java/org/elasticsearch/index/query/MatchQueryBuilder.java
+++ b/src/main/java/org/elasticsearch/index/query/MatchQueryBuilder.java
@ -87,6 +87,8 @@ public class MatchQueryBuilder extends BaseQueryBuilder implements BoostableQuer

    private ZeroTermsQuery zeroTermsQuery;
    
+    private Float cutoff_Frequency = null;
+
    /**
     * Constructs a new text query.
     */
@ -158,6 +160,16 @@ public class MatchQueryBuilder extends BaseQueryBuilder implements BoostableQuer
        return this;
    }
    
+    /**
+     * Set a cutoff value in [0..1] (or absolute number >=1) representing the
+     * maximum threshold of a terms document frequency to be considered a low
+     * frequency term.
+     */
+    public MatchQueryBuilder cutoffFrequency(float cutoff) {
+        this.cutoff_Frequency = cutoff;
+        return this;
+    }
+
    public MatchQueryBuilder minimumShouldMatch(String minimumShouldMatch) {
        this.minimumShouldMatch = minimumShouldMatch;
        return this;
@ -241,6 +253,10 @@ public class MatchQueryBuilder extends BaseQueryBuilder implements BoostableQuer
        if (zeroTermsQuery != null) {
            builder.field("zero_terms_query", zeroTermsQuery.toString());
        }
+        if (cutoff_Frequency != null) {
+            builder.field("cutoff_frequency", cutoff_Frequency);
+        }
+        

        builder.endObject();
        builder.endObject();
--- a/src/main/java/org/elasticsearch/index/query/MatchQueryParser.java
+++ b/src/main/java/org/elasticsearch/index/query/MatchQueryParser.java
@ -19,6 +19,8 @@

 package org.elasticsearch.index.query;

+import org.apache.lucene.queries.CommonTermsQuery;
+import org.apache.lucene.queries.ExtendedCommonTermsQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.Query;
@ -126,6 +128,8 @@ public class MatchQueryParser implements QueryParser {
                        matchQuery.setTranspositions(parser.booleanValue());
                    } else if ("lenient".equals(currentFieldName)) {
                        matchQuery.setLenient(parser.booleanValue());
+                    } else if ("cutoff_frequency".equals(currentFieldName)) {
+                        matchQuery.setCommonTermsCutoff(parser.floatValue());
                    } else if ("zero_terms_query".equals(currentFieldName)) {
                        String zeroTermsDocs = parser.text();
                        if ("none".equalsIgnoreCase(zeroTermsDocs)) {
@ -161,8 +165,9 @@ public class MatchQueryParser implements QueryParser {

        if (query instanceof BooleanQuery) {
            Queries.applyMinimumShouldMatch((BooleanQuery) query, minimumShouldMatch);
+        } else if (query instanceof ExtendedCommonTermsQuery) {
+            ((ExtendedCommonTermsQuery)query).setMinimumNumberShouldMatch(minimumShouldMatch);
        }
-
        query.setBoost(boost);
        return query;
    }
--- a/src/main/java/org/elasticsearch/index/query/MultiMatchQueryBuilder.java
+++ b/src/main/java/org/elasticsearch/index/query/MultiMatchQueryBuilder.java
@ -67,6 +67,8 @@ public class MultiMatchQueryBuilder extends BaseQueryBuilder implements Boostabl

    private Boolean lenient;

+    private Float cutoffFrequency = null;
+    
    /**
     * Constructs a new text query.
     */
@ -192,6 +194,17 @@ public class MultiMatchQueryBuilder extends BaseQueryBuilder implements Boostabl
        return this;
    }
    
+    
+    /**
+     * Set a cutoff value in [0..1] (or absolute number >=1) representing the
+     * maximum threshold of a terms document frequency to be considered a low
+     * frequency term.
+     */
+    public MultiMatchQueryBuilder cutoffFrequency(float cutoff) {
+        this.cutoffFrequency = cutoff;
+        return this;
+    }
+
    @Override
    public void doXContent(XContentBuilder builder, Params params) throws IOException {
        builder.startObject(MultiMatchQueryParser.NAME);
@ -256,6 +269,10 @@ public class MultiMatchQueryBuilder extends BaseQueryBuilder implements Boostabl
            builder.field("lenient", lenient);
        }
        
+        if (cutoffFrequency != null) {
+            builder.field("cutoff_frequency", cutoffFrequency);
+        }
+
        builder.endObject();
    }
 }
--- a/src/main/java/org/elasticsearch/index/query/MultiMatchQueryParser.java
+++ b/src/main/java/org/elasticsearch/index/query/MultiMatchQueryParser.java
@ -145,6 +145,8 @@ public class MultiMatchQueryParser implements QueryParser {
                    multiMatchQuery.setUseDisMax(parser.booleanValue());
                } else if ("tie_breaker".equals(currentFieldName) || "tieBreaker".equals(currentFieldName)) {
                    multiMatchQuery.setTieBreaker(parser.floatValue());
+                }  else if ("cutoff_frequency".equals(currentFieldName)) {
+                    multiMatchQuery.setCommonTermsCutoff(parser.floatValue());
                } else if ("lenient".equals(currentFieldName)) {
                    multiMatchQuery.setLenient(parser.booleanValue());
                } else {
--- a/src/main/java/org/elasticsearch/index/search/MatchQuery.java
+++ b/src/main/java/org/elasticsearch/index/search/MatchQuery.java
@ -25,6 +25,8 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.CommonTermsQuery;
+import org.apache.lucene.queries.ExtendedCommonTermsQuery;
 import org.apache.lucene.search.*;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnicodeUtil;
@ -70,19 +72,24 @@ public class MatchQuery {
    protected int phraseSlop = 0;

    protected String fuzziness = null;
+    
    protected int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
+    
    protected int maxExpansions = FuzzyQuery.defaultMaxExpansions;
+    
    //LUCENE 4 UPGRADE we need a default value for this!
    protected boolean transpositions = false;

-
    protected MultiTermQuery.RewriteMethod rewriteMethod;
+
    protected MultiTermQuery.RewriteMethod fuzzyRewriteMethod;

    protected boolean lenient;

    protected ZeroTermsQuery zeroTermsQuery = ZeroTermsQuery.NONE;
    
+    protected Float commonTermsCutoff = null;
+    
    public MatchQuery(QueryParseContext parseContext) {
        this.parseContext = parseContext;
    }
@ -95,6 +102,10 @@ public class MatchQuery {
        this.occur = occur;
    }
    
+    public void setCommonTermsCutoff(float cutoff) {
+        this.commonTermsCutoff = Float.valueOf(cutoff);
+    }
+
    public void setEnablePositionIncrements(boolean enablePositionIncrements) {
        this.enablePositionIncrements = enablePositionIncrements;
    }
@ -221,19 +232,27 @@ public class MatchQuery {
            if (numTokens == 1) {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
-                //LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8
-                final Query q = newTermQuery(mapper, new Term(field, termToByteRef(termAtt, new BytesRef())));
+                final Query q = newTermQuery(mapper, new Term(field, termToByteRef(termAtt)));
                return wrapSmartNameQuery(q, smartNameFieldMappers, parseContext);
            }
+            if (commonTermsCutoff != null) {
+                ExtendedCommonTermsQuery q = new ExtendedCommonTermsQuery(occur, occur, commonTermsCutoff, positionCount == 1);
+                for (int i = 0; i < numTokens; i++) {
+                    boolean hasNext = buffer.incrementToken();
+                    assert hasNext == true;
+                    q.add(new Term(field, termToByteRef(termAtt)));
+                }
+                return wrapSmartNameQuery(q, smartNameFieldMappers, parseContext);
+            } else {
                BooleanQuery q = new BooleanQuery(positionCount == 1);
                for (int i = 0; i < numTokens; i++) {
                    boolean hasNext = buffer.incrementToken();
                    assert hasNext == true;
-                //LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8
-                final Query currentQuery = newTermQuery(mapper, new Term(field, termToByteRef(termAtt, new BytesRef())));
+                    final Query currentQuery = newTermQuery(mapper, new Term(field, termToByteRef(termAtt)));
                    q.add(currentQuery, occur);
                }
                return wrapSmartNameQuery(q, smartNameFieldMappers, parseContext);
+            }
        } else if (type == Type.PHRASE) {
            if (severalTokensAtSamePosition) {
                final MultiPhraseQuery mpq = new MultiPhraseQuery();
@ -256,7 +275,7 @@ public class MatchQuery {
                    }
                    position += positionIncrement;
                    //LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8 
-                    multiTerms.add(new Term(field, termToByteRef(termAtt, new BytesRef())));
+                    multiTerms.add(new Term(field, termToByteRef(termAtt)));
                }
                if (enablePositionIncrements) {
                    mpq.add(multiTerms.toArray(new Term[multiTerms.size()]), position);
@ -277,9 +296,9 @@ public class MatchQuery {
                    if (enablePositionIncrements) {
                        position += positionIncrement;
                        //LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8
-                        pq.add(new Term(field, termToByteRef(termAtt, new BytesRef())), position);
+                        pq.add(new Term(field, termToByteRef(termAtt)), position);
                    } else {
-                        pq.add(new Term(field, termToByteRef(termAtt, new BytesRef())));
+                        pq.add(new Term(field, termToByteRef(termAtt)));
                    }
                }
                return wrapSmartNameQuery(pq, smartNameFieldMappers, parseContext);
@ -305,8 +324,7 @@ public class MatchQuery {
                    multiTerms.clear();
                }
                position += positionIncrement;
-                //LUCENE 4 UPGRADE instead of string term we can convert directly from utf-16 to utf-8
-                multiTerms.add(new Term(field, termToByteRef(termAtt, new BytesRef())));
+                multiTerms.add(new Term(field, termToByteRef(termAtt)));
            }
            if (enablePositionIncrements) {
                mpq.add(multiTerms.toArray(new Term[multiTerms.size()]), position);
@ -344,7 +362,8 @@ public class MatchQuery {
        return new TermQuery(term);
    }
    
-    private static BytesRef termToByteRef(CharTermAttribute attr, BytesRef ref) {
+    private static BytesRef termToByteRef(CharTermAttribute attr) {
+        final BytesRef ref = new BytesRef();
        UnicodeUtil.UTF16toUTF8(attr.buffer(), 0, attr.length(), ref);
        return ref;
    }
--- a/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java
+++ b/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java
@ -344,7 +344,7 @@ public class SearchServiceTransportAction extends AbstractComponent {
            try {
                FetchSearchResult result = searchService.executeFetchPhase(request);
                listener.onResult(result);
-            } catch (Exception e) {
+            } catch (Throwable e) {
                listener.onFailure(e);
            }
        } else {
@ -378,7 +378,7 @@ public class SearchServiceTransportAction extends AbstractComponent {
            try {
                QuerySearchResult result = searchService.executeScan(request);
                listener.onResult(result);
-            } catch (Exception e) {
+            } catch (Throwable e) {
                listener.onFailure(e);
            }
        } else {
--- a/src/main/java/org/elasticsearch/search/highlight/CustomQueryScorer.java
+++ b/src/main/java/org/elasticsearch/search/highlight/CustomQueryScorer.java
@ -20,9 +20,14 @@
 package org.elasticsearch.search.highlight;

 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.XCommonTermsQuery;
+import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.FilteredQuery;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.highlight.QueryScorer;
 import org.apache.lucene.search.highlight.WeightedSpanTerm;
 import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
@ -31,6 +36,7 @@ import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery
 import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;

 import java.io.IOException;
+import java.util.List;
 import java.util.Map;

 public final class CustomQueryScorer extends QueryScorer {
@ -97,6 +103,14 @@ public final class CustomQueryScorer extends QueryScorer {
            } else if (query instanceof XFilteredQuery) {
                query = ((XFilteredQuery) query).getQuery();
                extract(query, terms);
+            } else if (query instanceof XCommonTermsQuery) {
+                XCommonTermsQuery ctq = ((XCommonTermsQuery)query);
+                List<Term> ctqTerms = ctq.terms();
+                BooleanQuery bq = new BooleanQuery();
+                for (Term term : ctqTerms) {
+                    bq.add(new TermQuery(term), Occur.SHOULD);    
+                }
+                extract(bq, terms);
            }
        }

--- a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java
+++ b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java
@ -948,6 +948,59 @@ public class HighlighterSearchTests extends AbstractNodesTests {
        assertThat(response.hits().hits()[0].highlightFields().get("tags").fragments()[1].string(), equalTo("here is another one that is very long and has the <em>tag</em> token near the end"));
    }
    
+    @Test
+    public void testCommonTermsQuery() {
+        try {
+            client.admin().indices().prepareDelete("test").execute().actionGet();
+        } catch (IndexMissingException e) {
+            // its ok
+        }
+        client.admin().indices().prepareCreate("test").execute().actionGet();
+        client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().execute().actionGet();
+
+        client.prepareIndex("test", "type1")
+                .setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog")
+                .setRefresh(true).execute().actionGet();
+
+        logger.info("--> highlighting and searching on field1");
+        SearchSourceBuilder source = searchSource()
+                .query(commonTerms("field2", "quick brown").cutoffFrequency(100))
+                .from(0).size(60).explain(true)
+                .highlight(highlight().field("field2").order("score").preTags("<x>").postTags("</x>"));
+
+        SearchResponse searchResponse = client.search(searchRequest("test").source(source).searchType(QUERY_THEN_FETCH).scroll(timeValueMinutes(10))).actionGet();
+        assertThat("Failures " + Arrays.toString(searchResponse.shardFailures()), searchResponse.shardFailures().length, equalTo(0));
+        assertThat(searchResponse.hits().totalHits(), equalTo(1l));
+
+        assertThat(searchResponse.hits().getAt(0).highlightFields().get("field2").fragments()[0].string(), equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));
+    }
+    
+    @Test
+    public void testCommonTermsTermVector() throws ElasticSearchException, IOException {
+        try {
+            client.admin().indices().prepareDelete("test").execute().actionGet();
+        } catch (Exception e) {
+            // ignore
+        }
+        client.admin().indices().prepareCreate("test").addMapping("type1", type1TermVectorMapping()).execute().actionGet();
+        client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().execute().actionGet();
+
+        client.prepareIndex("test", "type1").setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog")
+                .setRefresh(true).execute().actionGet();
+
+        logger.info("--> highlighting and searching on field1");
+        SearchSourceBuilder source = searchSource().query(commonTerms("field2", "quick brown").cutoffFrequency(100)).from(0).size(60)
+                .explain(true).highlight(highlight().field("field2").order("score").preTags("<x>").postTags("</x>"));
+
+        SearchResponse searchResponse = client.search(
+                searchRequest("test").source(source).searchType(QUERY_THEN_FETCH).scroll(timeValueMinutes(10))).actionGet();
+        assertThat("Failures " + Arrays.toString(searchResponse.shardFailures()), searchResponse.shardFailures().length, equalTo(0));
+        assertThat(searchResponse.hits().totalHits(), equalTo(1l));
+
+        assertThat(searchResponse.hits().getAt(0).highlightFields().get("field2").fragments()[0].string(),
+                equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));
+    }
+
    @Test
    public void testPlainHighlightDifferentFragmenter() throws Exception {
        try {
--- a/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java
+++ b/src/test/java/org/elasticsearch/test/integration/search/query/SimpleQueryTests.java
@ -120,7 +120,7 @@ public class SimpleQueryTests extends AbstractNodesTests {

        client.prepareIndex("test", "type1", "1").setSource("field1", "the quick brown fox").execute().actionGet();
        client.prepareIndex("test", "type1", "2").setSource("field1", "the quick lazy huge brown fox jumps over the tree").execute().actionGet();
-        client.prepareIndex("test", "type1", "3").setSource("field1", "quick lazy huge brown").setRefresh(true).execute().actionGet();
+        client.prepareIndex("test", "type1", "3").setSource("field1", "quick lazy huge brown", "field2", "the quick lazy huge brown fox jumps over the tree").setRefresh(true).execute().actionGet();

        SearchResponse searchResponse = client.prepareSearch().setQuery(QueryBuilders.commonTerms("field1", "the quick brown").cutoffFrequency(3)).execute().actionGet();
        assertThat(searchResponse.hits().totalHits(), equalTo(2l));
@ -140,6 +140,32 @@ public class SimpleQueryTests extends AbstractNodesTests {
        assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
        assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("3"));
        assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("2"));
+        
+        // try the same with match query
+        searchResponse = client.prepareSearch().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND)).execute().actionGet();
+        assertThat(searchResponse.hits().totalHits(), equalTo(2l));
+        assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
+        assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
+        
+        searchResponse = client.prepareSearch().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.OR)).execute().actionGet();
+        assertThat(searchResponse.hits().totalHits(), equalTo(3l));
+        assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
+        assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("2"));
+        assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("3"));
+        
+        searchResponse = client.prepareSearch().setQuery(QueryBuilders.matchQuery("field1", "the quick brown").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND).analyzer("standard")).execute().actionGet();
+        assertThat(searchResponse.hits().totalHits(), equalTo(3l));
+        // standard drops "the" since its a stopword
+        assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("1"));
+        assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("3"));
+        assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("2"));
+        
+        // try the same with multi match query
+        searchResponse = client.prepareSearch().setQuery(QueryBuilders.multiMatchQuery("the quick brown", "field1", "field2").cutoffFrequency(3).operator(MatchQueryBuilder.Operator.AND)).execute().actionGet();
+        assertThat(searchResponse.hits().totalHits(), equalTo(3l));
+        assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo("3")); // better score due to different query stats
+        assertThat(searchResponse.getHits().getHits()[1].getId(), equalTo("1"));
+        assertThat(searchResponse.getHits().getHits()[2].getId(), equalTo("2"));
    }
    
    @Test