Initial commit of LUCENE-1486 - a subclass of the default QueryParser that overrides the parsing of PhraseQueries to allow more complex syntax e.g. wildcards in phrase queries

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@791579 13f79535-47bb-0310-9956-ffa450edef68
2009-07-06 19:32:54 +00:00 · 2009-07-06 19:32:54 +00:00 · f7fa579971
parent 28d65ceee7
commit f7fa579971
3 changed files with 535 additions and 0 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -521,6 +521,12 @@ New features
    All standard parsers now also implement Serializable and enforce
    their singleton status.  (Uwe Schindler, Mike McCandless)
    
+31. LUCENE-1486: Added ComplexPhraseQueryParser, an extension of QueryParser 
+    that allows a subset of the Lucene query language to be embedded in
+    PhraseQuerys. Wildcard, Range, and Fuzzy queries, as well as limited 
+    boolean logic, can be used within quote operators with this parser, ie: 
+    "(jo* -john) smyth~". (Mark Harwood via Mark Miller)
+    
 Optimizations

 1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
--- a/src/java/org/apache/lucene/queryParser/ComplexPhraseQueryParser.java
+++ b/src/java/org/apache/lucene/queryParser/ComplexPhraseQueryParser.java
@ -0,0 +1,386 @@
+package org.apache.lucene.queryParser;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanNotQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+
+/**
+ * QueryParser which permits complex phrase query syntax e.g. "(john jon
+ * jonathan~) peters*"
+ * 
+ * Performs potentially multiple passes over Query text to parse any nested
+ * logic in PhraseQueries. - First pass takes any PhraseQuery content between
+ * quotes and stores for subsequent pass. All other query content is parsed as
+ * normal - Second pass parses any stored PhraseQuery content, checking all
+ * embedded clauses are referring to the same field and therefore can be
+ * rewritten as Span queries. All PhraseQuery clauses are expressed as
+ * ComplexPhraseQuery objects
+ * 
+ * This could arguably be done in one pass using a new QueryParser but here I am
+ * working within the constraints of the existing parser as a base class. This
+ * currently simply feeds all phrase content through an analyzer to select
+ * phrase terms - any "special" syntax such as * ~ * etc are not given special
+ * status
+ * 
+ * 
+ */
+public class ComplexPhraseQueryParser extends QueryParser {
+  private ArrayList/*<ComplexPhraseQuery>*/complexPhrases = null;
+
+  private boolean isPass2ResolvingPhrases;
+
+  private ComplexPhraseQuery currentPhraseQuery = null;
+
+  public ComplexPhraseQueryParser(String f, Analyzer a) {
+    super(f, a);
+  }
+
+  protected Query getFieldQuery(String field, String queryText, int slop) {
+    ComplexPhraseQuery cpq = new ComplexPhraseQuery(field, queryText, slop);
+    complexPhrases.add(cpq); // add to list of phrases to be parsed once
+    // we
+    // are through with this pass
+    return cpq;
+  }
+
+  public Query parse(String query) throws ParseException {
+    if (isPass2ResolvingPhrases) {
+      boolean oldConstantScoreRewriteSetting = getConstantScoreRewrite();
+      try {
+        // Temporarily set constantScoreRewrite to false so that Parser will
+        // generate visible
+        // collection of terms which we can convert into SpanQueries.
+        // ConstantScoreRewrite mode produces an
+        // opaque ConstantScoreQuery object which cannot be interrogated for
+        // terms in the same way a BooleanQuery can.
+        // QueryParser is not guaranteed threadsafe anyway so this temporary
+        // state change should not
+        // present an issue
+        setConstantScoreRewrite(false);
+        return super.parse(query);
+      } finally {
+        setConstantScoreRewrite(oldConstantScoreRewriteSetting);
+      }
+    }
+
+    // First pass - parse the top-level query recording any PhraseQuerys
+    // which will need to be resolved
+    complexPhrases = new ArrayList/*<ComplexPhraseQuery>*/();
+    Query q = super.parse(query);
+
+    // Perform second pass, using this QueryParser to parse any nested
+    // PhraseQueries with different
+    // set of syntax restrictions (i.e. all fields must be same)
+    isPass2ResolvingPhrases = true;
+    try {
+      for (Iterator iterator = complexPhrases.iterator(); iterator.hasNext();) {
+        currentPhraseQuery = (ComplexPhraseQuery) iterator.next();
+        // in each phrase, now parse the contents between quotes as a
+        // separate parse operation
+        currentPhraseQuery.parsePhraseElements(this);
+      }
+    } finally {
+      isPass2ResolvingPhrases = false;
+    }
+    return q;
+  }
+
+  // There is No "getTermQuery throws ParseException" method to override so
+  // unfortunately need
+  // to throw a runtime exception here if a term for another field is embedded
+  // in phrase query
+  protected Query newTermQuery(Term term) {
+    if (isPass2ResolvingPhrases) {
+      try {
+        checkPhraseClauseIsForSameField(term.field());
+      } catch (ParseException pe) {
+        throw new RuntimeException("Error parsing complex phrase", pe);
+      }
+    }
+    return super.newTermQuery(term);
+  }
+
+  // Helper method used to report on any clauses that appear in query syntax
+  private void checkPhraseClauseIsForSameField(String field)
+      throws ParseException {
+    if (!field.equals(currentPhraseQuery.field)) {
+      throw new ParseException("Cannot have clause for field \"" + field
+          + "\" nested in phrase " + " for field \"" + currentPhraseQuery.field
+          + "\"");
+    }
+  }
+
+  protected Query getWildcardQuery(String field, String termStr)
+      throws ParseException {
+    if (isPass2ResolvingPhrases) {
+      checkPhraseClauseIsForSameField(field);
+    }
+    return super.getWildcardQuery(field, termStr);
+  }
+
+  protected Query getRangeQuery(String field, String part1, String part2,
+      boolean inclusive) throws ParseException {
+    if (isPass2ResolvingPhrases) {
+      checkPhraseClauseIsForSameField(field);
+    }
+    return super.getRangeQuery(field, part1, part2, inclusive);
+  }
+
+  protected Query newRangeQuery(String field, String part1, String part2,
+      boolean inclusive) {
+    if (isPass2ResolvingPhrases) {
+      // Must use old-style RangeQuery in order to produce a BooleanQuery
+      // that can be turned into SpanOr clause
+      TermRangeQuery rangeQuery = new TermRangeQuery(field, part1, part2, inclusive, inclusive,
+          getRangeCollator());
+      rangeQuery.setConstantScoreRewrite(false);;
+      return rangeQuery;
+    }
+    return super.newRangeQuery(field, part1, part2, inclusive);
+  }
+
+  protected Query getFuzzyQuery(String field, String termStr,
+      float minSimilarity) throws ParseException {
+    if (isPass2ResolvingPhrases) {
+      checkPhraseClauseIsForSameField(field);
+    }
+    return super.getFuzzyQuery(field, termStr, minSimilarity);
+  }
+
+  /*
+   * Used to handle the query content in between quotes and produced Span-based
+   * interpretations of the clauses.
+   */
+  static class ComplexPhraseQuery extends Query {
+
+    String field;
+
+    String phrasedQueryStringContents;
+
+    int slopFactor;
+
+    private Query contents;
+
+    public ComplexPhraseQuery(String field, String phrasedQueryStringContents,
+        int slopFactor) {
+      super();
+      this.field = field;
+      this.phrasedQueryStringContents = phrasedQueryStringContents;
+      this.slopFactor = slopFactor;
+    }
+
+    // Called by ComplexPhraseQueryParser for each phrase after the main
+    // parse
+    // thread is through
+    protected void parsePhraseElements(QueryParser qp) throws ParseException {
+      // TODO ensure that field-sensitivity is preserved ie the query
+      // string below is parsed as
+      // field+":("+phrasedQueryStringContents+")"
+      // but this will need code in rewrite to unwrap the first layer of
+      // boolean query
+      contents = qp.parse(phrasedQueryStringContents);
+    }
+
+    public Query rewrite(IndexReader reader) throws IOException {
+      // ArrayList spanClauses = new ArrayList();
+      if (contents instanceof TermQuery) {
+        return contents;
+      }
+      // Build a sequence of Span clauses arranged in a SpanNear - child
+      // clauses can be complex
+      // Booleans e.g. nots and ors etc
+      int numNegatives = 0;
+      if (!(contents instanceof BooleanQuery)) {
+        throw new IllegalArgumentException("Unknown query type \""
+            + contents.getClass().getName()
+            + "\" found in phrase query string \"" + phrasedQueryStringContents
+            + "\"");
+      }
+      BooleanQuery bq = (BooleanQuery) contents;
+      BooleanClause[] bclauses = bq.getClauses();
+      SpanQuery[] allSpanClauses = new SpanQuery[bclauses.length];
+      // For all clauses e.g. one* two~
+      for (int i = 0; i < bclauses.length; i++) {
+        // HashSet bclauseterms=new HashSet();
+        Query qc = bclauses[i].getQuery();
+        // Rewrite this clause e.g one* becomes (one OR onerous)
+        qc = qc.rewrite(reader);
+        if (bclauses[i].getOccur().equals(BooleanClause.Occur.MUST_NOT)) {
+          numNegatives++;
+        }
+
+        if (qc instanceof BooleanQuery) {
+          ArrayList sc = new ArrayList();
+          addComplexPhraseClause(sc, (BooleanQuery) qc);
+          if (sc.size() > 0) {
+            allSpanClauses[i] = (SpanQuery) sc.get(0);
+          } else {
+            // Insert fake term e.g. phrase query was for "Fred Smithe*" and
+            // there were no "Smithe*" terms - need to
+            // prevent match on just "Fred".
+            allSpanClauses[i] = new SpanTermQuery(new Term(field,
+                "Dummy clause because no terms found - must match nothing"));
+          }
+        } else {
+          if (qc instanceof TermQuery) {
+            TermQuery tq = (TermQuery) qc;
+            allSpanClauses[i] = new SpanTermQuery(tq.getTerm());
+          } else {
+            throw new IllegalArgumentException("Unknown query type \""
+                + qc.getClass().getName()
+                + "\" found in phrase query string \""
+                + phrasedQueryStringContents + "\"");
+          }
+
+        }
+      }
+      if (numNegatives == 0) {
+        // The simple case - no negative elements in phrase
+        return new SpanNearQuery(allSpanClauses, slopFactor, true);
+      }
+      // Complex case - we have mixed positives and negatives in the
+      // sequence.
+      // Need to return a SpanNotQuery
+      ArrayList positiveClauses = new ArrayList();
+      for (int j = 0; j < allSpanClauses.length; j++) {
+        if (!bclauses[j].getOccur().equals(BooleanClause.Occur.MUST_NOT)) {
+          positiveClauses.add(allSpanClauses[j]);
+        }
+      }
+
+      SpanQuery[] includeClauses = (SpanQuery[]) positiveClauses
+          .toArray(new SpanQuery[positiveClauses.size()]);
+
+      SpanQuery include = null;
+      if (includeClauses.length == 1) {
+        include = includeClauses[0]; // only one positive clause
+      } else {
+        // need to increase slop factor based on gaps introduced by
+        // negatives
+        include = new SpanNearQuery(includeClauses, slopFactor + numNegatives,
+            true);
+      }
+      // Use sequence of positive and negative values as the exclude.
+      SpanNearQuery exclude = new SpanNearQuery(allSpanClauses, slopFactor,
+          true);
+      SpanNotQuery snot = new SpanNotQuery(include, exclude);
+      return snot;
+    }
+
+    private void addComplexPhraseClause(List spanClauses, BooleanQuery qc) {
+      ArrayList ors = new ArrayList();
+      ArrayList nots = new ArrayList();
+      BooleanClause[] bclauses = qc.getClauses();
+
+      // For all clauses e.g. one* two~
+      for (int i = 0; i < bclauses.length; i++) {
+        Query childQuery = bclauses[i].getQuery();
+
+        // select the list to which we will add these options
+        ArrayList chosenList = ors;
+        if (bclauses[i].getOccur() == BooleanClause.Occur.MUST_NOT) {
+          chosenList = nots;
+        }
+
+        if (childQuery instanceof TermQuery) {
+          TermQuery tq = (TermQuery) childQuery;
+          SpanTermQuery stq = new SpanTermQuery(tq.getTerm());
+          stq.setBoost(tq.getBoost());
+          chosenList.add(stq);
+        } else if (childQuery instanceof BooleanQuery) {
+          BooleanQuery cbq = (BooleanQuery) childQuery;
+          addComplexPhraseClause(chosenList, cbq);
+        } else {
+          // TODO alternatively could call extract terms here?
+          throw new IllegalArgumentException("Unknown query type:"
+              + childQuery.getClass().getName());
+        }
+      }
+      if (ors.size() == 0) {
+        return;
+      }
+      SpanOrQuery soq = new SpanOrQuery((SpanQuery[]) ors
+          .toArray(new SpanQuery[ors.size()]));
+      if (nots.size() == 0) {
+        spanClauses.add(soq);
+      } else {
+        SpanOrQuery snqs = new SpanOrQuery((SpanQuery[]) nots
+            .toArray(new SpanQuery[nots.size()]));
+        SpanNotQuery snq = new SpanNotQuery(soq, snqs);
+        spanClauses.add(snq);
+      }
+    }
+
+    public String toString(String field) {
+      return "\"" + phrasedQueryStringContents + "\"";
+    }
+
+    public int hashCode() {
+      final int prime = 31;
+      int result = 1;
+      result = prime * result + ((field == null) ? 0 : field.hashCode());
+      result = prime
+          * result
+          + ((phrasedQueryStringContents == null) ? 0
+              : phrasedQueryStringContents.hashCode());
+      result = prime * result + slopFactor;
+      return result;
+    }
+
+    public boolean equals(Object obj) {
+      if (this == obj)
+        return true;
+      if (obj == null)
+        return false;
+      if (getClass() != obj.getClass())
+        return false;
+      ComplexPhraseQuery other = (ComplexPhraseQuery) obj;
+      if (field == null) {
+        if (other.field != null)
+          return false;
+      } else if (!field.equals(other.field))
+        return false;
+      if (phrasedQueryStringContents == null) {
+        if (other.phrasedQueryStringContents != null)
+          return false;
+      } else if (!phrasedQueryStringContents
+          .equals(other.phrasedQueryStringContents))
+        return false;
+      if (slopFactor != other.slopFactor)
+        return false;
+      return true;
+    }
+  }
+}
--- a/src/test/org/apache/lucene/queryParser/TestComplexPhraseQuery.java
+++ b/src/test/org/apache/lucene/queryParser/TestComplexPhraseQuery.java
@ -0,0 +1,143 @@
+package org.apache.lucene.queryParser;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashSet;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.RAMDirectory;
+
+public class TestComplexPhraseQuery extends TestCase {
+
+  Analyzer analyzer = new StandardAnalyzer();
+
+  DocData docsContent[] = { new DocData("john smith", "1"),
+      new DocData("johathon smith", "2"),
+      new DocData("john percival smith", "3"),
+      new DocData("jackson waits tom", "4") };
+
+  private IndexSearcher searcher;
+
+  String defaultFieldName = "name";
+
+  public void testComplexPhrases() throws Exception {
+    checkMatches("\"john smith\"", "1"); // Simple multi-term still works
+    checkMatches("\"j*   smyth~\"", "1,2"); // wildcards and fuzzies are OK in
+    // phrases
+    checkMatches("\"(jo* -john)  smith\"", "2"); // boolean logic works
+    checkMatches("\"jo*  smith\"~2", "1,2,3"); // position logic works.
+    checkMatches("\"jo* [sma TO smZ]\" ", "1,2"); // range queries supported
+    checkMatches("\"john\"", "1,3"); // Simple single-term still works
+    checkMatches("\"(john OR johathon)  smith\"", "1,2"); // boolean logic with
+    // brackets works.
+    checkMatches("\"(jo* -john) smyth~\"", "2"); // boolean logic with
+    // brackets works.
+
+    // checkMatches("\"john -percival\"", "1"); // not logic doesn't work
+    // currently :(.
+
+    checkMatches("\"john  nosuchword*\"", ""); // phrases with clauses producing
+    // empty sets
+
+    checkBadQuery("\"jo*  id:1 smith\""); // mixing fields in a phrase is bad
+    checkBadQuery("\"jo* \"smith\" \""); // phrases inside phrases is bad
+  }
+
+  private void checkBadQuery(String qString) {
+    QueryParser qp = new ComplexPhraseQueryParser(defaultFieldName, analyzer);
+    Throwable expected = null;
+    try {
+      qp.parse(qString);
+    } catch (Throwable e) {
+      expected = e;
+    }
+    assertNotNull("Expected parse error in " + qString, expected);
+
+  }
+
+  private void checkMatches(String qString, String expectedVals)
+      throws Exception {
+    QueryParser qp = new ComplexPhraseQueryParser(defaultFieldName, analyzer);
+    qp.setFuzzyPrefixLength(1); // usually a good idea
+
+    Query q = qp.parse(qString);
+
+    HashSet expecteds = new HashSet();
+    String[] vals = expectedVals.split(",");
+    for (int i = 0; i < vals.length; i++) {
+      if (vals[i].length() > 0)
+        expecteds.add(vals[i]);
+    }
+
+    TopDocs td = searcher.search(q, 10);
+    ScoreDoc[] sd = td.scoreDocs;
+    for (int i = 0; i < sd.length; i++) {
+      Document doc = searcher.doc(sd[i].doc);
+      String id = doc.get("id");
+      assertTrue(qString + "matched doc#" + id + " not expected", expecteds
+          .contains(id));
+      expecteds.remove(id);
+    }
+
+    assertEquals(qString + " missing some matches ", 0, expecteds.size());
+
+  }
+
+  protected void setUp() throws Exception {
+    RAMDirectory rd = new RAMDirectory();
+    IndexWriter w = new IndexWriter(rd, analyzer, MaxFieldLength.UNLIMITED);
+    for (int i = 0; i < docsContent.length; i++) {
+      Document doc = new Document();
+      doc.add(new Field("name", docsContent[i].name, Field.Store.YES,
+          Field.Index.ANALYZED));
+      doc.add(new Field("id", docsContent[i].id, Field.Store.YES,
+          Field.Index.ANALYZED));
+      w.addDocument(doc);
+    }
+    w.close();
+    searcher = new IndexSearcher(rd);
+  }
+
+  protected void tearDown() throws Exception {
+    searcher.close();
+  }
+
+  static class DocData {
+    String name;
+
+    String id;
+
+    public DocData(String name, String id) {
+      super();
+      this.name = name;
+      this.id = id;
+    }
+  }
+
+}