From f7fa57997191026f3bef51cb3c13edd031b5d781 Mon Sep 17 00:00:00 2001 From: Mark Harwood Date: Mon, 6 Jul 2009 19:32:54 +0000 Subject: [PATCH] Initial commit of LUCENE-1486 - a subclass of the default QueryParser that overrides the parsing of PhraseQueries to allow more complex syntax e.g. wildcards in phrase queries git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@791579 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 6 + .../queryParser/ComplexPhraseQueryParser.java | 386 ++++++++++++++++++ .../queryParser/TestComplexPhraseQuery.java | 143 +++++++ 3 files changed, 535 insertions(+) create mode 100644 src/java/org/apache/lucene/queryParser/ComplexPhraseQueryParser.java create mode 100644 src/test/org/apache/lucene/queryParser/TestComplexPhraseQuery.java diff --git a/CHANGES.txt b/CHANGES.txt index 214fc669f4d..b3d7a109f6b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -521,6 +521,12 @@ New features All standard parsers now also implement Serializable and enforce their singleton status. (Uwe Schindler, Mike McCandless) +31. LUCENE-1486: Added ComplexPhraseQueryParser, an extension of QueryParser + that allows a subset of the Lucene query language to be embedded in + PhraseQuerys. Wildcard, Range, and Fuzzy queries, as well as limited + boolean logic, can be used within quote operators with this parser, ie: + "(jo* -john) smyth~". (Mark Harwood via Mark Miller) + Optimizations 1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing diff --git a/src/java/org/apache/lucene/queryParser/ComplexPhraseQueryParser.java b/src/java/org/apache/lucene/queryParser/ComplexPhraseQueryParser.java new file mode 100644 index 00000000000..441c95f346f --- /dev/null +++ b/src/java/org/apache/lucene/queryParser/ComplexPhraseQueryParser.java @@ -0,0 +1,386 @@ +package org.apache.lucene.queryParser; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TermRangeQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanNotQuery; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; + +/** + * QueryParser which permits complex phrase query syntax e.g. "(john jon + * jonathan~) peters*" + * + * Performs potentially multiple passes over Query text to parse any nested + * logic in PhraseQueries. - First pass takes any PhraseQuery content between + * quotes and stores for subsequent pass. All other query content is parsed as + * normal - Second pass parses any stored PhraseQuery content, checking all + * embedded clauses are referring to the same field and therefore can be + * rewritten as Span queries. All PhraseQuery clauses are expressed as + * ComplexPhraseQuery objects + * + * This could arguably be done in one pass using a new QueryParser but here I am + * working within the constraints of the existing parser as a base class. This + * currently simply feeds all phrase content through an analyzer to select + * phrase terms - any "special" syntax such as * ~ * etc are not given special + * status + * + * + */ +public class ComplexPhraseQueryParser extends QueryParser { + private ArrayList/**/complexPhrases = null; + + private boolean isPass2ResolvingPhrases; + + private ComplexPhraseQuery currentPhraseQuery = null; + + public ComplexPhraseQueryParser(String f, Analyzer a) { + super(f, a); + } + + protected Query getFieldQuery(String field, String queryText, int slop) { + ComplexPhraseQuery cpq = new ComplexPhraseQuery(field, queryText, slop); + complexPhrases.add(cpq); // add to list of phrases to be parsed once + // we + // are through with this pass + return cpq; + } + + public Query parse(String query) throws ParseException { + if (isPass2ResolvingPhrases) { + boolean oldConstantScoreRewriteSetting = getConstantScoreRewrite(); + try { + // Temporarily set constantScoreRewrite to false so that Parser will + // generate visible + // collection of terms which we can convert into SpanQueries. + // ConstantScoreRewrite mode produces an + // opaque ConstantScoreQuery object which cannot be interrogated for + // terms in the same way a BooleanQuery can. + // QueryParser is not guaranteed threadsafe anyway so this temporary + // state change should not + // present an issue + setConstantScoreRewrite(false); + return super.parse(query); + } finally { + setConstantScoreRewrite(oldConstantScoreRewriteSetting); + } + } + + // First pass - parse the top-level query recording any PhraseQuerys + // which will need to be resolved + complexPhrases = new ArrayList/**/(); + Query q = super.parse(query); + + // Perform second pass, using this QueryParser to parse any nested + // PhraseQueries with different + // set of syntax restrictions (i.e. all fields must be same) + isPass2ResolvingPhrases = true; + try { + for (Iterator iterator = complexPhrases.iterator(); iterator.hasNext();) { + currentPhraseQuery = (ComplexPhraseQuery) iterator.next(); + // in each phrase, now parse the contents between quotes as a + // separate parse operation + currentPhraseQuery.parsePhraseElements(this); + } + } finally { + isPass2ResolvingPhrases = false; + } + return q; + } + + // There is No "getTermQuery throws ParseException" method to override so + // unfortunately need + // to throw a runtime exception here if a term for another field is embedded + // in phrase query + protected Query newTermQuery(Term term) { + if (isPass2ResolvingPhrases) { + try { + checkPhraseClauseIsForSameField(term.field()); + } catch (ParseException pe) { + throw new RuntimeException("Error parsing complex phrase", pe); + } + } + return super.newTermQuery(term); + } + + // Helper method used to report on any clauses that appear in query syntax + private void checkPhraseClauseIsForSameField(String field) + throws ParseException { + if (!field.equals(currentPhraseQuery.field)) { + throw new ParseException("Cannot have clause for field \"" + field + + "\" nested in phrase " + " for field \"" + currentPhraseQuery.field + + "\""); + } + } + + protected Query getWildcardQuery(String field, String termStr) + throws ParseException { + if (isPass2ResolvingPhrases) { + checkPhraseClauseIsForSameField(field); + } + return super.getWildcardQuery(field, termStr); + } + + protected Query getRangeQuery(String field, String part1, String part2, + boolean inclusive) throws ParseException { + if (isPass2ResolvingPhrases) { + checkPhraseClauseIsForSameField(field); + } + return super.getRangeQuery(field, part1, part2, inclusive); + } + + protected Query newRangeQuery(String field, String part1, String part2, + boolean inclusive) { + if (isPass2ResolvingPhrases) { + // Must use old-style RangeQuery in order to produce a BooleanQuery + // that can be turned into SpanOr clause + TermRangeQuery rangeQuery = new TermRangeQuery(field, part1, part2, inclusive, inclusive, + getRangeCollator()); + rangeQuery.setConstantScoreRewrite(false);; + return rangeQuery; + } + return super.newRangeQuery(field, part1, part2, inclusive); + } + + protected Query getFuzzyQuery(String field, String termStr, + float minSimilarity) throws ParseException { + if (isPass2ResolvingPhrases) { + checkPhraseClauseIsForSameField(field); + } + return super.getFuzzyQuery(field, termStr, minSimilarity); + } + + /* + * Used to handle the query content in between quotes and produced Span-based + * interpretations of the clauses. + */ + static class ComplexPhraseQuery extends Query { + + String field; + + String phrasedQueryStringContents; + + int slopFactor; + + private Query contents; + + public ComplexPhraseQuery(String field, String phrasedQueryStringContents, + int slopFactor) { + super(); + this.field = field; + this.phrasedQueryStringContents = phrasedQueryStringContents; + this.slopFactor = slopFactor; + } + + // Called by ComplexPhraseQueryParser for each phrase after the main + // parse + // thread is through + protected void parsePhraseElements(QueryParser qp) throws ParseException { + // TODO ensure that field-sensitivity is preserved ie the query + // string below is parsed as + // field+":("+phrasedQueryStringContents+")" + // but this will need code in rewrite to unwrap the first layer of + // boolean query + contents = qp.parse(phrasedQueryStringContents); + } + + public Query rewrite(IndexReader reader) throws IOException { + // ArrayList spanClauses = new ArrayList(); + if (contents instanceof TermQuery) { + return contents; + } + // Build a sequence of Span clauses arranged in a SpanNear - child + // clauses can be complex + // Booleans e.g. nots and ors etc + int numNegatives = 0; + if (!(contents instanceof BooleanQuery)) { + throw new IllegalArgumentException("Unknown query type \"" + + contents.getClass().getName() + + "\" found in phrase query string \"" + phrasedQueryStringContents + + "\""); + } + BooleanQuery bq = (BooleanQuery) contents; + BooleanClause[] bclauses = bq.getClauses(); + SpanQuery[] allSpanClauses = new SpanQuery[bclauses.length]; + // For all clauses e.g. one* two~ + for (int i = 0; i < bclauses.length; i++) { + // HashSet bclauseterms=new HashSet(); + Query qc = bclauses[i].getQuery(); + // Rewrite this clause e.g one* becomes (one OR onerous) + qc = qc.rewrite(reader); + if (bclauses[i].getOccur().equals(BooleanClause.Occur.MUST_NOT)) { + numNegatives++; + } + + if (qc instanceof BooleanQuery) { + ArrayList sc = new ArrayList(); + addComplexPhraseClause(sc, (BooleanQuery) qc); + if (sc.size() > 0) { + allSpanClauses[i] = (SpanQuery) sc.get(0); + } else { + // Insert fake term e.g. phrase query was for "Fred Smithe*" and + // there were no "Smithe*" terms - need to + // prevent match on just "Fred". + allSpanClauses[i] = new SpanTermQuery(new Term(field, + "Dummy clause because no terms found - must match nothing")); + } + } else { + if (qc instanceof TermQuery) { + TermQuery tq = (TermQuery) qc; + allSpanClauses[i] = new SpanTermQuery(tq.getTerm()); + } else { + throw new IllegalArgumentException("Unknown query type \"" + + qc.getClass().getName() + + "\" found in phrase query string \"" + + phrasedQueryStringContents + "\""); + } + + } + } + if (numNegatives == 0) { + // The simple case - no negative elements in phrase + return new SpanNearQuery(allSpanClauses, slopFactor, true); + } + // Complex case - we have mixed positives and negatives in the + // sequence. + // Need to return a SpanNotQuery + ArrayList positiveClauses = new ArrayList(); + for (int j = 0; j < allSpanClauses.length; j++) { + if (!bclauses[j].getOccur().equals(BooleanClause.Occur.MUST_NOT)) { + positiveClauses.add(allSpanClauses[j]); + } + } + + SpanQuery[] includeClauses = (SpanQuery[]) positiveClauses + .toArray(new SpanQuery[positiveClauses.size()]); + + SpanQuery include = null; + if (includeClauses.length == 1) { + include = includeClauses[0]; // only one positive clause + } else { + // need to increase slop factor based on gaps introduced by + // negatives + include = new SpanNearQuery(includeClauses, slopFactor + numNegatives, + true); + } + // Use sequence of positive and negative values as the exclude. + SpanNearQuery exclude = new SpanNearQuery(allSpanClauses, slopFactor, + true); + SpanNotQuery snot = new SpanNotQuery(include, exclude); + return snot; + } + + private void addComplexPhraseClause(List spanClauses, BooleanQuery qc) { + ArrayList ors = new ArrayList(); + ArrayList nots = new ArrayList(); + BooleanClause[] bclauses = qc.getClauses(); + + // For all clauses e.g. one* two~ + for (int i = 0; i < bclauses.length; i++) { + Query childQuery = bclauses[i].getQuery(); + + // select the list to which we will add these options + ArrayList chosenList = ors; + if (bclauses[i].getOccur() == BooleanClause.Occur.MUST_NOT) { + chosenList = nots; + } + + if (childQuery instanceof TermQuery) { + TermQuery tq = (TermQuery) childQuery; + SpanTermQuery stq = new SpanTermQuery(tq.getTerm()); + stq.setBoost(tq.getBoost()); + chosenList.add(stq); + } else if (childQuery instanceof BooleanQuery) { + BooleanQuery cbq = (BooleanQuery) childQuery; + addComplexPhraseClause(chosenList, cbq); + } else { + // TODO alternatively could call extract terms here? + throw new IllegalArgumentException("Unknown query type:" + + childQuery.getClass().getName()); + } + } + if (ors.size() == 0) { + return; + } + SpanOrQuery soq = new SpanOrQuery((SpanQuery[]) ors + .toArray(new SpanQuery[ors.size()])); + if (nots.size() == 0) { + spanClauses.add(soq); + } else { + SpanOrQuery snqs = new SpanOrQuery((SpanQuery[]) nots + .toArray(new SpanQuery[nots.size()])); + SpanNotQuery snq = new SpanNotQuery(soq, snqs); + spanClauses.add(snq); + } + } + + public String toString(String field) { + return "\"" + phrasedQueryStringContents + "\""; + } + + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((field == null) ? 0 : field.hashCode()); + result = prime + * result + + ((phrasedQueryStringContents == null) ? 0 + : phrasedQueryStringContents.hashCode()); + result = prime * result + slopFactor; + return result; + } + + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + ComplexPhraseQuery other = (ComplexPhraseQuery) obj; + if (field == null) { + if (other.field != null) + return false; + } else if (!field.equals(other.field)) + return false; + if (phrasedQueryStringContents == null) { + if (other.phrasedQueryStringContents != null) + return false; + } else if (!phrasedQueryStringContents + .equals(other.phrasedQueryStringContents)) + return false; + if (slopFactor != other.slopFactor) + return false; + return true; + } + } +} diff --git a/src/test/org/apache/lucene/queryParser/TestComplexPhraseQuery.java b/src/test/org/apache/lucene/queryParser/TestComplexPhraseQuery.java new file mode 100644 index 00000000000..b112edd75a4 --- /dev/null +++ b/src/test/org/apache/lucene/queryParser/TestComplexPhraseQuery.java @@ -0,0 +1,143 @@ +package org.apache.lucene.queryParser; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.HashSet; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.RAMDirectory; + +public class TestComplexPhraseQuery extends TestCase { + + Analyzer analyzer = new StandardAnalyzer(); + + DocData docsContent[] = { new DocData("john smith", "1"), + new DocData("johathon smith", "2"), + new DocData("john percival smith", "3"), + new DocData("jackson waits tom", "4") }; + + private IndexSearcher searcher; + + String defaultFieldName = "name"; + + public void testComplexPhrases() throws Exception { + checkMatches("\"john smith\"", "1"); // Simple multi-term still works + checkMatches("\"j* smyth~\"", "1,2"); // wildcards and fuzzies are OK in + // phrases + checkMatches("\"(jo* -john) smith\"", "2"); // boolean logic works + checkMatches("\"jo* smith\"~2", "1,2,3"); // position logic works. + checkMatches("\"jo* [sma TO smZ]\" ", "1,2"); // range queries supported + checkMatches("\"john\"", "1,3"); // Simple single-term still works + checkMatches("\"(john OR johathon) smith\"", "1,2"); // boolean logic with + // brackets works. + checkMatches("\"(jo* -john) smyth~\"", "2"); // boolean logic with + // brackets works. + + // checkMatches("\"john -percival\"", "1"); // not logic doesn't work + // currently :(. + + checkMatches("\"john nosuchword*\"", ""); // phrases with clauses producing + // empty sets + + checkBadQuery("\"jo* id:1 smith\""); // mixing fields in a phrase is bad + checkBadQuery("\"jo* \"smith\" \""); // phrases inside phrases is bad + } + + private void checkBadQuery(String qString) { + QueryParser qp = new ComplexPhraseQueryParser(defaultFieldName, analyzer); + Throwable expected = null; + try { + qp.parse(qString); + } catch (Throwable e) { + expected = e; + } + assertNotNull("Expected parse error in " + qString, expected); + + } + + private void checkMatches(String qString, String expectedVals) + throws Exception { + QueryParser qp = new ComplexPhraseQueryParser(defaultFieldName, analyzer); + qp.setFuzzyPrefixLength(1); // usually a good idea + + Query q = qp.parse(qString); + + HashSet expecteds = new HashSet(); + String[] vals = expectedVals.split(","); + for (int i = 0; i < vals.length; i++) { + if (vals[i].length() > 0) + expecteds.add(vals[i]); + } + + TopDocs td = searcher.search(q, 10); + ScoreDoc[] sd = td.scoreDocs; + for (int i = 0; i < sd.length; i++) { + Document doc = searcher.doc(sd[i].doc); + String id = doc.get("id"); + assertTrue(qString + "matched doc#" + id + " not expected", expecteds + .contains(id)); + expecteds.remove(id); + } + + assertEquals(qString + " missing some matches ", 0, expecteds.size()); + + } + + protected void setUp() throws Exception { + RAMDirectory rd = new RAMDirectory(); + IndexWriter w = new IndexWriter(rd, analyzer, MaxFieldLength.UNLIMITED); + for (int i = 0; i < docsContent.length; i++) { + Document doc = new Document(); + doc.add(new Field("name", docsContent[i].name, Field.Store.YES, + Field.Index.ANALYZED)); + doc.add(new Field("id", docsContent[i].id, Field.Store.YES, + Field.Index.ANALYZED)); + w.addDocument(doc); + } + w.close(); + searcher = new IndexSearcher(rd); + } + + protected void tearDown() throws Exception { + searcher.close(); + } + + static class DocData { + String name; + + String id; + + public DocData(String name, String id) { + super(); + this.name = name; + this.id = id; + } + } + +}