a query parser by Ronnie Kolehmainen that also sends PrefixQuerys etc. through the analyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@231523 13f79535-47bb-0310-9956-ffa450edef68
2005-08-11 21:28:58 +00:00 · 2005-08-11 21:28:58 +00:00 · dd5c74112f
parent ff98018b08
commit dd5c74112f
2 changed files with 432 additions and 0 deletions
--- a/contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java
+++ b/contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java
@ -0,0 +1,314 @@
+package org.apache.lucene.queryParser.analyzing;
+
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.search.Query;
+
+/**
+ * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
+ * are also passed through the given analyzer, but <code>?</code> and <code>*</code> don't get 
+ * removed from the search terms.
+ * 
+ * <p><b>Warning:</b> This class should only be used with analyzers that neither use stopwords
+ * nor add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer 
+ * will turn <code>H&auml;user</code> into <code>hau</code>, but <code>H?user</code> will 
+ * become <code>h?user</code> when using this parser and thus no match would be found (i.e.
+ * using this parser will be no improvement over QueryParser in such cases). 
+ *
+ * @author  Ronnie Kolehmainen (ronnie.kolehmainen at ub.uu.se)
+ * @version $Revision$, $Date$
+ */
+public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryParser {
+
+  /**
+   * Constructs a query parser.
+   * Both arguments are handed unchanged to the superclass constructor; the
+   * overridden query factory methods of this class later obtain the analyzer
+   * through <code>getAnalyzer()</code>.
+   *
+   * @param field    the default field for query terms.
+   * @param analyzer used to find terms in the query text.
+   */
+  public AnalyzingQueryParser(String field, Analyzer analyzer) {
+    super(field, analyzer);
+  }
+
+  /**
+   * Called when parser
+   * parses an input term token that contains one or more wildcard
+   * characters (? and *), but is not a prefix term token (one
+   * that has just a single * character at the end)
+   * <p>
+   * Depending on analyzer and settings, a wildcard term may (most probably will)
+   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
+   * <p>
+   * Overrides super class, by passing terms through analyzer. The term is split
+   * into literal segments and runs of wildcard characters; the whole term is
+   * analyzed, every literal segment is replaced by the corresponding analyzed
+   * token, and the wildcard runs are put back in their original positions
+   * (including a run at the very beginning of the term, if there is one).
+   *
+   * @param  field   Name of the field query will use.
+   * @param  termStr Term token that contains one or more wild card
+   *                 characters (? or *), but is not simple prefix term
+   *
+   * @return Resulting {@link Query} built for the term, or <code>null</code> if
+   *         the term has no literal segment and the analyzer produced no token
+   * @throws ParseException if the number of non-empty tokens produced by the
+   *         analyzer differs from the number of literal segments in the term
+   * @throws IllegalArgumentException if <code>termStr</code> contains no wildcard
+   */
+  protected Query getWildcardQuery(String field, String termStr) throws ParseException {
+    List tlist = new ArrayList(); // literal (non-wildcard) segments, in order of appearance
+    List wlist = new ArrayList(); // runs of wildcard characters, in order of appearance
+    /* somewhat a hack: find/store wildcard chars
+     * in order to put them back after analyzing */
+    boolean startsWithWildcard = termStr.startsWith("?") || termStr.startsWith("*");
+    boolean isWithinToken = !startsWithWildcard;
+    StringBuffer tmpBuffer = new StringBuffer();
+    char[] chars = termStr.toCharArray();
+    for (int i = 0; i < chars.length; i++) {
+      boolean isWildcard = (chars[i] == '?' || chars[i] == '*');
+      if (isWildcard && isWithinToken) {
+        // a literal segment just ended
+        tlist.add(tmpBuffer.toString());
+        tmpBuffer.setLength(0);
+      } else if (!isWildcard && !isWithinToken) {
+        // a run of wildcards just ended
+        wlist.add(tmpBuffer.toString());
+        tmpBuffer.setLength(0);
+      }
+      isWithinToken = !isWildcard;
+      tmpBuffer.append(chars[i]);
+    }
+    if (isWithinToken) {
+      tlist.add(tmpBuffer.toString());
+    } else {
+      wlist.add(tmpBuffer.toString());
+    }
+
+    // get Analyzer from superclass and tokenize the term
+    TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
+
+    int countTokens = 0;
+    boolean surplusTokens = false; // analyzer produced more tokens than literal segments
+    try {
+      while (true) {
+        org.apache.lucene.analysis.Token t;
+        try {
+          t = source.next();
+        } catch (IOException e) {
+          // treated like end of stream; the count check below decides the outcome
+          t = null;
+        }
+        if (t == null) {
+          break;
+        }
+        String text = t.termText();
+        if ("".equals(text)) {
+          continue; // empty tokens do not stand for a literal segment
+        }
+        if (countTokens < tlist.size()) {
+          tlist.set(countTokens++, text);
+        } else {
+          surplusTokens = true;
+        }
+      }
+    } finally {
+      try {
+        source.close();
+      } catch (IOException e) {
+        // ignore
+      }
+    }
+
+    if (surplusTokens || countTokens != tlist.size()) {
+      /* this means that the analyzer used either added or consumed
+       * (common for a stemmer) tokens, and we can't build a WildcardQuery */
+      throw new ParseException("Cannot build WildcardQuery with analyzer "
+          + getAnalyzer().getClass() + " - tokens added or lost");
+    }
+
+    if (tlist.size() == 0) {
+      return null;
+    }
+    if (wlist.size() == 0) {
+      /* we should never get here! if so, this method was called
+       * with a termStr containing no wildcard ... */
+      throw new IllegalArgumentException("getWildcardQuery called without wildcard");
+    }
+
+    /* rebuild one term with the wildcards put back in position; literal
+     * segments and wildcard runs strictly alternate, so only the kind of
+     * the first piece has to be known */
+    StringBuffer sb = new StringBuffer();
+    int nextWildcard = 0;
+    if (startsWithWildcard) {
+      sb.append((String) wlist.get(nextWildcard++));
+    }
+    for (int i = 0; i < tlist.size(); i++) {
+      sb.append((String) tlist.get(i));
+      if (nextWildcard < wlist.size()) {
+        sb.append((String) wlist.get(nextWildcard++));
+      }
+    }
+    return super.getWildcardQuery(field, sb.toString());
+  }
+
+  /**
+   * Called when parser parses an input term
+   * token that uses prefix notation; that is, contains a single '*' wildcard
+   * character as its last character. Since this is a special case
+   * of generic wildcard term, and such a query can be optimized easily,
+   * this usually results in a different query object.
+   * <p>
+   * Depending on analyzer and settings, a prefix term may (most probably will)
+   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
+   * <p>
+   * Overrides super class, by passing terms through analyzer.
+   *
+   * @param  field   Name of the field query will use.
+   * @param  termStr Term token to use for building term for the query
+   *                 (<b>without</b> trailing '*' character!)
+   *
+   * @return Resulting {@link Query} built for the term
+   * @throws ParseException if the analyzer does not produce exactly one token
+   *         for <code>termStr</code>
+   */
+  protected Query getPrefixQuery(String field, String termStr) throws ParseException {
+    // get Analyzer from superclass and tokenize the term
+    TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
+    List tlist = new ArrayList();
+
+    try {
+      while (true) {
+        org.apache.lucene.analysis.Token t;
+        try {
+          t = source.next();
+        } catch (IOException e) {
+          // treated like end of stream
+          t = null;
+        }
+        if (t == null) {
+          break;
+        }
+        tlist.add(t.termText());
+      }
+    } finally {
+      try {
+        source.close();
+      } catch (IOException e) {
+        // ignore
+      }
+    }
+
+    if (tlist.size() == 1) {
+      return super.getPrefixQuery(field, (String) tlist.get(0));
+    }
+
+    /* a PrefixQuery needs exactly one term: report whether the analyzer
+     * swallowed the only token we had or split it into several */
+    String reason = tlist.isEmpty() ? "token was consumed" : "tokens were added";
+    throw new ParseException("Cannot build PrefixQuery with analyzer "
+        + getAnalyzer().getClass() + " - " + reason);
+  }
+
+  /**
+   * Called when parser parses an input term token that has the fuzzy suffix (~) appended.
+   * <p>
+   * Depending on analyzer and settings, a fuzzy term may (most probably will)
+   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
+   * <p>
+   * Overrides super class, by passing terms through analyzer: the text of the
+   * first token the analyzer yields replaces <code>termStr</code>.
+   *
+   * @param field Name of the field query will use.
+   * @param termStr Term token to use for building term for the query
+   * @param minSimilarity handed on unchanged to the super class implementation
+   *
+   * @return Resulting {@link Query} built for the term, or <code>null</code> when
+   *         no token is available after analysis
+   * @exception ParseException if the analyzer yields more than one token
+   */
+  protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
+      throws ParseException {
+    // analyze the raw term with the parser's own analyzer
+    TokenStream stream = getAnalyzer().tokenStream(field, new StringReader(termStr));
+    org.apache.lucene.analysis.Token first = null;
+    boolean hasSecondToken = false;
+
+    try {
+      first = stream.next();
+      // a second token means the analyzer split or expanded the term
+      hasSecondToken = (null != stream.next());
+    } catch (IOException e) {
+      first = null;
+    }
+
+    try {
+      stream.close();
+    } catch (IOException e) {
+      // ignore
+    }
+
+    if (hasSecondToken) {
+      throw new ParseException("Cannot build FuzzyQuery with analyzer " + getAnalyzer().getClass()
+          + " - tokens were added");
+    }
+
+    if (first == null) {
+      return null;
+    }
+    return super.getFuzzyQuery(field, first.termText(), minSimilarity);
+  }
+
+  /**
+   * Overrides super class, by passing terms through analyzer.
+   * Each end of the range is analyzed on its own; the text of the first token
+   * replaces the original value, and a value for which the analyzer yields no
+   * token is used as given.
+   *
+   * @param field     Name of the field query will use.
+   * @param part1     first end of the range as written in the query
+   * @param part2     second end of the range as written in the query
+   * @param inclusive handed on unchanged to the super class implementation
+   *
+   * @return Resulting {@link Query} built by the super class from the analyzed ends
+   * @exception ParseException if the analyzer yields more than one token for
+   *            either end of the range
+   */
+  protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive)
+      throws ParseException {
+    // part1 is fully checked before part2 is touched
+    part1 = analyzeRangePart(field, part1, "part1");
+    part2 = analyzeRangePart(field, part2, "part2");
+    return super.getRangeQuery(field, part1, part2, inclusive);
+  }
+
+  /**
+   * Analyzes one end of a range and returns the text of its first token, or
+   * <code>part</code> itself when no token is obtained.
+   *
+   * @param field    Name of the field query will use.
+   * @param part     the range end to analyze
+   * @param partName label of the range end, used in the error message
+   *
+   * @return the analyzed text for this end of the range
+   * @exception ParseException if the analyzer yields more than one token
+   */
+  private String analyzeRangePart(String field, String part, String partName)
+      throws ParseException {
+    // get Analyzer from superclass and tokenize the term
+    TokenStream source = getAnalyzer().tokenStream(field, new StringReader(part));
+    String analyzed = part;
+    boolean multipleTokens = false;
+
+    try {
+      org.apache.lucene.analysis.Token t = source.next();
+      if (t != null) {
+        analyzed = t.termText();
+      }
+      multipleTokens = source.next() != null;
+    } catch (IOException e) {
+      // keep whatever has been obtained so far
+    } finally {
+      try {
+        source.close();
+      } catch (IOException e) {
+        // ignore
+      }
+    }
+
+    if (multipleTokens) {
+      throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
+          + " - tokens were added to " + partName);
+    }
+    return analyzed;
+  }
+
+}
--- a/contrib/miscellaneous/src/test/org/apache/lucene/queryParser/analyzing/TestAnalyzingQueryParser.java
+++ b/contrib/miscellaneous/src/test/org/apache/lucene/queryParser/analyzing/TestAnalyzingQueryParser.java
@ -0,0 +1,118 @@
+package org.apache.lucene.queryParser.analyzing;
+
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.ISOLatin1AccentFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.queryParser.ParseException;
+
+/**
+ * Parses wildcard, prefix, range and fuzzy queries with
+ * {@link AnalyzingQueryParser} and compares the string form of the resulting
+ * query with the expected, analyzed form.
+ * <p>
+ * All non-ASCII characters in the fixtures are written as Unicode escapes so
+ * that the test does not depend on the encoding used to compile this file.
+ *
+ * @author  Ronnie Kolehmainen (ronnie.kolehmainen at ub.uu.se)
+ * @version $Revision$, $Date$
+ */
+public class TestAnalyzingQueryParser extends TestCase {
+
+  private Analyzer a;
+
+  // each xxxInput[i] is expected to parse to xxxExpected[i]
+  private String[] wildcardInput;
+  private String[] wildcardExpected;
+  private String[] prefixInput;
+  private String[] prefixExpected;
+  private String[] rangeInput;
+  private String[] rangeExpected;
+  private String[] fuzzyInput;
+  private String[] fuzzyExpected;
+
+  public void setUp() {
+    wildcardInput = new String[] { "\u00fcbersetzung \u00fcber*ung",
+        "M\u00f6tley Cr\u00fce M\u00f6tl?* Cr\u00fc?", "Ren\u00e9e Zellweger Ren?? Zellw?ger" };
+    wildcardExpected = new String[] { "ubersetzung uber*ung", "motley crue motl?* cru?",
+        "renee zellweger ren?? zellw?ger" };
+
+    prefixInput = new String[] { "\u00fcbersetzung \u00fcbersetz*",
+        "M\u00f6tley Cr\u00fce M\u00f6tl* cr\u00fc*", "Ren\u00e9? Zellw*" };
+    prefixExpected = new String[] { "ubersetzung ubersetz*", "motley crue motl* cru*",
+        "rene? zellw*" };
+
+    rangeInput = new String[] { "[aa TO bb]", "{Ana\u00efs TO Zo\u00e9}" };
+    rangeExpected = new String[] { "[aa TO bb]", "{anais TO zoe}" };
+
+    fuzzyInput = new String[] { "\u00dcbersetzung \u00dcbersetzung~0.9",
+        "M\u00f6tley Cr\u00fce M\u00f6tley~0.75 Cr\u00fce~0.5",
+        "Ren\u00e9e Zellweger Ren\u00e9e~0.9 Zellweger~" };
+    fuzzyExpected = new String[] { "ubersetzung ubersetzung~0.9",
+        "motley crue motley~0.75 crue~0.5", "renee zellweger renee~0.9 zellweger~0.5" };
+
+    a = new ASCIIAnalyzer();
+  }
+
+  public void testWildCardQuery() throws ParseException {
+    assertParsesTo("wildcards", wildcardInput, wildcardExpected);
+  }
+
+  public void testPrefixQuery() throws ParseException {
+    assertParsesTo("prefixes", prefixInput, prefixExpected);
+  }
+
+  public void testRangeQuery() throws ParseException {
+    assertParsesTo("ranges", rangeInput, rangeExpected);
+  }
+
+  public void testFuzzyQuery() throws ParseException {
+    assertParsesTo("fuzzys", fuzzyInput, fuzzyExpected);
+  }
+
+  /**
+   * Asserts that every input[i], parsed with the test analyzer, prints as expected[i].
+   *
+   * @param what     name of the query kind under test, used in the failure message
+   * @param input    query strings to parse
+   * @param expected expected string form of each parsed query
+   */
+  private void assertParsesTo(String what, String[] input, String[] expected)
+      throws ParseException {
+    for (int i = 0; i < input.length; i++) {
+      assertEquals("Testing " + what + " with analyzer " + a.getClass() + ", input string: "
+          + input[i], expected[i], parseWithAnalyzingQueryParser(input[i], a));
+    }
+  }
+
+  /**
+   * Parses <code>s</code> against the default field "field" and returns the
+   * query's string form relative to that field.
+   */
+  private String parseWithAnalyzingQueryParser(String s, Analyzer a) throws ParseException {
+    AnalyzingQueryParser qp = new AnalyzingQueryParser("field", a);
+    org.apache.lucene.search.Query q = qp.parse(s);
+    return q.toString("field");
+  }
+
+}
+
+/**
+ * Analyzer used by the test above. It chains a StandardTokenizer with a
+ * StandardFilter, an ISOLatin1AccentFilter and a LowerCaseFilter, in that order.
+ */
+class ASCIIAnalyzer extends org.apache.lucene.analysis.Analyzer {
+  public ASCIIAnalyzer() {
+  }
+
+  /**
+   * Builds the filter chain for <code>reader</code>; <code>fieldName</code> is not used.
+   */
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    // innermost is created first: tokenizer -> standard -> accent -> lower case
+    return new LowerCaseFilter(
+        new ISOLatin1AccentFilter(
+            new StandardFilter(
+                new StandardTokenizer(reader))));
+  }
+}