From dd5c74112f59ec697f2d08486f00e7c6489fd1f9 Mon Sep 17 00:00:00 2001 From: Daniel Naber Date: Thu, 11 Aug 2005 21:28:58 +0000 Subject: [PATCH] a query parser by Ronnie Kolehmainen that also sends PrefixQuerys etc. through the analyzer git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@231523 13f79535-47bb-0310-9956-ffa450edef68 --- .../analyzing/AnalyzingQueryParser.java | 314 ++++++++++++++++++ .../analyzing/TestAnalyzingQueryParser.java | 118 +++++++ 2 files changed, 432 insertions(+) create mode 100644 contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java create mode 100644 contrib/miscellaneous/src/test/org/apache/lucene/queryParser/analyzing/TestAnalyzingQueryParser.java diff --git a/contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java b/contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java new file mode 100644 index 00000000000..2f4fc302104 --- /dev/null +++ b/contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java @@ -0,0 +1,314 @@ +package org.apache.lucene.queryParser.analyzing; + +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.search.Query; + +/** + * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys + * are also passed through the given analyzer, but ? and * don't get + * removed from the search terms. + * + *

Warning: This class should only be used with analyzers that neither use stopwords + * nor add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer + * will turn Häuser into hau, but H?user will + * become h?user when using this parser and thus no match would be found (i.e. + * using this parser will be no improvement over QueryParser in such cases). + * + * @author Ronnie Kolehmainen (ronnie.kolehmainen at ub.uu.se) + * @version $Revision$, $Date$ + */ +public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryParser { + + /** + * Constructs a query parser. + * @param field the default field for query terms. + * @param analyzer used to find terms in the query text. + */ + public AnalyzingQueryParser(String field, Analyzer analyzer) { + super(field, analyzer); + } + + /** + * Called when parser + * parses an input term token that contains one or more wildcard + * characters (? and *), but is not a prefix term token (one + * that has just a single * character at the end) + *

+ * Depending on analyzer and settings, a wildcard term may (most probably will) + * be lower-cased automatically. It will go through the default Analyzer. + *

+ * Overrides super class, by passing terms through analyzer. + * + * @param field Name of the field query will use. + * @param termStr Term token that contains one or more wild card + * characters (? or *), but is not simple prefix term + * + * @return Resulting {@link Query} built for the term + * @throws ParseException + */ + protected Query getWildcardQuery(String field, String termStr) throws ParseException { + List tlist = new ArrayList(); + List wlist = new ArrayList(); + /* somewhat a hack: find/store wildcard chars + * in order to put them back after analyzing */ + boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*")); + StringBuffer tmpBuffer = new StringBuffer(); + char[] chars = termStr.toCharArray(); + for (int i = 0; i < termStr.length(); i++) { + if (chars[i] == '?' || chars[i] == '*') { + if (isWithinToken) { + tlist.add(tmpBuffer.toString()); + tmpBuffer.setLength(0); + } + isWithinToken = false; + } else { + if (!isWithinToken) { + wlist.add(tmpBuffer.toString()); + tmpBuffer.setLength(0); + } + isWithinToken = true; + } + tmpBuffer.append(chars[i]); + } + if (isWithinToken) { + tlist.add(tmpBuffer.toString()); + } else { + wlist.add(tmpBuffer.toString()); + } + + // get Analyzer from superclass and tokenize the term + TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr)); + org.apache.lucene.analysis.Token t; + + int countTokens = 0; + while (true) { + try { + t = source.next(); + } catch (IOException e) { + t = null; + } + if (t == null) { + break; + } + if (!"".equals(t.termText())) { + try { + tlist.set(countTokens++, t.termText()); + } catch (IndexOutOfBoundsException ioobe) { + countTokens = -1; + } + } + } + try { + source.close(); + } catch (IOException e) { + // ignore + } + + if (countTokens != tlist.size()) { + /* this means that the analyzer used either added or consumed + * (common for a stemmer) tokens, and we can't build a WildcardQuery */ + throw new ParseException("Cannot build 
WildcardQuery with analyzer " + + getAnalyzer().getClass() + " - tokens added or lost"); + } + + if (tlist.size() == 0) { + return null; + } else if (tlist.size() == 1) { + if (wlist != null && wlist.size() == 1) { + /* if wlist contains one wildcard, it must be at the end, because: + * 1) wildcards are not allowed in 1st position of a term by QueryParser + * 2) if wildcard was *not* at the end, there would be *two* or more tokens */ + return super.getWildcardQuery(field, (String) tlist.get(0) + + (((String) wlist.get(0)).toString())); + } else { + /* we should never get here! if so, this method was called + * with a termStr containing no wildcard ... */ + throw new IllegalArgumentException("getWildcardQuery called without wildcard"); + } + } else { + /* the term was tokenized, let's rebuild to one token + * with wildcards put back in position */ + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < tlist.size(); i++) { + sb.append((String) tlist.get(i)); + if (wlist != null && wlist.size() > i) { + sb.append((String) wlist.get(i)); + } + } + return super.getWildcardQuery(field, sb.toString()); + } + } + + /** + * Called when parser parses an input term + * token that uses prefix notation; that is, contains a single '*' wildcard + * character as its last character. Since this is a special case + * of a generic wildcard term, and such a query can be optimized easily, + * this usually results in a different query object. + *

+ * Depending on analyzer and settings, a prefix term may (most probably will) + * be lower-cased automatically. It will go through the default Analyzer. + *

+ * Overrides super class, by passing terms through analyzer. + * + * @param field Name of the field query will use. + * @param termStr Term token to use for building term for the query + * (without trailing '*' character!) + * + * @return Resulting {@link Query} built for the term + * @throws ParseException + */ + protected Query getPrefixQuery(String field, String termStr) throws ParseException { + // get Analyzer from superclass and tokenize the term + TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr)); + List tlist = new ArrayList(); + org.apache.lucene.analysis.Token t; + + while (true) { + try { + t = source.next(); + } catch (IOException e) { + t = null; + } + if (t == null) { + break; + } + tlist.add(t.termText()); + } + + try { + source.close(); + } catch (IOException e) { + // ignore + } + + if (tlist.size() == 1) { + return super.getPrefixQuery(field, (String) tlist.get(0)); + } else { + /* this means that the analyzer used consumed the only token we had, + * and we can't build a PrefixQuery */ + throw new ParseException("Cannot build PrefixQuery with analyzer " + + getAnalyzer().getClass() + " - token was consumed"); + } + } + + /** + * Called when parser parses an input term token that has the fuzzy suffix (~) appended. + *

+ * Depending on analyzer and settings, a fuzzy term may (most probably will) + * be lower-cased automatically. It will go through the default Analyzer. + *

+ * Overrides super class, by passing terms through analyzer. + * + * @param field Name of the field query will use. + * @param termStr Term token to use for building term for the query + * + * @return Resulting {@link Query} built for the term + * @exception ParseException + */ + protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) + throws ParseException { + // get Analyzer from superclass and tokenize the term + TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr)); + org.apache.lucene.analysis.Token t; + boolean multipleTokens = false; + + try { + t = source.next(); + multipleTokens = source.next() != null; + } catch (IOException e) { + t = null; + } + + try { + source.close(); + } catch (IOException e) { + // ignore + } + + if (multipleTokens) { + throw new ParseException("Cannot build FuzzyQuery with analyzer " + getAnalyzer().getClass() + + " - tokens were added"); + } + + return (t == null) ? null : super.getFuzzyQuery(field, t.termText(), minSimilarity); + } + + /** + * Overrides super class, by passing terms through analyzer. 
+ * @exception ParseException + */ + protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive) + throws ParseException { + // get Analyzer from superclass and tokenize the terms + TokenStream source = getAnalyzer().tokenStream(field, new StringReader(part1)); + org.apache.lucene.analysis.Token t; + boolean multipleTokens = false; + + // part1 + try { + t = source.next(); + if (t != null) { + part1 = t.termText(); + } + multipleTokens = source.next() != null; + } catch (IOException e) { + t = null; + } + try { + source.close(); + } catch (IOException e) { + // ignore + } + if (multipleTokens) { + throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass() + + " - tokens were added to part1"); + } + + source = getAnalyzer().tokenStream(field, new StringReader(part2)); + // part2 + try { + t = source.next(); + if (t != null) { + part2 = t.termText(); + } + multipleTokens = source.next() != null; + } catch (IOException e) { + t = null; + } + try { + source.close(); + } catch (IOException e) { + // ignore + } + if (multipleTokens) { + throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass() + + " - tokens were added to part2"); + } + return super.getRangeQuery(field, part1, part2, inclusive); + } + +} diff --git a/contrib/miscellaneous/src/test/org/apache/lucene/queryParser/analyzing/TestAnalyzingQueryParser.java b/contrib/miscellaneous/src/test/org/apache/lucene/queryParser/analyzing/TestAnalyzingQueryParser.java new file mode 100644 index 00000000000..26a30683928 --- /dev/null +++ b/contrib/miscellaneous/src/test/org/apache/lucene/queryParser/analyzing/TestAnalyzingQueryParser.java @@ -0,0 +1,118 @@ +package org.apache.lucene.queryParser.analyzing; + +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.ISOLatin1AccentFilter; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.queryParser.ParseException; + +/** + * @author Ronnie Kolehmainen (ronnie.kolehmainen at ub.uu.se) + * @version $Revision$, $Date$ + */ +public class TestAnalyzingQueryParser extends TestCase { + + private Analyzer a; + + private String[] wildcardInput; + private String[] wildcardExpected; + private String[] prefixInput; + private String[] prefixExpected; + private String[] rangeInput; + private String[] rangeExpected; + private String[] fuzzyInput; + private String[] fuzzyExpected; + + public void setUp() { + wildcardInput = new String[] { "übersetzung über*ung", + "Mötley Cr\u00fce Mötl?* Crü?", "Renée Zellweger Ren?? Zellw?ger" }; + wildcardExpected = new String[] { "ubersetzung uber*ung", "motley crue motl?* cru?", + "renee zellweger ren?? zellw?ger" }; + + prefixInput = new String[] { "übersetzung übersetz*", + "Mötley Crüe Mötl* crü*", "René? Zellw*" }; + prefixExpected = new String[] { "ubersetzung ubersetz*", "motley crue motl* cru*", + "rene? 
zellw*" }; + + rangeInput = new String[] { "[aa TO bb]", "{Anaïs TO Zoé}" }; + rangeExpected = new String[] { "[aa TO bb]", "{anais TO zoe}" }; + + fuzzyInput = new String[] { "Übersetzung Übersetzung~0.9", + "Mötley Crüe Mötley~0.75 Crüe~0.5", + "Renée Zellweger Renée~0.9 Zellweger~" }; + fuzzyExpected = new String[] { "ubersetzung ubersetzung~0.9", + "motley crue motley~0.75 crue~0.5", "renee zellweger renee~0.9 zellweger~0.5" }; + + a = new ASCIIAnalyzer(); + } + + public void testWildCardQuery() throws ParseException { + for (int i = 0; i < wildcardInput.length; i++) { + assertEquals("Testing wildcards with analyzer " + a.getClass() + ", input string: " + + wildcardInput[i], wildcardExpected[i], parseWithAnalyzingQueryParser(wildcardInput[i], a)); + } + } + + public void testPrefixQuery() throws ParseException { + for (int i = 0; i < prefixInput.length; i++) { + assertEquals("Testing prefixes with analyzer " + a.getClass() + ", input string: " + + prefixInput[i], prefixExpected[i], parseWithAnalyzingQueryParser(prefixInput[i], a)); + } + } + + public void testRangeQuery() throws ParseException { + for (int i = 0; i < rangeInput.length; i++) { + assertEquals("Testing ranges with analyzer " + a.getClass() + ", input string: " + + rangeInput[i], rangeExpected[i], parseWithAnalyzingQueryParser(rangeInput[i], a)); + } + } + + public void testFuzzyQuery() throws ParseException { + for (int i = 0; i < fuzzyInput.length; i++) { + assertEquals("Testing fuzzys with analyzer " + a.getClass() + ", input string: " + + fuzzyInput[i], fuzzyExpected[i], parseWithAnalyzingQueryParser(fuzzyInput[i], a)); + } + } + + private String parseWithAnalyzingQueryParser(String s, Analyzer a) throws ParseException { + AnalyzingQueryParser qp = new AnalyzingQueryParser("field", a); + org.apache.lucene.search.Query q = qp.parse(s); + return q.toString("field"); + } + +} + +class ASCIIAnalyzer extends org.apache.lucene.analysis.Analyzer { + public ASCIIAnalyzer() { + } + + public TokenStream 
tokenStream(String fieldName, Reader reader) { + TokenStream result = new StandardTokenizer(reader); + result = new StandardFilter(result); + result = new ISOLatin1AccentFilter(result); + result = new LowerCaseFilter(result); + return result; + } +}