mirror of https://github.com/apache/lucene.git

LUCENE-949: AnalyzingQueryParser can't work with leading wildcards.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1479710 13f79535-47bb-0310-9956-ffa450edef68

parent 67e922c392
commit d58de0096a
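
Below is a minimal sketch (not part of the commit) of the behavior the fix enables. It assumes a Lucene 4.3-era classpath; the field name, analyzer choice, and class name are illustrative only.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.util.Version;

    public class LeadingWildcardDemo {
      public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        AnalyzingQueryParser qp =
            new AnalyzingQueryParser(Version.LUCENE_43, "field", analyzer);
        // Leading wildcards must still be enabled explicitly,
        // just as with the classic QueryParser.
        qp.setAllowLeadingWildcard(true);
        // Before this fix, analysis mangled terms that start with a wildcard;
        // now the wildcard is kept and only the literal chunk is analyzed.
        Query q = qp.parse("*bersetzung");
        System.out.println(q.toString("field")); // *bersetzung
      }
    }

If setAllowLeadingWildcard(true) is omitted, the new getWildcardQuery check in the patch fails fast with a ParseException instead of mangling the term.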
lucene/CHANGES.txt

@@ -103,6 +103,9 @@ Bug Fixes
 * LUCENE-4972: DirectoryTaxonomyWriter created empty commits even if no changes
   were made. (Shai Erera, Michael McCandless)
 
+* LUCENE-949: AnalyzingQueryParser can't work with leading wildcards.
+  (Tim Allison, Robert Muir, Steve Rowe)
+
 Optimizations
 
 * LUCENE-4938: Don't use an unnecessarily large priority queue in IndexSearcher
lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java

@@ -19,8 +19,8 @@ package org.apache.lucene.queryparser.analyzing;
 
 import java.io.IOException;
 import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
@@ -31,36 +31,29 @@ import org.apache.lucene.util.Version;
 
 /**
  * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
- * are also passed through the given analyzer, but wild card characters (like <code>*</code>)
- * don't get removed from the search terms.
+ * are also passed through the given analyzer, but wildcard characters <code>*</code> and
+ * <code>?</code> don't get removed from the search terms.
  * 
  * <p><b>Warning:</b> This class should only be used with analyzers that do not use stopwords
  * or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer
 * will turn <code>Häuser</code> into <code>hau</code>, but <code>H?user</code> will
 * become <code>h?user</code> when using this parser and thus no match would be found (i.e.
 * using this parser will be no improvement over QueryParser in such cases).
- *
 */
 public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.QueryParser {
-  /**
-   * Constructs a query parser.
-   * @param field the default field for query terms.
-   * @param analyzer used to find terms in the query text.
-   */
+  // gobble escaped chars or find a wildcard character
+  private final Pattern wildcardPattern = Pattern.compile("(\\\\.)|([?*]+)");
   public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
     super(matchVersion, field, analyzer);
     setAnalyzeRangeTerms(true);
   }
 
   /**
-   * Called when parser
-   * parses an input term token that contains one or more wildcard
-   * characters (like <code>*</code>), but is not a prefix term token (one
-   * that has just a single * character at the end).
+   * Called when parser parses an input term that contains one or more wildcard
+   * characters (like <code>*</code>), but is not a prefix term (one that has
+   * just a single <code>*</code> character at the end).
    * <p>
-   * Example: will be called for <code>H?user</code> or for <code>H*user</code>
-   * but not for <code>*user</code>.
+   * Example: will be called for <code>H?user</code> or for <code>H*user</code>.
    * <p>
    * Depending on analyzer and settings, a wildcard term may (most probably will)
    * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
@@ -68,113 +61,52 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic
    * Overrides super class, by passing terms through analyzer.
    *
    * @param field Name of the field query will use.
-   * @param termStr Term token that contains one or more wild card
+   * @param termStr Term that contains one or more wildcard
    *   characters (? or *), but is not simple prefix term
    *
    * @return Resulting {@link Query} built for the term
    */
   @Override
   protected Query getWildcardQuery(String field, String termStr) throws ParseException {
-    List<String> tlist = new ArrayList<String>();
-    List<String> wlist = new ArrayList<String>();
-    /* somewhat a hack: find/store wildcard chars
-     * in order to put them back after analyzing */
-    boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
-    StringBuilder tmpBuffer = new StringBuilder();
-    char[] chars = termStr.toCharArray();
-    for (int i = 0; i < termStr.length(); i++) {
-      if (chars[i] == '?' || chars[i] == '*') {
-        if (isWithinToken) {
-          tlist.add(tmpBuffer.toString());
-          tmpBuffer.setLength(0);
-        }
-        isWithinToken = false;
-      } else {
-        if (!isWithinToken) {
-          wlist.add(tmpBuffer.toString());
-          tmpBuffer.setLength(0);
-        }
-        isWithinToken = true;
-      }
-      tmpBuffer.append(chars[i]);
-    }
-    if (isWithinToken) {
-      tlist.add(tmpBuffer.toString());
-    } else {
-      wlist.add(tmpBuffer.toString());
-    }
-
-    // get Analyzer from superclass and tokenize the term
-    TokenStream source;
-    int countTokens = 0;
-    try {
-      source = getAnalyzer().tokenStream(field, new StringReader(termStr));
-      source.reset();
-    } catch (IOException e1) {
-      throw new RuntimeException(e1);
-    }
-    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
-    while (true) {
-      try {
-        if (!source.incrementToken()) break;
-      } catch (IOException e) {
-        break;
-      }
-      String term = termAtt.toString();
-      if (!"".equals(term)) {
-        try {
-          tlist.set(countTokens++, term);
-        } catch (IndexOutOfBoundsException ioobe) {
-          countTokens = -1;
-        }
-      }
-    }
-    try {
-      source.end();
-      source.close();
-    } catch (IOException e) {
-      // ignore
-    }
-
-    if (countTokens != tlist.size()) {
-      /* this means that the analyzer used either added or consumed
-       * (common for a stemmer) tokens, and we can't build a WildcardQuery */
-      throw new ParseException("Cannot build WildcardQuery with analyzer "
-          + getAnalyzer().getClass() + " - tokens added or lost");
-    }
-
-    if (tlist.size() == 0) {
-      return null;
-    } else if (tlist.size() == 1) {
-      if (wlist != null && wlist.size() == 1) {
-        /* if wlist contains one wildcard, it must be at the end, because:
-         * 1) wildcards are not allowed in 1st position of a term by QueryParser
-         * 2) if wildcard was *not* in end, there would be *two* or more tokens */
-        return super.getWildcardQuery(field, tlist.get(0)
-            + wlist.get(0).toString());
-      } else {
-        /* we should never get here! if so, this method was called
-         * with a termStr containing no wildcard ... */
-        throw new IllegalArgumentException("getWildcardQuery called without wildcard");
-      }
-    } else {
-      /* the term was tokenized, let's rebuild to one token
-       * with wildcards put back in postion */
-      StringBuilder sb = new StringBuilder();
-      for (int i = 0; i < tlist.size(); i++) {
-        sb.append( tlist.get(i));
-        if (wlist != null && wlist.size() > i) {
-          sb.append(wlist.get(i));
-        }
-      }
-      return super.getWildcardQuery(field, sb.toString());
-    }
+    if (termStr == null){
+      //can't imagine this would ever happen
+      throw new ParseException("Passed null value as term to getWildcardQuery");
+    }
+    if ( ! getAllowLeadingWildcard() && (termStr.startsWith("*") || termStr.startsWith("?"))) {
+      throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"
+          + " unless getAllowLeadingWildcard() returns true");
+    }
+
+    Matcher wildcardMatcher = wildcardPattern.matcher(termStr);
+    StringBuilder sb = new StringBuilder();
+    int last = 0;
+
+    while (wildcardMatcher.find()){
+      // continue if escaped char
+      if (wildcardMatcher.group(1) != null){
+        continue;
+      }
+
+      if (wildcardMatcher.start() > 0){
+        String chunk = termStr.substring(last, wildcardMatcher.start());
+        String analyzed = analyzeSingleChunk(field, termStr, chunk);
+        sb.append(analyzed);
+      }
+      //append the wildcard character
+      sb.append(wildcardMatcher.group(2));
+
+      last = wildcardMatcher.end();
+    }
+    if (last < termStr.length()){
+      sb.append(analyzeSingleChunk(field, termStr, termStr.substring(last)));
+    }
+    return super.getWildcardQuery(field, sb.toString());
   }
 
   /**
    * Called when parser parses an input term
-   * token that uses prefix notation; that is, contains a single '*' wildcard
+   * that uses prefix notation; that is, contains a single '*' wildcard
    * character as its last character. Since this is a special case
    * of generic wildcard term, and such a query can be optimized easily,
    * this usually results in a different query object.
@@ -185,52 +117,19 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic
    * Overrides super class, by passing terms through analyzer.
    *
    * @param field Name of the field query will use.
-   * @param termStr Term token to use for building term for the query
+   * @param termStr Term to use for building term for the query
    *   (<b>without</b> trailing '*' character!)
    *
    * @return Resulting {@link Query} built for the term
    */
   @Override
   protected Query getPrefixQuery(String field, String termStr) throws ParseException {
-    // get Analyzer from superclass and tokenize the term
-    TokenStream source;
-    List<String> tlist = new ArrayList<String>();
-    try {
-      source = getAnalyzer().tokenStream(field, new StringReader(termStr));
-      source.reset();
-    } catch (IOException e1) {
-      throw new RuntimeException(e1);
-    }
-    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
-    while (true) {
-      try {
-        if (!source.incrementToken()) break;
-      } catch (IOException e) {
-        break;
-      }
-      tlist.add(termAtt.toString());
-    }
-
-    try {
-      source.end();
-      source.close();
-    } catch (IOException e) {
-      // ignore
-    }
-
-    if (tlist.size() == 1) {
-      return super.getPrefixQuery(field, tlist.get(0));
-    } else {
-      /* this means that the analyzer used either added or consumed
-       * (common for a stemmer) tokens, and we can't build a PrefixQuery */
-      throw new ParseException("Cannot build PrefixQuery with analyzer "
-          + getAnalyzer().getClass()
-          + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
-    }
+    String analyzed = analyzeSingleChunk(field, termStr, termStr);
+    return super.getPrefixQuery(field, analyzed);
   }
 
   /**
-   * Called when parser parses an input term token that has the fuzzy suffix (~) appended.
+   * Called when parser parses an input term that has the fuzzy suffix (~) appended.
    * <p>
    * Depending on analyzer and settings, a fuzzy term may (most probably will)
    * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
@@ -238,42 +137,73 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic
    * Overrides super class, by passing terms through analyzer.
    *
    * @param field Name of the field query will use.
-   * @param termStr Term token to use for building term for the query
+   * @param termStr Term to use for building term for the query
    *
    * @return Resulting {@link Query} built for the term
    */
   @Override
   protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
       throws ParseException {
-    // get Analyzer from superclass and tokenize the term
-    TokenStream source = null;
-    String nextToken = null;
-    boolean multipleTokens = false;
-
-    try {
-      source = getAnalyzer().tokenStream(field, new StringReader(termStr));
-      CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
-      source.reset();
-      if (source.incrementToken()) {
-        nextToken = termAtt.toString();
-      }
-      multipleTokens = source.incrementToken();
-    } catch (IOException e) {
-      nextToken = null;
-    }
-
-    try {
-      source.end();
-      source.close();
-    } catch (IOException e) {
-      // ignore
-    }
-
-    if (multipleTokens) {
-      throw new ParseException("Cannot build FuzzyQuery with analyzer " + getAnalyzer().getClass()
-          + " - tokens were added");
-    }
-
-    return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken, minSimilarity);
+    String analyzed = analyzeSingleChunk(field, termStr, termStr);
+    return super.getFuzzyQuery(field, analyzed, minSimilarity);
+  }
+
+  /**
+   * Returns the analyzed form for the given chunk
+   *
+   * If the analyzer produces more than one output token from the given chunk,
+   * a ParseException is thrown.
+   *
+   * @param field The target field
+   * @param termStr The full term from which the given chunk is excerpted
+   * @param chunk The portion of the given termStr to be analyzed
+   * @return The result of analyzing the given chunk
+   * @throws ParseException when analysis returns other than one output token
+   */
+  protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException{
+    String analyzed = null;
+    TokenStream stream = null;
+    try{
+      stream = getAnalyzer().tokenStream(field, new StringReader(chunk));
+      stream.reset();
+      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
+      // get first and hopefully only output token
+      if (stream.incrementToken()) {
+        analyzed = termAtt.toString();
+
+        // try to increment again, there should only be one output token
+        StringBuilder multipleOutputs = null;
+        while (stream.incrementToken()) {
+          if (null == multipleOutputs) {
+            multipleOutputs = new StringBuilder();
+            multipleOutputs.append('"');
+            multipleOutputs.append(analyzed);
+            multipleOutputs.append('"');
+          }
+          multipleOutputs.append(',');
+          multipleOutputs.append('"');
+          multipleOutputs.append(termAtt.toString());
+          multipleOutputs.append('"');
+        }
+        stream.end();
+        stream.close();
+        if (null != multipleOutputs) {
+          throw new ParseException(
+              String.format(getLocale(),
+                  "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
+        }
+      } else {
+        // nothing returned by analyzer. Was it a stop word and the user accidentally
+        // used an analyzer with stop words?
+        stream.end();
+        stream.close();
+        throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
+      }
+    } catch (IOException e){
+      throw new ParseException(
+          String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
+    }
+    return analyzed;
   }
 }
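
The core of the patch is the chunking loop in getWildcardQuery above: wildcardPattern locates runs of unescaped wildcard characters, each literal chunk between them is passed through the analyzer via analyzeSingleChunk, and the wildcard runs are re-appended verbatim. A standalone sketch of that walk follows; it reuses the pattern from the patch, but the lower-casing merely stands in for a real Analyzer and the class name is hypothetical.

    import java.util.Locale;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class ChunkSplitDemo {
      // Same pattern as the patch: group 1 swallows escaped characters (e.g. \*),
      // group 2 captures runs of the wildcard characters '?' and '*'.
      private static final Pattern WILDCARD = Pattern.compile("(\\\\.)|([?*]+)");

      public static void main(String[] args) {
        String termStr = "H?user";
        Matcher m = WILDCARD.matcher(termStr);
        StringBuilder sb = new StringBuilder();
        int last = 0;
        while (m.find()) {
          if (m.group(1) != null) {
            continue; // escaped character: leave it inside the literal chunk
          }
          if (m.start() > 0) {
            // "analyze" the literal chunk; lower-casing stands in for a real Analyzer
            sb.append(termStr.substring(last, m.start()).toLowerCase(Locale.ROOT));
          }
          sb.append(m.group(2)); // re-append the wildcard run verbatim
          last = m.end();
        }
        if (last < termStr.length()) {
          sb.append(termStr.substring(last).toLowerCase(Locale.ROOT));
        }
        System.out.println(sb); // h?user
      }
    }

Because only the chunks between wildcards reach the analyzer, a leading wildcard survives untouched, which is exactly what the old token/wildcard list bookkeeping could not do.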
lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java

@@ -19,8 +19,17 @@ package org.apache.lucene.queryparser.analyzing;
 
 import java.io.IOException;
 import java.io.Reader;
+import java.util.Map;
+import java.util.TreeMap;
 
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockBytesAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -33,11 +42,14 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
 
 /**
  */
+@SuppressCodecs("Lucene3x") // binary terms
 public class TestAnalyzingQueryParser extends LuceneTestCase {
+  private final static String FIELD = "field";
 
   private Analyzer a;
 
   private String[] wildcardInput;
@@ -49,12 +61,15 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
   private String[] fuzzyInput;
   private String[] fuzzyExpected;
+
+  private Map<String, String> wildcardEscapeHits = new TreeMap<String, String>();
+  private Map<String, String> wildcardEscapeMisses = new TreeMap<String, String>();
 
   @Override
   public void setUp() throws Exception {
     super.setUp();
-    wildcardInput = new String[] { "übersetzung über*ung",
+    wildcardInput = new String[] { "*bersetzung über*ung",
         "Mötley Cr\u00fce Mötl?* Crü?", "Renée Zellweger Ren?? Zellw?ger" };
-    wildcardExpected = new String[] { "ubersetzung uber*ung", "motley crue motl?* cru?",
+    wildcardExpected = new String[] { "*bersetzung uber*ung", "motley crue motl?* cru?",
         "renee zellweger ren?? zellw?ger" };
 
     prefixInput = new String[] { "übersetzung übersetz*",
@@ -71,43 +86,138 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
     fuzzyExpected = new String[] { "ubersetzung ubersetzung~1",
         "motley crue motley~1 crue~2", "renee zellweger renee~0 zellweger~2" };
+
+    wildcardEscapeHits.put("mö*tley", "moatley");
+
+    // need to have at least one genuine wildcard to trigger the wildcard analysis
+    // hence the * before the y
+    wildcardEscapeHits.put("mö\\*tl*y", "mo*tley");
+
+    // escaped backslash then true wildcard
+    wildcardEscapeHits.put("mö\\\\*tley", "mo\\atley");
+
+    // escaped wildcard then true wildcard
+    wildcardEscapeHits.put("mö\\??ley", "mo?tley");
+
+    // the first is an escaped * which should yield a miss
+    wildcardEscapeMisses.put("mö\\*tl*y", "moatley");
+
     a = new ASCIIAnalyzer();
   }
+
+  public void testSingleChunkExceptions() {
+    boolean ex = false;
+    String termStr = "the*tre";
+
+    Analyzer stopsAnalyzer = new MockAnalyzer
+        (random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
+    try {
+      String q = parseWithAnalyzingQueryParser(termStr, stopsAnalyzer, true);
+    } catch (ParseException e){
+      if (e.getMessage().contains("returned nothing")){
+        ex = true;
+      }
+    }
+    assertEquals("Should have returned nothing", true, ex);
+    ex = false;
+
+    AnalyzingQueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, FIELD, a);
+    try{
+      qp.analyzeSingleChunk(FIELD, "", "not a single chunk");
+    } catch (ParseException e){
+      if (e.getMessage().contains("multiple terms")){
+        ex = true;
+      }
+    }
+    assertEquals("Should have produced multiple terms", true, ex);
+  }
+
+  public void testWildcardAlone() throws ParseException {
+    //seems like crazy edge case, but can be useful in concordance
+    boolean pex = false;
+    try{
+      Query q = getAnalyzedQuery("*", a, false);
+    } catch (ParseException e){
+      pex = true;
+    }
+    assertEquals("Wildcard alone with allowWildcard=false", true, pex);
+
+    pex = false;
+    try {
+      String qString = parseWithAnalyzingQueryParser("*", a, true);
+      assertEquals("Every word", "*", qString);
+    } catch (ParseException e){
+      pex = true;
+    }
+    assertEquals("Wildcard alone with allowWildcard=true", false, pex);
+  }
+
+  public void testWildCardEscapes() throws ParseException, IOException {
+
+    for (Map.Entry<String, String> entry : wildcardEscapeHits.entrySet()){
+      Query q = getAnalyzedQuery(entry.getKey(), a, false);
+      assertEquals("WildcardEscapeHits: " + entry.getKey(), true, isAHit(q, entry.getValue(), a));
+    }
+    for (Map.Entry<String, String> entry : wildcardEscapeMisses.entrySet()){
+      Query q = getAnalyzedQuery(entry.getKey(), a, false);
+      assertEquals("WildcardEscapeMisses: " + entry.getKey(), false, isAHit(q, entry.getValue(), a));
+    }
+  }
+
+  public void testWildCardQueryNoLeadingAllowed() {
+    boolean ex = false;
+    try{
+      String q = parseWithAnalyzingQueryParser(wildcardInput[0], a, false);
+    } catch (ParseException e){
+      ex = true;
+    }
+    assertEquals("Testing initial wildcard not allowed",
+        true, ex);
+  }
 
   public void testWildCardQuery() throws ParseException {
     for (int i = 0; i < wildcardInput.length; i++) {
       assertEquals("Testing wildcards with analyzer " + a.getClass() + ", input string: "
-          + wildcardInput[i], wildcardExpected[i], parseWithAnalyzingQueryParser(wildcardInput[i], a));
+          + wildcardInput[i], wildcardExpected[i], parseWithAnalyzingQueryParser(wildcardInput[i], a, true));
     }
   }
 
 
   public void testPrefixQuery() throws ParseException {
     for (int i = 0; i < prefixInput.length; i++) {
       assertEquals("Testing prefixes with analyzer " + a.getClass() + ", input string: "
-          + prefixInput[i], prefixExpected[i], parseWithAnalyzingQueryParser(prefixInput[i], a));
+          + prefixInput[i], prefixExpected[i], parseWithAnalyzingQueryParser(prefixInput[i], a, false));
     }
   }
 
   public void testRangeQuery() throws ParseException {
     for (int i = 0; i < rangeInput.length; i++) {
       assertEquals("Testing ranges with analyzer " + a.getClass() + ", input string: "
-          + rangeInput[i], rangeExpected[i], parseWithAnalyzingQueryParser(rangeInput[i], a));
+          + rangeInput[i], rangeExpected[i], parseWithAnalyzingQueryParser(rangeInput[i], a, false));
     }
   }
 
   public void testFuzzyQuery() throws ParseException {
     for (int i = 0; i < fuzzyInput.length; i++) {
       assertEquals("Testing fuzzys with analyzer " + a.getClass() + ", input string: "
-          + fuzzyInput[i], fuzzyExpected[i], parseWithAnalyzingQueryParser(fuzzyInput[i], a));
+          + fuzzyInput[i], fuzzyExpected[i], parseWithAnalyzingQueryParser(fuzzyInput[i], a, false));
     }
   }
 
-  private String parseWithAnalyzingQueryParser(String s, Analyzer a) throws ParseException {
-    AnalyzingQueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, "field", a);
-    org.apache.lucene.search.Query q = qp.parse(s);
-    return q.toString("field");
+  private String parseWithAnalyzingQueryParser(String s, Analyzer a, boolean allowLeadingWildcard) throws ParseException {
+    Query q = getAnalyzedQuery(s, a, allowLeadingWildcard);
+    return q.toString(FIELD);
   }
 
+  private Query getAnalyzedQuery(String s, Analyzer a, boolean allowLeadingWildcard) throws ParseException {
+    AnalyzingQueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, FIELD, a);
+    qp.setAllowLeadingWildcard(allowLeadingWildcard);
+    org.apache.lucene.search.Query q = qp.parse(s);
+    return q;
+  }
+
   final static class FoldingFilter extends TokenFilter {
     final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
@@ -144,31 +254,45 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
   final static class ASCIIAnalyzer extends Analyzer {
     @Override
     public TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer result = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+      Tokenizer result = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
       return new TokenStreamComponents(result, new FoldingFilter(result));
     }
   }
 
+
   // LUCENE-4176
   public void testByteTerms() throws Exception {
-    Directory ramDir = newDirectory();
+    String s = "เข";
     Analyzer analyzer = new MockBytesAnalyzer();
+    QueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, FIELD, analyzer);
+    Query q = qp.parse("[เข TO เข]");
+    assertEquals(true, isAHit(q, s, analyzer));
+  }
+
+
+  private boolean isAHit(Query q, String content, Analyzer analyzer) throws IOException{
+    Directory ramDir = newDirectory();
     RandomIndexWriter writer = new RandomIndexWriter(random(), ramDir, analyzer);
     Document doc = new Document();
     FieldType fieldType = new FieldType();
     fieldType.setIndexed(true);
     fieldType.setTokenized(true);
     fieldType.setStored(true);
-    Field field = new Field("content","เข", fieldType);
+    Field field = new Field(FIELD, content, fieldType);
     doc.add(field);
     writer.addDocument(doc);
     writer.close();
     DirectoryReader ir = DirectoryReader.open(ramDir);
-    IndexSearcher is = newSearcher(ir);
-    QueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, "content", analyzer);
-    Query q = qp.parse("[เข TO เข]");
-    assertEquals(1, is.search(q, 10).totalHits);
+    IndexSearcher is = new IndexSearcher(ir);
+    int hits = is.search(q, 10).totalHits;
     ir.close();
     ramDir.close();
+    if (hits == 1){
+      return true;
+    } else {
+      return false;
+    }
   }
 }
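
The escape cases in the test above hinge on which backslash sequences group 1 of wildcardPattern consumes before group 2 can claim a wildcard. A small illustration, run against one of the wildcardEscapeHits keys (the class name is hypothetical):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class EscapeDemo {
      public static void main(String[] args) {
        Pattern wildcardPattern = Pattern.compile("(\\\\.)|([?*]+)");
        // "mö\*tl*y": the first '*' is escaped, so only the second one is a wildcard.
        Matcher m = wildcardPattern.matcher("mö\\*tl*y");
        while (m.find()) {
          if (m.group(1) != null) {
            System.out.println("escaped, kept literal: " + m.group(1)); // \*
          } else {
            System.out.println("true wildcard: " + m.group(2)); // *
          }
        }
      }
    }

This is why "mö\*tl*y" hits "mo*tley" but misses "moatley": the escaped '*' stays a literal star inside the analyzed chunk, while the unescaped one remains a wildcard.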