From dfed16f2fddce83fc544584c0fca1f5ba6d354a7 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 28 Sep 2010 15:11:12 +0000 Subject: [PATCH] LUCENE-2667: Improve defaults for FuzzyQuery so it has good performance git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1002214 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 8 ++ .../search/highlight/HighlighterTest.java | 2 +- .../core/messages/QueryParserMessages.java | 1 + .../queryParser/precedence/CharStream.java | 2 +- .../precedence/ParseException.java | 2 +- .../precedence/PrecedenceQueryParser.java | 6 +- .../precedence/PrecedenceQueryParser.jj | 6 +- .../lucene/queryParser/precedence/Token.java | 2 +- .../queryParser/precedence/TokenMgrError.java | 2 +- .../standard/parser/JavaCharStream.java | 2 +- .../standard/parser/ParseException.java | 2 +- .../standard/parser/StandardSyntaxParser.java | 4 +- .../standard/parser/StandardSyntaxParser.jj | 4 +- .../queryParser/standard/parser/Token.java | 2 +- .../standard/parser/TokenMgrError.java | 2 +- .../messages/QueryParserMessages.properties | 3 + .../analyzing/TestAnalyzingQueryParser.java | 2 +- .../precedence/TestPrecedenceQueryParser.java | 8 +- .../apache/lucene/queryParser/CharStream.java | 2 +- .../lucene/queryParser/ParseException.java | 2 +- .../lucene/queryParser/QueryParser.java | 6 +- .../apache/lucene/queryParser/QueryParser.jj | 6 +- .../org/apache/lucene/queryParser/Token.java | 2 +- .../lucene/queryParser/TokenMgrError.java | 2 +- .../org/apache/lucene/search/FuzzyQuery.java | 30 ++++---- .../apache/lucene/search/FuzzyTermsEnum.java | 41 +++++----- .../TestMultiFieldQueryParser.java | 4 +- .../lucene/queryParser/TestQueryParser.java | 22 +++--- .../apache/lucene/search/TestFuzzyQuery.java | 76 +++++++++++++++---- 29 files changed, 167 insertions(+), 86 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1c78069e70c..ca09073da2f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -113,6 +113,14 @@ 
Changes in backwards compatibility policy If you index empty fields and uses positions/offsets information on that fields, reindex is recommended. (David Smiley, Koji Sekiguchi) +* LUCENE-2667: FuzzyQuery's defaults have changed for more performant + behavior: the minimum similarity is 2 edit distances from the word, + and the priority queue size is 50. To support this, FuzzyQuery now allows + specifying unscaled edit distances (foobar~2). If your application depends + upon the old defaults of 0.5 (scaled) minimum similarity and Integer.MAX_VALUE + priority queue size, you can use FuzzyQuery(Term, float, int, int) to specify + those explicitly. + Changes in Runtime Behavior * LUCENE-2650: The behavior of FSDirectory.open has changed. On 64-bit diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java index 61beab80994..51b2d274ce0 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java @@ -614,7 +614,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { numHighlights = 0; - doSearching("Kinnedy~"); + doSearching("Kinnedy~0.5"); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this, true); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); diff --git a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/core/messages/QueryParserMessages.java b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/core/messages/QueryParserMessages.java index 38a42125885..50428b0b36f 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/core/messages/QueryParserMessages.java +++ 
b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/core/messages/QueryParserMessages.java @@ -40,6 +40,7 @@ public class QueryParserMessages extends NLS { public static String INVALID_SYNTAX; public static String INVALID_SYNTAX_CANNOT_PARSE; public static String INVALID_SYNTAX_FUZZY_LIMITS; + public static String INVALID_SYNTAX_FUZZY_EDITS; public static String INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION; public static String INVALID_SYNTAX_ESCAPE_CHARACTER; public static String INVALID_SYNTAX_ESCAPE_NONE_HEX_UNICODE; diff --git a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/CharStream.java b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/CharStream.java index ca370ca85bd..9a84eb5ba75 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/CharStream.java +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/CharStream.java @@ -109,4 +109,4 @@ public interface CharStream { void Done(); } -/* JavaCC - OriginalChecksum=7bcd45d10a032f1c9da64691d073cf75 (do not edit this line) */ +/* JavaCC - OriginalChecksum=8cc617b193267dc876ef9699367c8186 (do not edit this line) */ diff --git a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/ParseException.java b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/ParseException.java index 3f197ceb943..6e9ec487912 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/ParseException.java +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/ParseException.java @@ -195,4 +195,4 @@ public class ParseException extends Exception { } } -/* JavaCC - OriginalChecksum=4440e368eeef562faffeca98a200334b (do not edit this line) */ +/* JavaCC - OriginalChecksum=15fbbe38a36c8ac9e2740d030624c321 (do not edit this line) */ diff --git 
a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.java b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.java index 70440427d07..ea53d12575b 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.java +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.java @@ -164,7 +164,7 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants { /** * Set the minimum similarity for fuzzy queries. - * Default is 0.5f. + * Default is 2f. */ public void setFuzzyMinSim(float fuzzyMinSim) { this.fuzzyMinSim = fuzzyMinSim; @@ -927,8 +927,10 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants { try { fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue(); } catch (Exception ignored) { } - if(fms < 0.0f || fms > 1.0f){ + if(fms < 0.0f){ {if (true) throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !");} + } else if (fms >= 1.0f && fms != (int) fms) { + {if (true) throw new ParseException("Fractional edit distances are not allowed!");} } q = getFuzzyQuery(field, termImage, fms); } else { diff --git a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.jj b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.jj index 407f1acb48e..11523cc86e8 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.jj +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.jj @@ -188,7 +188,7 @@ public class PrecedenceQueryParser { /** * Set the minimum similarity for fuzzy queries. - * Default is 0.5f. + * Default is 2f. 
*/ public void setFuzzyMinSim(float fuzzyMinSim) { this.fuzzyMinSim = fuzzyMinSim; @@ -905,8 +905,10 @@ Query Term(String field) : { try { fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue(); } catch (Exception ignored) { } - if(fms < 0.0f || fms > 1.0f){ + if(fms < 0.0f){ throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !"); + } else if (fms >= 1.0f && fms != (int) fms) { + throw new ParseException("Fractional edit distances are not allowed!"); } q = getFuzzyQuery(field, termImage, fms); } else { diff --git a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/Token.java b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/Token.java index 7383a35eefb..8402b3d5017 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/Token.java +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/Token.java @@ -121,4 +121,4 @@ public class Token { } } -/* JavaCC - OriginalChecksum=bc9495ddfa3189061fb4f1bf3c4f64e2 (do not edit this line) */ +/* JavaCC - OriginalChecksum=0dc5808f2ab8aac8775ea9175fa2cb51 (do not edit this line) */ diff --git a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/TokenMgrError.java b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/TokenMgrError.java index e29d561af23..01e87510c8f 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/TokenMgrError.java +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/precedence/TokenMgrError.java @@ -138,4 +138,4 @@ public class TokenMgrError extends Error this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason); } } -/* JavaCC - OriginalChecksum=e01667f2eb6d0b2f1fbb6958df0ca751 (do not edit this line) */ +/* JavaCC - OriginalChecksum=257b82f2650841e86289a309cb3dae76 (do not edit this line) */ diff --git 
a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/JavaCharStream.java b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/JavaCharStream.java index 9ce9050bfec..6c0bab932b2 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/JavaCharStream.java +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/JavaCharStream.java @@ -613,4 +613,4 @@ public class JavaCharStream } } -/* JavaCC - OriginalChecksum=31519f95b41182c6740c2afd8dfbf344 (do not edit this line) */ +/* JavaCC - OriginalChecksum=f19c73b8f7faf94cc4a581e7b2933cc6 (do not edit this line) */ diff --git a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/ParseException.java b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/ParseException.java index 5336b828635..eee1116dccf 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/ParseException.java +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/ParseException.java @@ -193,4 +193,4 @@ public class ParseException extends QueryNodeParseException { } } -/* JavaCC - OriginalChecksum=d0caeac083e9874065f9d1e298b5ccd9 (do not edit this line) */ +/* JavaCC - OriginalChecksum=38bce846fe6c8482993969f741c0323e (do not edit this line) */ diff --git a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/StandardSyntaxParser.java b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/StandardSyntaxParser.java index 78b913966e4..b1657601823 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/StandardSyntaxParser.java +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/StandardSyntaxParser.java @@ -433,8 +433,10 @@ public class StandardSyntaxParser implements 
SyntaxParser, StandardSyntaxParserC try { fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue(); } catch (Exception ignored) { } - if(fms < 0.0f || fms > 1.0f){ + if(fms < 0.0f){ {if (true) throw new ParseException(new MessageImpl(QueryParserMessages.INVALID_SYNTAX_FUZZY_LIMITS));} + } else if (fms >= 1.0f && fms != (int) fms) { + {if (true) throw new ParseException(new MessageImpl(QueryParserMessages.INVALID_SYNTAX_FUZZY_EDITS));} } q = new FuzzyQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), fms, term.beginColumn, term.endColumn); } else if (regexp) { diff --git a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/StandardSyntaxParser.jj b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/StandardSyntaxParser.jj index de0b364f167..7aed9f1e6b8 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/StandardSyntaxParser.jj +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/StandardSyntaxParser.jj @@ -396,8 +396,10 @@ QueryNode Term(CharSequence field) : { try { fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue(); } catch (Exception ignored) { } - if(fms < 0.0f || fms > 1.0f){ + if(fms < 0.0f){ throw new ParseException(new MessageImpl(QueryParserMessages.INVALID_SYNTAX_FUZZY_LIMITS)); + } else if (fms >= 1.0f && fms != (int) fms) { + throw new ParseException(new MessageImpl(QueryParserMessages.INVALID_SYNTAX_FUZZY_EDITS)); } q = new FuzzyQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), fms, term.beginColumn, term.endColumn); } else if (regexp) { diff --git a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/Token.java b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/Token.java index da97d86decc..cb0e250a0eb 100644 --- 
a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/Token.java +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/Token.java @@ -121,4 +121,4 @@ public class Token { } } -/* JavaCC - OriginalChecksum=cecb6022e0f2e2fca751015375f6d319 (do not edit this line) */ +/* JavaCC - OriginalChecksum=0aac6816ecd328eda2f38b9d09739ab6 (do not edit this line) */ diff --git a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/TokenMgrError.java b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/TokenMgrError.java index 06d602ecad5..bfe8feea01c 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/TokenMgrError.java +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/TokenMgrError.java @@ -138,4 +138,4 @@ public class TokenMgrError extends Error this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason); } } -/* JavaCC - OriginalChecksum=0e9c5fad06efef4f41f97b851ac7b0ce (do not edit this line) */ +/* JavaCC - OriginalChecksum=a75b5b61664a73631a032a6e44f4b38a (do not edit this line) */ diff --git a/lucene/contrib/queryparser/src/resources/org/apache/lucene/queryParser/core/messages/QueryParserMessages.properties b/lucene/contrib/queryparser/src/resources/org/apache/lucene/queryParser/core/messages/QueryParserMessages.properties index 6d7370bcc66..f732dde9d64 100644 --- a/lucene/contrib/queryparser/src/resources/org/apache/lucene/queryParser/core/messages/QueryParserMessages.properties +++ b/lucene/contrib/queryparser/src/resources/org/apache/lucene/queryParser/core/messages/QueryParserMessages.properties @@ -12,6 +12,9 @@ INVALID_SYNTAX_CANNOT_PARSE = Syntax Error, cannot parse {0}: {1} #Apache Lucene Community INVALID_SYNTAX_FUZZY_LIMITS = The similarity value for a fuzzy search must be between 0.0 and 1.0. 
+#Apache Lucene Community +INVALID_SYNTAX_FUZZY_EDITS = Fractional edit distances are not allowed. + #Apache Lucene Community INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION = Truncated unicode escape sequence. diff --git a/lucene/contrib/queryparser/src/test/org/apache/lucene/queryParser/analyzing/TestAnalyzingQueryParser.java b/lucene/contrib/queryparser/src/test/org/apache/lucene/queryParser/analyzing/TestAnalyzingQueryParser.java index 6e0143ae5b8..a8817fe715b 100644 --- a/lucene/contrib/queryparser/src/test/org/apache/lucene/queryParser/analyzing/TestAnalyzingQueryParser.java +++ b/lucene/contrib/queryparser/src/test/org/apache/lucene/queryParser/analyzing/TestAnalyzingQueryParser.java @@ -64,7 +64,7 @@ public class TestAnalyzingQueryParser extends LuceneTestCase { "Mötley Crüe Mötley~0.75 Crüe~0.5", "Renée Zellweger Renée~0.9 Zellweger~" }; fuzzyExpected = new String[] { "ubersetzung ubersetzung~0.9", - "motley crue motley~0.75 crue~0.5", "renee zellweger renee~0.9 zellweger~0.5" }; + "motley crue motley~0.75 crue~0.5", "renee zellweger renee~0.9 zellweger~2.0" }; a = new ASCIIAnalyzer(); } diff --git a/lucene/contrib/queryparser/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java b/lucene/contrib/queryparser/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java index ab4c472e76d..32654252035 100644 --- a/lucene/contrib/queryparser/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java +++ b/lucene/contrib/queryparser/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java @@ -543,10 +543,10 @@ public class TestPrecedenceQueryParser extends LuceneTestCase { assertQueryEquals("a:b\\\\?c", a, "a:b\\?c"); - assertQueryEquals("a:b\\-c~", a, "a:b-c~0.5"); - assertQueryEquals("a:b\\+c~", a, "a:b+c~0.5"); - assertQueryEquals("a:b\\:c~", a, "a:b:c~0.5"); - assertQueryEquals("a:b\\\\c~", a, "a:b\\c~0.5"); + assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0"); + 
assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0"); + assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0"); + assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0"); assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]"); assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]"); diff --git a/lucene/src/java/org/apache/lucene/queryParser/CharStream.java b/lucene/src/java/org/apache/lucene/queryParser/CharStream.java index 9e546d50a41..4423996dadf 100644 --- a/lucene/src/java/org/apache/lucene/queryParser/CharStream.java +++ b/lucene/src/java/org/apache/lucene/queryParser/CharStream.java @@ -109,4 +109,4 @@ public interface CharStream { void Done(); } -/* JavaCC - OriginalChecksum=a83909a2403f969f94d18375f9f143e4 (do not edit this line) */ +/* JavaCC - OriginalChecksum=32a89423891f765dde472f7ef0e3ef7b (do not edit this line) */ diff --git a/lucene/src/java/org/apache/lucene/queryParser/ParseException.java b/lucene/src/java/org/apache/lucene/queryParser/ParseException.java index fdb47847a38..b48a44644d8 100644 --- a/lucene/src/java/org/apache/lucene/queryParser/ParseException.java +++ b/lucene/src/java/org/apache/lucene/queryParser/ParseException.java @@ -195,4 +195,4 @@ public class ParseException extends Exception { } } -/* JavaCC - OriginalChecksum=c63b396885c4ff44d7aa48d3feae60cd (do not edit this line) */ +/* JavaCC - OriginalChecksum=c7631a240f7446940695eac31d9483ca (do not edit this line) */ diff --git a/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java b/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java index ef824b3f13c..8f0a1f2dd37 100644 --- a/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java +++ b/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java @@ -269,7 +269,7 @@ public class QueryParser implements QueryParserConstants { /** * Set the minimum similarity for fuzzy queries. - * Default is 0.5f. + * Default is 2f. 
*/ public void setFuzzyMinSim(float fuzzyMinSim) { this.fuzzyMinSim = fuzzyMinSim; @@ -1446,8 +1446,10 @@ public class QueryParser implements QueryParserConstants { try { fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue(); } catch (Exception ignored) { } - if(fms < 0.0f || fms > 1.0f){ + if(fms < 0.0f){ {if (true) throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !");} + } else if (fms >= 1.0f && fms != (int) fms) { + {if (true) throw new ParseException("Fractional edit distances are not allowed!");} } q = getFuzzyQuery(field, termImage,fms); } else { diff --git a/lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj b/lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj index 36f416ac926..39d7c561239 100644 --- a/lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj +++ b/lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj @@ -293,7 +293,7 @@ public class QueryParser { /** * Set the minimum similarity for fuzzy queries. - * Default is 0.5f. + * Default is 2f. 
*/ public void setFuzzyMinSim(float fuzzyMinSim) { this.fuzzyMinSim = fuzzyMinSim; @@ -1412,8 +1412,10 @@ Query Term(String field) : { try { fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue(); } catch (Exception ignored) { } - if(fms < 0.0f || fms > 1.0f){ + if(fms < 0.0f){ throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !"); + } else if (fms >= 1.0f && fms != (int) fms) { + throw new ParseException("Fractional edit distances are not allowed!"); } q = getFuzzyQuery(field, termImage,fms); } else { diff --git a/lucene/src/java/org/apache/lucene/queryParser/Token.java b/lucene/src/java/org/apache/lucene/queryParser/Token.java index 97677981cd7..2c665d6ab22 100644 --- a/lucene/src/java/org/apache/lucene/queryParser/Token.java +++ b/lucene/src/java/org/apache/lucene/queryParser/Token.java @@ -121,4 +121,4 @@ public class Token { } } -/* JavaCC - OriginalChecksum=37b1923f964a5a434f5ea3d6952ff200 (do not edit this line) */ +/* JavaCC - OriginalChecksum=c147cc166a7cf8812c7c39bc8c5eb868 (do not edit this line) */ diff --git a/lucene/src/java/org/apache/lucene/queryParser/TokenMgrError.java b/lucene/src/java/org/apache/lucene/queryParser/TokenMgrError.java index a3c46b70cab..b4ffd429b2b 100644 --- a/lucene/src/java/org/apache/lucene/queryParser/TokenMgrError.java +++ b/lucene/src/java/org/apache/lucene/queryParser/TokenMgrError.java @@ -138,4 +138,4 @@ public class TokenMgrError extends Error this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason); } } -/* JavaCC - OriginalChecksum=334e679cf1a88b3070bb8e3d80ee3f5e (do not edit this line) */ +/* JavaCC - OriginalChecksum=1c94e13236c7e0121e49427992341ee3 (do not edit this line) */ diff --git a/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java b/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java index 4e7fc1231e6..f559728b759 100644 --- a/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java +++ 
b/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java @@ -21,16 +21,13 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.automaton.LevenshteinAutomata; import java.io.IOException; /** Implements the fuzzy search query. The similarity measurement * is based on the Levenshtein (edit distance) algorithm. * - *

Warning: this query is not very scalable with its default prefix - * length of 0 - in this case, *every* term will be enumerated and - * cause an edit score calculation. - * *

This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite} * as default. So terms will be collected and scored according to their * edit distance. Only the top terms are used for building the {@link BooleanQuery}. @@ -38,9 +35,9 @@ import java.io.IOException; */ public class FuzzyQuery extends MultiTermQuery { - public final static float defaultMinSimilarity = 0.5f; + public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; public final static int defaultPrefixLength = 0; - public final static int defaultMaxExpansions = Integer.MAX_VALUE; + public final static int defaultMaxExpansions = 50; private float minimumSimilarity; private int prefixLength; @@ -60,6 +57,12 @@ public class FuzzyQuery extends MultiTermQuery { * minimumSimilarity of 0.5 a term of the same length * as the query term is considered similar to the query term if the edit distance * between both terms is less than length(term)*0.5 + *

+ * Alternatively, if minimumSimilarity is >= 1f, it is interpreted + * as a pure Levenshtein edit distance. For example, a value of 2f + * will match all terms within an edit distance of 2 from the + * query term. Edit distances specified in this way may not be fractional. + * * @param prefixLength length of common (non-fuzzy) prefix * @param maxExpansions the maximum number of terms to match. If this number is * greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten, @@ -72,9 +75,9 @@ public class FuzzyQuery extends MultiTermQuery { super(term.field()); this.term = term; - if (minimumSimilarity >= 1.0f) - throw new IllegalArgumentException("minimumSimilarity >= 1"); - else if (minimumSimilarity < 0.0f) + if (minimumSimilarity >= 1.0f && minimumSimilarity != (int)minimumSimilarity) + throw new IllegalArgumentException("fractional edit distances are not allowed"); + if (minimumSimilarity < 0.0f) throw new IllegalArgumentException("minimumSimilarity < 0"); if (prefixLength < 0) throw new IllegalArgumentException("prefixLength < 0"); @@ -84,7 +87,8 @@ public class FuzzyQuery extends MultiTermQuery { setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions)); String text = term.text(); - if (text.codePointCount(0, text.length()) > 1.0f / (1.0f - minimumSimilarity)) { + int len = text.codePointCount(0, text.length()); + if (len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity))) { this.termLongEnough = true; } @@ -93,21 +97,21 @@ public class FuzzyQuery extends MultiTermQuery { } /** - * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, prefixLength, Integer.MAX_VALUE)}. + * Calls {@link #FuzzyQuery(Term, float, int, int) FuzzyQuery(term, minimumSimilarity, prefixLength, defaultMaxExpansions)}. 
*/ public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) { this(term, minimumSimilarity, prefixLength, defaultMaxExpansions); } /** - * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0, Integer.MAX_VALUE)}. + * Calls {@link #FuzzyQuery(Term, float, int, int) FuzzyQuery(term, minimumSimilarity, 0, defaultMaxExpansions)}. */ public FuzzyQuery(Term term, float minimumSimilarity) { this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions); } /** - * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0, Integer.MAX_VALUE)}. + * Calls {@link #FuzzyQuery(Term, float, int, int) FuzzyQuery(term, defaultMinSimilarity, 0, defaultMaxExpansions)}. */ public FuzzyQuery(Term term) { this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions); diff --git a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java index b7c84e912ec..0c38aa509be 100644 --- a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -59,6 +59,7 @@ public final class FuzzyTermsEnum extends TermsEnum { private final int termLength; private int maxEdits; + private final boolean raw; private List runAutomata; @@ -77,15 +78,15 @@ public final class FuzzyTermsEnum extends TermsEnum { * * @param reader Delivers terms. * @param term Pattern term. - * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f. + * @param minSimilarity Minimum required similarity for terms from the reader. * @param prefixLength Length of required common prefix. Default value is 0. 
* @throws IOException */ public FuzzyTermsEnum(IndexReader reader, Term term, final float minSimilarity, final int prefixLength) throws IOException { - if (minSimilarity >= 1.0f) - throw new IllegalArgumentException("minimumSimilarity cannot be greater than or equal to 1"); - else if (minSimilarity < 0.0f) + if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity) + throw new IllegalArgumentException("fractional edit distances are not allowed"); + if (minSimilarity < 0.0f) throw new IllegalArgumentException("minimumSimilarity cannot be less than 0"); if(prefixLength < 0) throw new IllegalArgumentException("prefixLength cannot be less than 0"); @@ -102,12 +103,19 @@ public final class FuzzyTermsEnum extends TermsEnum { //The prefix could be longer than the word. //It's kind of silly though. It means we must match the entire word. this.realPrefixLength = prefixLength > termLength ? termLength : prefixLength; - this.minSimilarity = minSimilarity; - this.scale_factor = 1.0f / (1.0f - minSimilarity); - - // calculate the maximum k edits for this similarity - maxEdits = initialMaxDistance(minSimilarity, termLength); - + // if minSimilarity >= 1, we treat it as number of edits + if (minSimilarity >= 1f) { + this.minSimilarity = 1 - (minSimilarity+1) / this.termLength; + maxEdits = (int) minSimilarity; + raw = true; + } else { + this.minSimilarity = minSimilarity; + // calculate the maximum k edits for this similarity + maxEdits = initialMaxDistance(this.minSimilarity, termLength); + raw = false; + } + this.scale_factor = 1.0f / (1.0f - this.minSimilarity); + TermsEnum subEnum = getAutomatonEnum(maxEdits, null); setEnum(subEnum != null ? 
subEnum : new LinearFuzzyTermsEnum()); @@ -176,15 +184,11 @@ public final class FuzzyTermsEnum extends TermsEnum { setEnum(newEnum); } } - // TODO, besides changing linear -> automaton, and swapping in a smaller - // automaton, we can also use this information to optimize the linear case - // itself: re-init maxDistances so the fast-fail happens for more terms due - // to the now stricter constraints. } // for some raw min similarity and input term length, the maximum # of edits private int initialMaxDistance(float minimumSimilarity, int termLen) { - return (int) ((1-minimumSimilarity) * termLen); + return (int) ((1D-minimumSimilarity) * termLen); } // for some number of edits, the maximum possible scaled boost @@ -442,7 +446,7 @@ public final class FuzzyTermsEnum extends TermsEnum { //which is 8-3 or more precisely Math.abs(3-8). //if our maximum edit distance is 4, then we can discard this word //without looking at it. - return 0.0f; + return Float.NEGATIVE_INFINITY; } // init matrix d @@ -473,7 +477,7 @@ public final class FuzzyTermsEnum extends TermsEnum { if (j > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater //the closest the target can be to the text is just too far away. //this target is leaving the party early. - return 0.0f; + return Float.NEGATIVE_INFINITY; } // copy current distance counts to 'previous row' distance counts: swap p and d @@ -501,7 +505,8 @@ public final class FuzzyTermsEnum extends TermsEnum { * @return the maximum levenshtein distance that we care about */ private int calculateMaxDistance(int m) { - return (int) ((1-minSimilarity) * (Math.min(text.length, m) + realPrefixLength)); + return raw ? 
maxEdits : Math.min(maxEdits, + (int)((1-minSimilarity) * (Math.min(text.length, m) + realPrefixLength))); } } } diff --git a/lucene/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java b/lucene/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java index 93c7c716a26..c6a3a3829af 100644 --- a/lucene/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java +++ b/lucene/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java @@ -87,7 +87,7 @@ public class TestMultiFieldQueryParser extends LuceneTestCase { assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString()); q = mfqp.parse("one~ two"); - assertEquals("(b:one~0.5 t:one~0.5) (b:two t:two)", q.toString()); + assertEquals("(b:one~2.0 t:one~2.0) (b:two t:two)", q.toString()); q = mfqp.parse("one~0.8 two^2"); assertEquals("(b:one~0.8 t:one~0.8) ((b:two t:two)^2.0)", q.toString()); @@ -274,7 +274,7 @@ public class TestMultiFieldQueryParser extends LuceneTestCase { q = parser.parse("bla*"); assertEquals("f1:bla* f2:bla* f3:bla*", q.toString()); q = parser.parse("bla~"); - assertEquals("f1:bla~0.5 f2:bla~0.5 f3:bla~0.5", q.toString()); + assertEquals("f1:bla~2.0 f2:bla~2.0 f3:bla~2.0", q.toString()); q = parser.parse("[a TO c]"); assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString()); } diff --git a/lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java b/lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java index 119e842252d..e8baa599326 100644 --- a/lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java +++ b/lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java @@ -431,10 +431,10 @@ public class TestQueryParser extends LuceneTestCase { public void testWildcard() throws Exception { assertQueryEquals("term*", null, "term*"); assertQueryEquals("term*^2", null, "term*^2.0"); - assertQueryEquals("term~", null, "term~0.5"); + assertQueryEquals("term~", null, "term~2.0"); assertQueryEquals("term~0.7", 
null, "term~0.7"); - assertQueryEquals("term~^2", null, "term~0.5^2.0"); - assertQueryEquals("term^2~", null, "term~0.5^2.0"); + assertQueryEquals("term~^3", null, "term~2.0^3.0"); + assertQueryEquals("term^3~", null, "term~2.0^3.0"); assertQueryEquals("term*germ", null, "term*germ"); assertQueryEquals("term*germ^3", null, "term*germ^3.0"); @@ -446,7 +446,7 @@ public class TestQueryParser extends LuceneTestCase { assertEquals(0.7f, fq.getMinSimilarity(), 0.1f); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); fq = (FuzzyQuery)getQuery("term~", null); - assertEquals(0.5f, fq.getMinSimilarity(), 0.1f); + assertEquals(2.0f, fq.getMinSimilarity(), 0.1f); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); assertParseException("term~1.1"); // value > 1, throws exception @@ -481,9 +481,9 @@ public class TestQueryParser extends LuceneTestCase { assertWildcardQueryEquals("TE?M", false, "TE?M"); assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM"); // Fuzzy queries: - assertWildcardQueryEquals("Term~", "term~0.5"); - assertWildcardQueryEquals("Term~", true, "term~0.5"); - assertWildcardQueryEquals("Term~", false, "Term~0.5"); + assertWildcardQueryEquals("Term~", "term~2.0"); + assertWildcardQueryEquals("Term~", true, "term~2.0"); + assertWildcardQueryEquals("Term~", false, "Term~2.0"); // Range queries: assertWildcardQueryEquals("[A TO C]", "[a TO c]"); assertWildcardQueryEquals("[A TO C]", true, "[a TO c]"); @@ -761,10 +761,10 @@ public class TestQueryParser extends LuceneTestCase { assertQueryEquals("a:b\\\\?c", a, "a:b\\?c"); - assertQueryEquals("a:b\\-c~", a, "a:b-c~0.5"); - assertQueryEquals("a:b\\+c~", a, "a:b+c~0.5"); - assertQueryEquals("a:b\\:c~", a, "a:b:c~0.5"); - assertQueryEquals("a:b\\\\c~", a, "a:b\\c~0.5"); + assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0"); + assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0"); + assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0"); + assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0"); 
assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]"); assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]"); diff --git a/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java index 08bc01b2c83..27180cd9238 100644 --- a/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java @@ -202,58 +202,58 @@ public class TestFuzzyQuery extends LuceneTestCase { FuzzyQuery query; // not similar enough: - query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "xxxxx"), 0.5f, 0); ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); // edit distance to "aaaaaaa" = 3, this matches because the string is longer than // in testDefaultFuzziness so a bigger difference is allowed: - query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa")); // now with prefix - query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 1); + query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 1); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa")); - query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 4); + query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 4); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa")); - query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 5); + query = new FuzzyQuery(new 
Term("field", "aaaaccc"), 0.5f, 5); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); // no match, more than half of the characters is wrong: - query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "aaacccc"), 0.5f, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); // now with prefix - query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 2); + query = new FuzzyQuery(new Term("field", "aaacccc"), 0.5f, 2); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); // "student" and "stellent" are indeed similar to "segment" by default: - query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "student"), 0.5f, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); - query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); // now with prefix - query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 1); + query = new FuzzyQuery(new Term("field", "student"), 0.5f, 1); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); - query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 1); + query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 1); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); - query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 2); + query = new FuzzyQuery(new Term("field", "student"), 0.5f, 2); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); - query = new FuzzyQuery(new 
Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 2); + query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 2); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); @@ -328,7 +328,7 @@ public class TestFuzzyQuery extends LuceneTestCase { IndexSearcher searcher = new IndexSearcher(reader); writer.close(); - FuzzyQuery query = new FuzzyQuery(new Term("field", "Lucene")); + FuzzyQuery query = new FuzzyQuery(new Term("field", "lucene")); query.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite()); ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(3, hits.length); @@ -378,6 +378,54 @@ public class TestFuzzyQuery extends LuceneTestCase { r.close(); index.close(); } + + public void testDistanceAsEditsParsing() throws Exception { + QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer()); + FuzzyQuery q = (FuzzyQuery) qp.parse("foobar~2"); + assertEquals(2f, q.getMinSimilarity(), 0.0001f); + } + + public void testDistanceAsEditsSearching() throws Exception { + Directory index = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random, index); + addDoc("foobar", w); + addDoc("test", w); + addDoc("working", w); + IndexReader reader = w.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + w.close(); + QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer()); + + FuzzyQuery q = (FuzzyQuery) qp.parse("fouba~2"); + ScoreDoc[] hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("foobar", searcher.doc(hits[0].doc).get("field")); + + q = (FuzzyQuery) qp.parse("foubara~2"); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("foobar", searcher.doc(hits[0].doc).get("field")); + + q = (FuzzyQuery) qp.parse("t~3"); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("test", 
searcher.doc(hits[0].doc).get("field")); + + q = new FuzzyQuery(new Term("field", "a"), 4f, 0, 50); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("test", searcher.doc(hits[0].doc).get("field")); + + q = new FuzzyQuery(new Term("field", "a"), 6f, 0, 50); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(2, hits.length); + assertEquals("test", searcher.doc(hits[0].doc).get("field")); + assertEquals("foobar", searcher.doc(hits[1].doc).get("field")); + + searcher.close(); + reader.close(); + index.close(); + } private void addDoc(String text, RandomIndexWriter writer) throws IOException { Document doc = new Document();