LUCENE-1224: Short circuit FuzzyQuery.rewrite when input token length is small compared to minSimilarity.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@735517 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2009-01-18 18:24:14 +00:00
parent 8c58de6092
commit 8bf2fda5bd
3 changed files with 45 additions and 1 deletions

View File

@ -1,4 +1,4 @@
Lucene Change Log Lucene Change Log
$Id$ $Id$
======================= Trunk (not yet released) ======================= ======================= Trunk (not yet released) =======================
@ -141,6 +141,9 @@ Optimizations
3. LUCENE-1484: Remove synchronization of IndexReader.document() by 3. LUCENE-1484: Remove synchronization of IndexReader.document() by
using CloseableThreadLocal internally. (Jason Rutherglen via Mike using CloseableThreadLocal internally. (Jason Rutherglen via Mike
McCandless). McCandless).
4. LUCENE-1224: Short circuit FuzzyQuery.rewrite when input token length
is small compared to minSimilarity. (Timo Nentwig, Mark Miller)
Documentation Documentation

View File

@ -34,6 +34,7 @@ public class FuzzyQuery extends MultiTermQuery {
private float minimumSimilarity; private float minimumSimilarity;
private int prefixLength; private int prefixLength;
private boolean termLongEnough = false;
/** /**
* Create a new FuzzyQuery that will match terms with a similarity * Create a new FuzzyQuery that will match terms with a similarity
@ -61,6 +62,10 @@ public class FuzzyQuery extends MultiTermQuery {
if (prefixLength < 0) if (prefixLength < 0)
throw new IllegalArgumentException("prefixLength < 0"); throw new IllegalArgumentException("prefixLength < 0");
if (term.text().length() > 1.0f / (1.0f - minimumSimilarity)) {
this.termLongEnough = true;
}
this.minimumSimilarity = minimumSimilarity; this.minimumSimilarity = minimumSimilarity;
this.prefixLength = prefixLength; this.prefixLength = prefixLength;
} }
@ -105,6 +110,10 @@ public class FuzzyQuery extends MultiTermQuery {
} }
public Query rewrite(IndexReader reader) throws IOException { public Query rewrite(IndexReader reader) throws IOException {
if(!termLongEnough) { // can't match
return new BooleanQuery();
}
FilteredTermEnum enumerator = getEnum(reader); FilteredTermEnum enumerator = getEnum(reader);
int maxClauseCount = BooleanQuery.getMaxClauseCount(); int maxClauseCount = BooleanQuery.getMaxClauseCount();
ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount); ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);

View File

@ -249,6 +249,38 @@ public class TestFuzzyQuery extends LuceneTestCase {
directory.close(); directory.close();
} }
/**
 * Exercises the FuzzyQuery short-circuit for terms that are too short to
 * satisfy the requested minimum similarity.
 *
 * FuzzyQuery only enumerates terms when the query term's length is greater
 * than 1 / (1 - minimumSimilarity); otherwise rewrite() returns an empty
 * BooleanQuery. With a minimum similarity of 0.9f that threshold is about
 * 10 characters.
 *
 * @throws IOException if the in-memory index cannot be written or searched
 */
public void testTokenLengthOpt() throws IOException {
  RAMDirectory directory = new RAMDirectory();
  IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(),
      true, IndexWriter.MaxFieldLength.LIMITED);
  addDoc("12345678911", writer);
  addDoc("segment", writer);
  writer.optimize();
  writer.close();

  IndexSearcher searcher = new IndexSearcher(directory);
  Query query;

  // term not over 10 chars, so the optimization short-circuits the rewrite
  query = new FuzzyQuery(new Term("field", "1234569"), 0.9f);
  ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // exactly 10 chars: the original author expects no short-circuit here,
  // and no hit either
  query = new FuzzyQuery(new Term("field", "1234567891"), 0.9f);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // over 10 chars, so no optimization; identical to an indexed term
  query = new FuzzyQuery(new Term("field", "12345678911"), 0.9f);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);

  // over 10 chars, but nothing in the index resembles it
  query = new FuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // Release the searcher and the directory, as the neighbouring test does
  // for its directory; the original left both open.
  searcher.close();
  directory.close();
}
private void addDoc(String text, IndexWriter writer) throws IOException { private void addDoc(String text, IndexWriter writer) throws IOException {
Document doc = new Document(); Document doc = new Document();
doc.add(new Field("field", text, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("field", text, Field.Store.YES, Field.Index.ANALYZED));