mirror of https://github.com/apache/lucene.git
You can now set the required fuzziness of FuzzyQuery. Note that QueryParser does not support this (yet?).
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150422 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c65cb9a931
commit
cd2ca90309
|
@ -20,14 +20,43 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.Term;
|
||||
import java.io.IOException;
|
||||
|
||||
/** Implements the fuzzy search query */
|
||||
/** Implements the fuzzy search query. The similarity measurement
|
||||
* is based on the Levenshtein (edit distance) algorithm.
|
||||
*/
|
||||
public final class FuzzyQuery extends MultiTermQuery {
|
||||
public FuzzyQuery(Term term) {
|
||||
|
||||
private float minimumSimilarity;
|
||||
|
||||
/**
|
||||
* Create a new FuzzyQuery that will match terms with a similarity
|
||||
* of at least <code>minimumSimilarity</code> to <code>term</code>.
|
||||
*
|
||||
* @param term the term to search for
|
||||
* @param minimumSimilarity a value between 0 and 1 to set the required similarity
|
||||
* between the query term and the matching terms. For example, for a
|
||||
* <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
|
||||
* as the query term is considered similar to the query term if the edit distance
|
||||
* between both terms is less than <code>length(term)*0.5</code>.
|
||||
* @throws IllegalArgumentException if minimumSimilarity is &gt; 1 or &lt; 0
|
||||
*/
|
||||
public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
|
||||
super(term);
|
||||
if (minimumSimilarity > 1.0f)
|
||||
throw new IllegalArgumentException("minimumSimilarity > 1");
|
||||
else if (minimumSimilarity < 0.0f)
|
||||
throw new IllegalArgumentException("minimumSimilarity < 0");
|
||||
this.minimumSimilarity = minimumSimilarity;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f)}.
|
||||
*/
|
||||
public FuzzyQuery(Term term) {
|
||||
this(term, 0.5f);
|
||||
}
|
||||
|
||||
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
|
||||
return new FuzzyTermEnum(reader, getTerm());
|
||||
return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity);
|
||||
}
|
||||
|
||||
public String toString(String field) {
|
||||
|
|
|
@ -26,16 +26,24 @@ import org.apache.lucene.index.Term;
|
|||
the enumeration is greater than all that precede it. */
|
||||
public final class FuzzyTermEnum extends FilteredTermEnum {
|
||||
double distance;
|
||||
boolean fieldMatch = false;
|
||||
boolean endEnum = false;
|
||||
|
||||
Term searchTerm = null;
|
||||
String field = "";
|
||||
String text = "";
|
||||
int textlen;
|
||||
float minimumSimilarity;
|
||||
double scale_factor;
|
||||
|
||||
|
||||
public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
|
||||
this(reader, term, 0.5f);
|
||||
}
|
||||
|
||||
public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException {
|
||||
super();
|
||||
minimumSimilarity = minSimilarity;
|
||||
scale_factor = 1.0f / (1.0f - minimumSimilarity);
|
||||
searchTerm = term;
|
||||
field = searchTerm.field();
|
||||
text = searchTerm.text();
|
||||
|
@ -53,14 +61,14 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
|
|||
int targetlen = target.length();
|
||||
int dist = editDistance(text, target, textlen, targetlen);
|
||||
distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
|
||||
return (distance > FUZZY_THRESHOLD);
|
||||
return (distance > minimumSimilarity);
|
||||
}
|
||||
endEnum = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
protected final float difference() {
|
||||
return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR);
|
||||
return (float)((distance - minimumSimilarity) * scale_factor);
|
||||
}
|
||||
|
||||
public final boolean endEnum() {
|
||||
|
@ -71,9 +79,6 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
|
|||
* Compute Levenshtein distance
|
||||
******************************/
|
||||
|
||||
public static final double FUZZY_THRESHOLD = 0.5;
|
||||
public static final double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD);
|
||||
|
||||
/**
|
||||
Finds and returns the smallest of three integers
|
||||
*/
|
||||
|
|
|
@ -33,7 +33,7 @@ import org.apache.lucene.store.RAMDirectory;
|
|||
*/
|
||||
public class TestFuzzyQuery extends TestCase {
|
||||
|
||||
public void testDefaultFuzziness() throws Exception {
|
||||
public void testFuzziness() throws Exception {
|
||||
RAMDirectory directory = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
|
||||
addDoc("aaaaa", writer);
|
||||
|
@ -90,7 +90,7 @@ public class TestFuzzyQuery extends TestCase {
|
|||
directory.close();
|
||||
}
|
||||
|
||||
public void testDefaultFuzzinessLong() throws Exception {
|
||||
public void testFuzzinessLong() throws Exception {
|
||||
RAMDirectory directory = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
|
||||
addDoc("aaaaaaa", writer);
|
||||
|
@ -124,6 +124,24 @@ public class TestFuzzyQuery extends TestCase {
|
|||
hits = searcher.search(query);
|
||||
assertEquals(1, hits.length());
|
||||
|
||||
// "student" doesn't match anymore thanks to increased minimum similarity:
|
||||
query = new FuzzyQuery(new Term("field", "student"), 0.6f);
|
||||
hits = searcher.search(query);
|
||||
assertEquals(0, hits.length());
|
||||
|
||||
try {
|
||||
query = new FuzzyQuery(new Term("field", "student"), 1.1f);
|
||||
fail("Expected IllegalArgumentException");
|
||||
} catch (IllegalArgumentException e) {
|
||||
// expecting exception
|
||||
}
|
||||
try {
|
||||
query = new FuzzyQuery(new Term("field", "student"), -0.1f);
|
||||
fail("Expected IllegalArgumentException");
|
||||
} catch (IllegalArgumentException e) {
|
||||
// expecting exception
|
||||
}
|
||||
|
||||
searcher.close();
|
||||
directory.close();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue