mirror of https://github.com/apache/lucene.git
You can now set the required fuzziness of FuzzyQuery. Note that QueryParser does not support this (yet?).
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150422 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c65cb9a931
commit
cd2ca90309
|
@ -20,14 +20,43 @@ import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
/** Implements the fuzzy search query */
|
/** Implements the fuzzy search query. The similarity measurement
|
||||||
|
* is based on the Levenshtein (edit distance) algorithm.
|
||||||
|
*/
|
||||||
public final class FuzzyQuery extends MultiTermQuery {
|
public final class FuzzyQuery extends MultiTermQuery {
|
||||||
public FuzzyQuery(Term term) {
|
|
||||||
|
private float minimumSimilarity;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new FuzzyQuery that will match terms with a similarity
|
||||||
|
* of at least <code>minimumSimilarity</code> to <code>term</code>.
|
||||||
|
*
|
||||||
|
* @param term the term to search for
|
||||||
|
* @param minimumSimilarity a value between 0 and 1 to set the required similarity
|
||||||
|
* between the query term and the matching terms. For example, for a
|
||||||
|
* <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
|
||||||
|
* as the query term is considered similar to the query term if the edit distance
|
||||||
|
* between both terms is less than <code>length(term)*0.5</code>.
|
||||||
|
* @throws IllegalArgumentException if minimumSimilarity is greater than 1 or less than 0
|
||||||
|
*/
|
||||||
|
public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
|
||||||
super(term);
|
super(term);
|
||||||
|
if (minimumSimilarity > 1.0f)
|
||||||
|
throw new IllegalArgumentException("minimumSimilarity > 1");
|
||||||
|
else if (minimumSimilarity < 0.0f)
|
||||||
|
throw new IllegalArgumentException("minimumSimilarity < 0");
|
||||||
|
this.minimumSimilarity = minimumSimilarity;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f)}.
|
||||||
|
*/
|
||||||
|
public FuzzyQuery(Term term) {
|
||||||
|
this(term, 0.5f);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
|
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
|
||||||
return new FuzzyTermEnum(reader, getTerm());
|
return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toString(String field) {
|
public String toString(String field) {
|
||||||
|
|
|
@ -26,16 +26,24 @@ import org.apache.lucene.index.Term;
|
||||||
the enumeration is greater than all that precede it. */
|
the enumeration is greater than all that precede it. */
|
||||||
public final class FuzzyTermEnum extends FilteredTermEnum {
|
public final class FuzzyTermEnum extends FilteredTermEnum {
|
||||||
double distance;
|
double distance;
|
||||||
boolean fieldMatch = false;
|
|
||||||
boolean endEnum = false;
|
boolean endEnum = false;
|
||||||
|
|
||||||
Term searchTerm = null;
|
Term searchTerm = null;
|
||||||
String field = "";
|
String field = "";
|
||||||
String text = "";
|
String text = "";
|
||||||
int textlen;
|
int textlen;
|
||||||
|
float minimumSimilarity;
|
||||||
|
double scale_factor;
|
||||||
|
|
||||||
|
|
||||||
public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
|
public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
|
||||||
|
this(reader, term, 0.5f);
|
||||||
|
}
|
||||||
|
|
||||||
|
public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException {
|
||||||
super();
|
super();
|
||||||
|
minimumSimilarity = minSimilarity;
|
||||||
|
scale_factor = 1.0f / (1.0f - minimumSimilarity);
|
||||||
searchTerm = term;
|
searchTerm = term;
|
||||||
field = searchTerm.field();
|
field = searchTerm.field();
|
||||||
text = searchTerm.text();
|
text = searchTerm.text();
|
||||||
|
@ -53,14 +61,14 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
|
||||||
int targetlen = target.length();
|
int targetlen = target.length();
|
||||||
int dist = editDistance(text, target, textlen, targetlen);
|
int dist = editDistance(text, target, textlen, targetlen);
|
||||||
distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
|
distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
|
||||||
return (distance > FUZZY_THRESHOLD);
|
return (distance > minimumSimilarity);
|
||||||
}
|
}
|
||||||
endEnum = true;
|
endEnum = true;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected final float difference() {
|
protected final float difference() {
|
||||||
return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR);
|
return (float)((distance - minimumSimilarity) * scale_factor);
|
||||||
}
|
}
|
||||||
|
|
||||||
public final boolean endEnum() {
|
public final boolean endEnum() {
|
||||||
|
@ -71,9 +79,6 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
|
||||||
* Compute Levenshtein distance
|
* Compute Levenshtein distance
|
||||||
******************************/
|
******************************/
|
||||||
|
|
||||||
public static final double FUZZY_THRESHOLD = 0.5;
|
|
||||||
public static final double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Finds and returns the smallest of three integers
|
Finds and returns the smallest of three integers
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -33,7 +33,7 @@ import org.apache.lucene.store.RAMDirectory;
|
||||||
*/
|
*/
|
||||||
public class TestFuzzyQuery extends TestCase {
|
public class TestFuzzyQuery extends TestCase {
|
||||||
|
|
||||||
public void testDefaultFuzziness() throws Exception {
|
public void testFuzziness() throws Exception {
|
||||||
RAMDirectory directory = new RAMDirectory();
|
RAMDirectory directory = new RAMDirectory();
|
||||||
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
|
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
|
||||||
addDoc("aaaaa", writer);
|
addDoc("aaaaa", writer);
|
||||||
|
@ -90,7 +90,7 @@ public class TestFuzzyQuery extends TestCase {
|
||||||
directory.close();
|
directory.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testDefaultFuzzinessLong() throws Exception {
|
public void testFuzzinessLong() throws Exception {
|
||||||
RAMDirectory directory = new RAMDirectory();
|
RAMDirectory directory = new RAMDirectory();
|
||||||
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
|
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
|
||||||
addDoc("aaaaaaa", writer);
|
addDoc("aaaaaaa", writer);
|
||||||
|
@ -124,6 +124,24 @@ public class TestFuzzyQuery extends TestCase {
|
||||||
hits = searcher.search(query);
|
hits = searcher.search(query);
|
||||||
assertEquals(1, hits.length());
|
assertEquals(1, hits.length());
|
||||||
|
|
||||||
|
// "student" doesn't match anymore thanks to increased minimum similarity:
|
||||||
|
query = new FuzzyQuery(new Term("field", "student"), 0.6f);
|
||||||
|
hits = searcher.search(query);
|
||||||
|
assertEquals(0, hits.length());
|
||||||
|
|
||||||
|
try {
|
||||||
|
query = new FuzzyQuery(new Term("field", "student"), 1.1f);
|
||||||
|
fail("Expected IllegalArgumentException");
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
// expecting exception
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
query = new FuzzyQuery(new Term("field", "student"), -0.1f);
|
||||||
|
fail("Expected IllegalArgumentException");
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
// expecting exception
|
||||||
|
}
|
||||||
|
|
||||||
searcher.close();
|
searcher.close();
|
||||||
directory.close();
|
directory.close();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue