LUCENE-3527: Implement getDistance() on DirectSpellChecker.INTERNAL_LEVENSHTEIN

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1195081 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-10-30 00:36:32 +00:00
parent dcc51aebd9
commit f5ee5914b0
4 changed files with 138 additions and 4 deletions

View File

@ -52,6 +52,10 @@ New Features
* LUCENE-2507: Added DirectSpellChecker, which retrieves correction candidates directly
from the term dictionary using levenshtein automata. (Robert Muir)
* LUCENE-3527: Add LuceneLevenshteinDistance, which computes string distance in a compatible
way as DirectSpellChecker. This can be used to merge top-N results from more than one
SpellChecker. (James Dyer via Robert Muir)
API Changes
* LUCENE-2606: Changed RegexCapabilities interface to fix thread

View File

@ -70,10 +70,7 @@ public class DirectSpellChecker {
* shortest of the two terms instead of the longest.
* </ul>
*/
public static final StringDistance INTERNAL_LEVENSHTEIN = new StringDistance() {
public float getDistance(String s1, String s2) {
throw new UnsupportedOperationException("Not for external use.");
}};
public static final StringDistance INTERNAL_LEVENSHTEIN = new LuceneLevenshteinDistance();
/** maximum edit distance for candidate terms */
private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;

View File

@ -0,0 +1,104 @@
package org.apache.lucene.search.spell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.IntsRef;
/**
* Levenshtein implemented in a consistent way as Lucene's FuzzyTermsEnum.
*
* Note also that this metric differs in subtle ways from {@link LevensteinDistance}:
* <ul>
* <li> This metric treats full unicode codepoints as characters, but
* LevenshteinDistance calculates based on UTF-16 code units.
* <li> This metric scales raw edit distances into a floating point score
* differently than LevenshteinDistance: the scaling is based upon the
* shortest of the two terms instead of the longest.
* </ul>
*/
public final class LuceneLevenshteinDistance implements StringDistance {
@Override
public float getDistance(String target, String other) {
IntsRef targetPoints;
IntsRef otherPoints;
int n;
int p[]; //'previous' cost array, horizontally
int d[]; // cost array, horizontally
int _d[]; //placeholder to assist in swapping p and d
// cheaper to do this up front once
targetPoints = toIntsRef(target);
otherPoints = toIntsRef(other);
n = targetPoints.length;
p = new int[n+1];
d = new int[n+1];
final int m = otherPoints.length;
if (n == 0 || m == 0) {
if (n == m) {
return 1;
}
else {
return 0;
}
}
// indexes into strings s and t
int i; // iterates through s
int j; // iterates through t
int t_j; // jth character of t
int cost; // cost
for (i = 0; i <= n; i++) {
p[i] = i;
}
for (j = 1; j <= m; j++) {
t_j = otherPoints.ints[j - 1];
d[0] = j;
for (i=1; i <= n; i++) {
cost = targetPoints.ints[i - 1] == t_j ? 0 : 1;
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
}
// copy current distance counts to 'previous row' distance counts
_d = p;
p = d;
d = _d;
}
// our last action in the above loop was to switch d and p, so p now
// actually has the most recent cost counts
return 1.0f - ((float) p[n] / Math.min(m, n));
}
private static IntsRef toIntsRef(String s) {
IntsRef ref = new IntsRef(s.length()); // worst case
int utf16Len = s.length();
for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
}
return ref;
}
}

View File

@ -30,6 +30,35 @@ import org.apache.lucene.util.LuceneTestCase;
public class TestDirectSpellChecker extends LuceneTestCase {
public void testInternalLevenshteinDistance() throws Exception {
DirectSpellChecker spellchecker = new DirectSpellChecker();
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, dir,
new MockAnalyzer(random, MockTokenizer.KEYWORD, true));
String[] termsToAdd = { "metanoia", "metanoian", "metanoiai", "metanoias", "metanoi𐑍" };
for (int i = 0; i < termsToAdd.length; i++) {
Document doc = new Document();
doc.add(newField("repentance", termsToAdd[i], TextField.TYPE_UNSTORED));
writer.addDocument(doc);
}
IndexReader ir = writer.getReader();
String misspelled = "metanoix";
SuggestWord[] similar = spellchecker.suggestSimilar(new Term("repentance", misspelled), 4, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length == 4);
StringDistance sd = spellchecker.getDistance();
assertTrue(sd instanceof LuceneLevenshteinDistance);
for(SuggestWord word : similar) {
assertTrue(word.score==sd.getDistance(word.string, misspelled));
assertTrue(word.score==sd.getDistance(misspelled, word.string));
}
ir.close();
writer.close();
dir.close();
}
public void testSimpleExamples() throws Exception {
DirectSpellChecker spellChecker = new DirectSpellChecker();
spellChecker.setMinQueryLength(0);