mirror of https://github.com/apache/lucene.git
LUCENE-3527: Implement getDistance() on DirectSpellChecker.INTERNAL_LEVENSHTEIN
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1195081 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
dcc51aebd9
commit
f5ee5914b0
|
@ -51,6 +51,10 @@ New Features
|
|||
|
||||
* LUCENE-2507: Added DirectSpellChecker, which retrieves correction candidates directly
|
||||
from the term dictionary using levenshtein automata. (Robert Muir)
|
||||
|
||||
* LUCENE-3527: Add LuceneLevenshteinDistance, which computes string distance in a compatible
|
||||
way as DirectSpellChecker. This can be used to merge top-N results from more than one
|
||||
SpellChecker. (James Dyer via Robert Muir)
|
||||
|
||||
API Changes
|
||||
|
||||
|
|
|
@ -70,10 +70,7 @@ public class DirectSpellChecker {
|
|||
* shortest of the two terms instead of the longest.
|
||||
* </ul>
|
||||
*/
|
||||
public static final StringDistance INTERNAL_LEVENSHTEIN = new StringDistance() {
|
||||
public float getDistance(String s1, String s2) {
|
||||
throw new UnsupportedOperationException("Not for external use.");
|
||||
}};
|
||||
public static final StringDistance INTERNAL_LEVENSHTEIN = new LuceneLevenshteinDistance();
|
||||
|
||||
/** maximum edit distance for candidate terms */
|
||||
private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
package org.apache.lucene.search.spell;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
|
||||
/**
|
||||
* Levenshtein implemented in a consistent way as Lucene's FuzzyTermsEnum.
|
||||
*
|
||||
* Note also that this metric differs in subtle ways from {@link LevensteinDistance}:
|
||||
* <ul>
|
||||
* <li> This metric treats full unicode codepoints as characters, but
|
||||
* LevenshteinDistance calculates based on UTF-16 code units.
|
||||
* <li> This metric scales raw edit distances into a floating point score
|
||||
* differently than LevenshteinDistance: the scaling is based upon the
|
||||
* shortest of the two terms instead of the longest.
|
||||
* </ul>
|
||||
*/
|
||||
public final class LuceneLevenshteinDistance implements StringDistance {
|
||||
|
||||
@Override
|
||||
public float getDistance(String target, String other) {
|
||||
IntsRef targetPoints;
|
||||
IntsRef otherPoints;
|
||||
int n;
|
||||
int p[]; //'previous' cost array, horizontally
|
||||
int d[]; // cost array, horizontally
|
||||
int _d[]; //placeholder to assist in swapping p and d
|
||||
|
||||
// cheaper to do this up front once
|
||||
targetPoints = toIntsRef(target);
|
||||
otherPoints = toIntsRef(other);
|
||||
n = targetPoints.length;
|
||||
p = new int[n+1];
|
||||
d = new int[n+1];
|
||||
|
||||
final int m = otherPoints.length;
|
||||
if (n == 0 || m == 0) {
|
||||
if (n == m) {
|
||||
return 1;
|
||||
}
|
||||
else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// indexes into strings s and t
|
||||
int i; // iterates through s
|
||||
int j; // iterates through t
|
||||
|
||||
int t_j; // jth character of t
|
||||
|
||||
int cost; // cost
|
||||
|
||||
for (i = 0; i <= n; i++) {
|
||||
p[i] = i;
|
||||
}
|
||||
|
||||
for (j = 1; j <= m; j++) {
|
||||
t_j = otherPoints.ints[j - 1];
|
||||
d[0] = j;
|
||||
|
||||
for (i=1; i <= n; i++) {
|
||||
cost = targetPoints.ints[i - 1] == t_j ? 0 : 1;
|
||||
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
|
||||
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
|
||||
}
|
||||
|
||||
// copy current distance counts to 'previous row' distance counts
|
||||
_d = p;
|
||||
p = d;
|
||||
d = _d;
|
||||
}
|
||||
|
||||
// our last action in the above loop was to switch d and p, so p now
|
||||
// actually has the most recent cost counts
|
||||
return 1.0f - ((float) p[n] / Math.min(m, n));
|
||||
}
|
||||
|
||||
private static IntsRef toIntsRef(String s) {
|
||||
IntsRef ref = new IntsRef(s.length()); // worst case
|
||||
int utf16Len = s.length();
|
||||
for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
|
||||
cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
|
||||
}
|
||||
return ref;
|
||||
}
|
||||
}
|
|
@ -29,7 +29,36 @@ import org.apache.lucene.util.English;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestDirectSpellChecker extends LuceneTestCase {
|
||||
|
||||
public void testInternalLevenshteinDistance() throws Exception {
|
||||
DirectSpellChecker spellchecker = new DirectSpellChecker();
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random, dir,
|
||||
new MockAnalyzer(random, MockTokenizer.KEYWORD, true));
|
||||
|
||||
String[] termsToAdd = { "metanoia", "metanoian", "metanoiai", "metanoias", "metanoi𐑍" };
|
||||
for (int i = 0; i < termsToAdd.length; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(newField("repentance", termsToAdd[i], TextField.TYPE_UNSTORED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
IndexReader ir = writer.getReader();
|
||||
String misspelled = "metanoix";
|
||||
SuggestWord[] similar = spellchecker.suggestSimilar(new Term("repentance", misspelled), 4, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||
assertTrue(similar.length == 4);
|
||||
|
||||
StringDistance sd = spellchecker.getDistance();
|
||||
assertTrue(sd instanceof LuceneLevenshteinDistance);
|
||||
for(SuggestWord word : similar) {
|
||||
assertTrue(word.score==sd.getDistance(word.string, misspelled));
|
||||
assertTrue(word.score==sd.getDistance(misspelled, word.string));
|
||||
}
|
||||
|
||||
ir.close();
|
||||
writer.close();
|
||||
dir.close();
|
||||
}
|
||||
public void testSimpleExamples() throws Exception {
|
||||
DirectSpellChecker spellChecker = new DirectSpellChecker();
|
||||
spellChecker.setMinQueryLength(0);
|
||||
|
|
Loading…
Reference in New Issue