LUCENE-3662: extend LevenshteinAutomata to support transpositions as a primitive edit

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1224817 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-12-26 21:18:36 +00:00
parent 17438c9eac
commit b096910a82
16 changed files with 716 additions and 146 deletions
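At its core the commit adds a boolean flag to LevenshteinAutomata that makes the generated DFA accept adjacent transpositions as single edits (Damerau-Levenshtein). A minimal sketch of the new two-argument constructor, assuming the classes from this revision are on the classpath; the demo class name and example terms are illustrative only:

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

public class TranspositionDemo {
  public static void main(String[] args) {
    // Without transpositions, "fvie" is two substitutions away from "five".
    LevenshteinAutomata lev  = new LevenshteinAutomata("five", false);
    // With transpositions (this commit), the swapped "vi" counts as one edit.
    LevenshteinAutomata levT = new LevenshteinAutomata("five", true);

    CharacterRunAutomaton within1  = new CharacterRunAutomaton(lev.toAutomaton(1));
    CharacterRunAutomaton within1T = new CharacterRunAutomaton(levT.toAutomaton(1));

    System.out.println(within1.run("fvie"));   // false: distance 2 under classic Levenshtein
    System.out.println(within1T.run("fvie"));  // true: distance 1 with transpositions
  }
}

In the diffs below, DirectSpellChecker switches to the transposition-aware automaton, while FuzzyQuery keeps transpositions off for now (see the TODO in its hunk).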

View File

@ -396,6 +396,10 @@ New features
and FuzzyQuery with finite-state methods. Adds RegexpQuery.
(Robert Muir, Mike McCandless, Uwe Schindler, Mark Miller)
* LUCENE-3662: Add support for levenshtein distance with transpositions
to LevenshteinAutomata, FuzzyTermsEnum, and DirectSpellChecker.
(Jean-Philippe Barrette-LaPierre, Robert Muir)
* LUCENE-2321: Cutover to a more RAM efficient packed-ints based
representation for the in-memory terms dict index. (Mike
McCandless)

View File

@ -478,10 +478,14 @@
<macrodef name="createLevAutomaton">
<attribute name="n"/>
<sequential>
<exec dir="src/java/org/apache/lucene/util/automaton"
<exec dir="src/java/org/apache/lucene/util/automaton"
executable="${python.exe}" failonerror="true">
<arg line="createLevAutomata.py @{n}"/>
<arg line="createLevAutomata.py @{n} True"/>
</exec>
<exec dir="src/java/org/apache/lucene/util/automaton"
executable="${python.exe}" failonerror="true">
<arg line="createLevAutomata.py @{n} False"/>
</exec>
</sequential>
</macrodef>

View File

@ -194,7 +194,7 @@
<property name="hg.exe" value="hg" />
<property name="moman.url" value="https://bitbucket.org/jpbarrette/moman" />
<property name="moman.rev" value="115" />
<property name="moman.rev" value="120" />
<property name="python.exe" value="python" />
<property name="gpg.exe" value="gpg" />

View File

@ -211,7 +211,7 @@ public class FuzzyLikeThisQuery extends Query
AttributeSource atts = new AttributeSource();
MaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
FuzzyTermsEnum fe = new FuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()), atts, startTerm, f.minSimilarity, f.prefixLength);
FuzzyTermsEnum fe = new FuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()), atts, startTerm, f.minSimilarity, f.prefixLength, false);
//store the df so all variants use same idf
int df = reader.docFreq(startTerm);
int numVariants=0;

View File

@ -141,7 +141,10 @@ public class FuzzyQuery extends MultiTermQuery {
if (!termLongEnough) { // can only match if it's exact
return new SingleTermsEnum(terms.iterator(null), term.bytes());
}
return new FuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength);
// TODO: should we expose the transpositions option to this query?
// maybe move the old/slowish stuff (lev without transpositions, n > 2, etc) all to contrib,
// deprecate it, and just have a faster/simpler/better one in core?
return new FuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength, false);
}
/**

View File

@ -80,6 +80,8 @@ public final class FuzzyTermsEnum extends TermsEnum {
private final int termText[];
private final int realPrefixLength;
private final boolean transpositions;
/**
* Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
* length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity &gt;
@ -98,7 +100,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
* @throws IOException
*/
public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term,
final float minSimilarity, final int prefixLength) throws IOException {
final float minSimilarity, final int prefixLength, boolean transpositions) throws IOException {
if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity)
throw new IllegalArgumentException("fractional edit distances are not allowed");
if (minSimilarity < 0.0f)
@ -130,6 +132,11 @@ public final class FuzzyTermsEnum extends TermsEnum {
maxEdits = initialMaxDistance(this.minSimilarity, termLength);
raw = false;
}
if (transpositions && maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new UnsupportedOperationException("with transpositions enabled, distances > "
+ LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + " are not supported ");
}
this.transpositions = transpositions;
this.scale_factor = 1.0f / (1.0f - this.minSimilarity);
this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
@ -162,7 +169,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
if (runAutomata.size() <= maxDistance &&
maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
LevenshteinAutomata builder =
new LevenshteinAutomata(UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength));
new LevenshteinAutomata(UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength), transpositions);
for (int i = runAutomata.size(); i <= maxDistance; i++) {
Automaton a = builder.toAutomaton(i);
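The enum constructor now takes the transpositions flag explicitly, and an integral minSimilarity of 1.0 or more is treated as a raw edit distance (transpositions are only supported up to LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE). A hedged sketch of enum-level usage; the field name, query term, and helper class are hypothetical and not part of this commit:

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;

class FuzzyTermsEnumSketch {
  // Enumerate indexed terms within one Damerau-Levenshtein edit of "reciept".
  // Assumes the "body" field exists in the index.
  static void dumpMatches(IndexReader reader) throws IOException {
    AttributeSource atts = new AttributeSource();
    Term term = new Term("body", "reciept");
    // minSimilarity 1.0f is integral, so it is taken as a raw edit distance;
    // prefixLength 0; the final 'true' enables transpositions (<= 2 edits only)
    FuzzyTermsEnum e = new FuzzyTermsEnum(MultiFields.getTerms(reader, "body"),
                                          atts, term, 1.0f, 0, true);
    BytesRef t;
    while ((t = e.next()) != null) {
      System.out.println(t.utf8ToString());   // e.g. "receipt", one transposition away
    }
  }
}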

View File

@ -89,26 +89,26 @@ class Lev1ParametricDescription extends ParametricDescription {
// 4 vectors; 5 states per vector; array length = 20
private final static long[] toStates2 = new long[] /*3 bits per value */ {
0x4da292442420003L
0x69a292450428003L
};
private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ {
0x5555528000L
0x5555588000L
};
// 8 vectors; 5 states per vector; array length = 40
private final static long[] toStates3 = new long[] /*3 bits per value */ {
0x14d0812112018003L,0xb1a29b46d48a49L
0x1690a82152018003L,0xb1a2d346448a49L
};
private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ {
0x555555e80a0f0000L,0x5555L
0x555555b8220f0000L,0x5555L
};
// state map
// 0 -> [(0, 0)]
// 1 -> [(0, 1)]
// 2 -> [(0, 1), (1, 1)]
// 3 -> [(0, 1), (1, 1), (2, 1)]
// 4 -> [(0, 1), (2, 1)]
// 3 -> [(0, 1), (2, 1)]
// 4 -> [(0, 1), (1, 1), (2, 1)]
public Lev1ParametricDescription(int w) {

View File

@ -0,0 +1,119 @@
package org.apache.lucene.util.automaton;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// The following code was generated with the moman/finenight pkg
// This package is available under the MIT License, see NOTICE.txt
// for more details.
import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
/** Parametric description for generating a Levenshtein automaton of degree 1,
with transpositions as primitive edits */
class Lev1TParametricDescription extends ParametricDescription {
@Override
int transition(int absState, int position, int vector) {
// null absState should never be passed in
assert absState != -1;
// decode absState -> state, offset
int state = absState/(w+1);
int offset = absState%(w+1);
assert offset >= 0;
if (position == w) {
if (state < 2) {
final int loc = vector * 2 + state;
offset += unpack(offsetIncrs0, loc, 1);
state = unpack(toStates0, loc, 2)-1;
}
} else if (position == w-1) {
if (state < 3) {
final int loc = vector * 3 + state;
offset += unpack(offsetIncrs1, loc, 1);
state = unpack(toStates1, loc, 2)-1;
}
} else if (position == w-2) {
if (state < 6) {
final int loc = vector * 6 + state;
offset += unpack(offsetIncrs2, loc, 2);
state = unpack(toStates2, loc, 3)-1;
}
} else {
if (state < 6) {
final int loc = vector * 6 + state;
offset += unpack(offsetIncrs3, loc, 2);
state = unpack(toStates3, loc, 3)-1;
}
}
if (state == -1) {
// null state
return -1;
} else {
// translate back to abs
return state*(w+1)+offset;
}
}
// 1 vectors; 2 states per vector; array length = 2
private final static long[] toStates0 = new long[] /*2 bits per value */ {
0x2L
};
private final static long[] offsetIncrs0 = new long[] /*1 bits per value */ {
0x0L
};
// 2 vectors; 3 states per vector; array length = 6
private final static long[] toStates1 = new long[] /*2 bits per value */ {
0xa43L
};
private final static long[] offsetIncrs1 = new long[] /*1 bits per value */ {
0x38L
};
// 4 vectors; 6 states per vector; array length = 24
private final static long[] toStates2 = new long[] /*3 bits per value */ {
0x3453491482140003L,0x6dL
};
private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ {
0x555555a20000L
};
// 8 vectors; 6 states per vector; array length = 48
private final static long[] toStates3 = new long[] /*3 bits per value */ {
0x21520854900c0003L,0x5b4d19a24534916dL,0xda34L
};
private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ {
0x5555ae0a20fc0000L,0x55555555L
};
// state map
// 0 -> [(0, 0)]
// 1 -> [(0, 1)]
// 2 -> [(0, 1), (1, 1)]
// 3 -> [(0, 1), (2, 1)]
// 4 -> [t(0, 1), (0, 1), (1, 1), (2, 1)]
// 5 -> [(0, 1), (1, 1), (2, 1)]
public Lev1TParametricDescription(int w) {
super(w, 1, new int[] {0,1,0,-1,-1,-1});
}
}
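The generated classes all follow the same scheme: an absolute state packs a parametric state together with an offset into the word, and the int[] handed to super() holds each state's minimal error, i.e. the minimum over its positions (i, e) of e - i, computed from the state map above (with t-notation ignored). A small illustrative snippet; the numbers come from the Lev1T state map and constructor, but the demo class itself is not part of the commit:

class ParametricEncodingSketch {
  public static void main(String[] args) {
    int w = 4;                                // query word length, e.g. "five"
    int state = 3, offset = 2;                // a parametric state plus its word offset
    int absState = state * (w + 1) + offset;  // encoding used by transition()
    System.out.println(absState / (w + 1) == state);   // true: decode state
    System.out.println(absState % (w + 1) == offset);  // true: decode offset

    // minErrors entry for state 3 = [(0, 1), (2, 1)]: min(1 - 0, 1 - 2) = -1,
    // i.e. the fourth value in {0, 1, 0, -1, -1, -1} passed to super() above.
    System.out.println(Math.min(1 - 0, 1 - 2));        // -1
  }
}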

View File

@ -93,7 +93,7 @@ class Lev2ParametricDescription extends ParametricDescription {
// 2 vectors; 5 states per vector; array length = 10
private final static long[] toStates1 = new long[] /*3 bits per value */ {
0x1a68c105L
0x13688b44L
};
private final static long[] offsetIncrs1 = new long[] /*1 bits per value */ {
0x3e0L
@ -101,41 +101,41 @@ class Lev2ParametricDescription extends ParametricDescription {
// 4 vectors; 11 states per vector; array length = 44
private final static long[] toStates2 = new long[] /*4 bits per value */ {
0x6280b80804280405L,0x2323432321608282L,0x523434543213L
0x26a09a0a0520a504L,0x2323523321a260a2L,0x354235543213L
};
private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ {
0x5555502220000800L,0x555555L
0x5555520280000800L,0x555555L
};
// 8 vectors; 21 states per vector; array length = 168
private final static long[] toStates3 = new long[] /*5 bits per value */ {
0x40300c0108801005L,0x80202a8208801000L,0x4021006280a0288dL,0x30482184802d8414L,
0x5990240880010460L,0x191a28118330900L,0x310c413204c1104L,0x8625084811c4710dL,
0xa92a398e2188231aL,0x104e351c4a508ca4L,0x21208511c8341483L,0xe6290620946a1910L,
0xd47221423216a4a0L,0x28L
0x380e014a051404L,0xe28245009451140L,0x8a26880098a6268cL,0x180a288ca0246213L,
0x494053284a1080e1L,0x510265a89c311940L,0x4218c41188a6509cL,0x6340c4211c4710dL,
0xa168398471882a12L,0x104c841c683a0425L,0x3294472904351483L,0xe6290620a84a20d0L,
0x1441a0ea2896a4a0L,0x32L
};
private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ {
0x33300030c2000800L,0x32828088800c3cfL,0x5555550cace32320L,0x5555555555555555L,
0x33300230c0000800L,0x220ca080a00fc330L,0x555555f832823380L,0x5555555555555555L,
0x5555555555555555L,0x5555L
};
// 16 vectors; 30 states per vector; array length = 480
private final static long[] toStates4 = new long[] /*5 bits per value */ {
0x80300c0108801005L,0x88210802000L,0x44200401400000L,0x7ae3b88621185c07L,
0x101500042100404L,0x20803140501446cL,0x40100420006c2122L,0x490140511b004054L,
0x8401f2e3c086411L,0x120861200b100822L,0x641102400081180cL,0x4802c40100001088L,
0x8c21195607048418L,0x1421014245bc3f2L,0x23450230661200b1L,0x2108664118240803L,
0x8c1984802c802004L,0xbc3e28c41150d140L,0xc4120102209421dL,0x7884c11c4710d031L,
0x210842109031bc62L,0xd21484360c431044L,0x9c265293a3a6e741L,0x1cc710c41109ce70L,
0x1bce27a846525495L,0x3105425094a108c7L,0x6f735e95254731c4L,0x9ee7a9c234a9393aL,
0x144720d0520c4150L,0x211051bc646084c2L,0x3614831048220842L,0x93a460e742351488L,
0xc4120a2e70a24656L,0x284642d4941cc520L,0x4094a210c51bce46L,0xb525073148310502L,
0x24356939460f7358L,0x4098e7aaL
0x380e014a051404L,0xaa015452940L,0x55014501000000L,0x1843ddc771085c07L,
0x7141200040108405L,0x52b44004c5313460L,0x401080200063115cL,0x85314c4d181c5048L,
0x1440190a3e5c7828L,0x28a232809100a21L,0xa028ca2a84203846L,0xca0240010800108aL,
0xc7b4205c1580a508L,0x1021090251846b6L,0x4cb513862328090L,0x210863128ca2b8a2L,
0x4e188ca024402940L,0xa6b6c7c520532d4L,0x8c41101451150219L,0xa0c4211c4710d421L,
0x2108421094e15063L,0x8f13c43708631044L,0x18274d908c611631L,0x1cc238c411098263L,
0x450e3a1d0212d0b4L,0x31050242048108c6L,0xfa318b42d07308eL,0xa8865182356907c6L,
0x1ca410d4520c4140L,0x2954e13883a0ca51L,0x3714831044229442L,0x93946116b58f2c84L,
0xc41109a5631a574dL,0x1d4512d4941cc520L,0x52848294c643883aL,0xb525073148310502L,
0xa5356939460f7358L,0x409ca651L
};
private final static long[] offsetIncrs4 = new long[] /*3 bits per value */ {
0xc0602000010000L,0xa000040000000001L,0x248204041248L,0xb0180c06c3618618L,
0x238d861860001861L,0x41040061c6e06041L,0x4004900c2402400L,0x409489001041001L,
0x4184184004148124L,0x1041b4980c24c3L,0xd26040938d061061L,0x2492492492494146L,
0x20c0600000010000L,0x2000040000000001L,0x209204a40209L,0x301b6c0618018618L,
0x207206186000186cL,0x1200061b8e06dc0L,0x480492080612010L,0xa20204a040048000L,
0x1061a0000129124L,0x1848349b680612L,0xd26da0204a041868L,0x2492492492496128L,
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
0x2492492492492492L,0x9249249249249249L,0x24924924L
@ -143,33 +143,33 @@ class Lev2ParametricDescription extends ParametricDescription {
// 32 vectors; 30 states per vector; array length = 960
private final static long[] toStates5 = new long[] /*5 bits per value */ {
0x80300c0108801005L,0x88210802000L,0x42200401400000L,0xa088201000300c03L,
0x100510842108428L,0x2188461701c01108L,0x108401011eb8eeL,0x85c0700442004014L,
0x88267ae3b886211L,0x1446c01015108842L,0xc212202080314050L,0x405440100420006L,
0x10201c50140511b0L,0x942528423b08888L,0x240501446c010155L,0x21007cb8f0219045L,
0x511b004054402088L,0x2e3c086411490140L,0x200b50904428823fL,0x400081180c120861L,
0x100001088641102L,0x46030482184802c4L,0x9ce8990840980030L,0x21061200b709c210L,
0xf0fca308465581c1L,0x802c405084050916L,0xc211956070484184L,0x9e4209ee65bc3f28L,
0x3450230661200b70L,0x1086641182408032L,0xc1984802c8020042L,0x86098201c8d1408L,
0xb88a22529ce399L,0x1045434502306612L,0x4088250876f0f8a3L,0xd1408c1984802c80L,
0xee3dbc3e28c41150L,0xd0310c4188984429L,0xbc627884c11c4710L,0x1044210842109031L,
0x21704711c4340c43L,0xbdef7bdf0c7a18b4L,0x85210d8310c41ef7L,0x994a4e8e9b9d074L,
0x60c4310442739c27L,0x3a3a6e741d214843L,0x41ef77bdf77de529L,0x8465254951cc710cL,
0x94a108c71bce27aL,0x5254731c43105425L,0xdb1c7a38b4a15949L,0xc710c41cf73dce7bL,
0xe4e9bdcd7a54951cL,0x5427b9ea708d2a4L,0x735e95254731c431L,0xbd677db4a9393a6fL,
0x4720d0520c41cf75L,0x1051bc646084c214L,0x1483104822084221L,0x193821708511c834L,
0x1bf6fdef6f7f147aL,0xd08d45220d8520c4L,0x9c289195a4e91839L,0x488361483104828bL,
0xe5693a460e742351L,0x520c41bf71bdf717L,0xe46284642d4941ccL,0x5024094a210c51bcL,
0x590b525073148310L,0xce6f7b147a3938a1L,0x941cc520c41f77ddL,0xd5a4e5183dcd62d4L,
0x48310502639ea890L,0x460f7358b5250731L,0xf779bd6717b56939L
0x380e014a051404L,0xaa015452940L,0x8052814501000000L,0xb80a515450000e03L,
0x5140410842108426L,0x71dc421701c01540L,0x100421014610f7L,0x85c0700550145010L,
0x94a271843ddc7710L,0x1346071412108a22L,0x3115c52b44004c53L,0xc504840108020006L,
0x54d1001314c4d181L,0x9081204239c4a71L,0x14c5313460714124L,0x51006428f971e0a2L,
0x4d181c5048402884L,0xa3e5c782885314cL,0x2809409482a8a239L,0x2a84203846028a23L,
0x10800108aa028caL,0xe1180a288ca0240L,0x98c6b80e3294a108L,0x2942328091098c10L,
0x11adb1ed08170560L,0xa024004084240946L,0x7b4205c1580a508cL,0xa8c2968c71846b6cL,
0x4cb5138623280910L,0x10863128ca2b8a20L,0xe188ca0244029402L,0x4e3294e288132d44L,
0x809409ad1218c39cL,0xf14814cb51386232L,0x514454086429adb1L,0x32d44e188ca02440L,
0x8c390a6b6c7c5205L,0xd4218c41409cd2aaL,0x5063a0c4211c4710L,0x10442108421094e1L,
0x31084711c4350863L,0xbdef7bddf05918f2L,0xc4f10dc218c41ef7L,0x9d3642318458c63L,
0x70863104426098c6L,0x8c6116318f13c43L,0x41ef75dd6b5de4d9L,0xd0212d0b41cc238cL,
0x2048108c6450e3a1L,0x42d07308e3105024L,0xdb591938f274084bL,0xc238c41f77deefbbL,
0x1f183e8c62d0b41cL,0x502a2194608d5a4L,0xa318b42d07308e31L,0xed675db56907c60fL,
0xa410d4520c41f773L,0x54e13883a0ca511cL,0x1483104422944229L,0x20f2329447290435L,
0x1ef6f7ef6f7df05cL,0xad63cb210dc520c4L,0x58c695d364e51845L,0xc843714831044269L,
0xe4d93946116b58f2L,0x520c41ef717d6b17L,0x83a1d4512d4941ccL,0x50252848294c6438L,
0x144b525073148310L,0xefaf7b591c20f275L,0x941cc520c41f777bL,0xd5a4e5183dcd62d4L,
0x4831050272994694L,0x460f7358b5250731L,0xf779bd6717b56939L
};
private final static long[] offsetIncrs5 = new long[] /*3 bits per value */ {
0xc0602000010000L,0x8000040000000001L,0xb6db6d4030180L,0x810104922800010L,
0x248a000040000092L,0x618000b649654041L,0x861b0180c06c3618L,0x301b0d861860001L,
0x61861800075d6ed6L,0x1871b8181048e3L,0xe56041238d861860L,0x40240041040075c6L,
0x4100104004900c2L,0x55b5240309009001L,0x1025224004104005L,0x10410010520490L,
0x55495240409489L,0x4980c24c34184184L,0x30d061061001041bL,0x184005556d260309L,
0x51b4981024e34184L,0x40938d0610610010L,0x492492495546d260L,0x2492492492492492L,
0x20c0600000010000L,0x40000000001L,0xb6db6d4830180L,0x4812900824800010L,
0x2092000040000082L,0x618000b659254a40L,0x86c301b6c0618018L,0xdb01860061860001L,
0x81861800075baed6L,0x186e381b70081cL,0xe56dc02072061860L,0x61201001200075b8L,
0x480000480492080L,0x52b5248201848040L,0x880812810012000bL,0x4004800004a4492L,
0xb529124a20204aL,0x49b68061201061a0L,0x8480418680018483L,0x1a000752ad26da01L,
0x4a349b6808128106L,0xa0204a0418680018L,0x492492497528d26dL,0x2492492492492492L,
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,
@ -182,36 +182,36 @@ class Lev2ParametricDescription extends ParametricDescription {
// 0 -> [(0, 0)]
// 1 -> [(0, 2)]
// 2 -> [(0, 1)]
// 3 -> [(0, 2), (1, 2)]
// 4 -> [(0, 1), (1, 1)]
// 3 -> [(0, 1), (1, 1)]
// 4 -> [(0, 2), (1, 2)]
// 5 -> [(0, 2), (2, 1)]
// 6 -> [(0, 1), (2, 2)]
// 7 -> [(0, 2), (1, 2), (2, 2)]
// 8 -> [(0, 1), (2, 1)]
// 9 -> [(0, 2), (2, 2)]
// 10 -> [(0, 1), (1, 1), (2, 1)]
// 11 -> [(0, 2), (1, 2), (2, 2), (3, 2)]
// 12 -> [(0, 2), (2, 1), (3, 1)]
// 13 -> [(0, 2), (3, 2)]
// 14 -> [(0, 2), (2, 2), (3, 2)]
// 15 -> [(0, 2), (1, 2), (3, 1)]
// 16 -> [(0, 2), (1, 2), (3, 2)]
// 17 -> [(0, 1), (2, 2), (3, 2)]
// 18 -> [(0, 2), (3, 1)]
// 19 -> [(0, 1), (3, 2)]
// 20 -> [(0, 1), (1, 1), (3, 2)]
// 7 -> [(0, 2), (2, 2)]
// 8 -> [(0, 1), (1, 1), (2, 1)]
// 9 -> [(0, 2), (1, 2), (2, 2)]
// 10 -> [(0, 1), (2, 1)]
// 11 -> [(0, 2), (3, 2)]
// 12 -> [(0, 2), (1, 2), (3, 2)]
// 13 -> [(0, 2), (1, 2), (2, 2), (3, 2)]
// 14 -> [(0, 1), (2, 2), (3, 2)]
// 15 -> [(0, 2), (3, 1)]
// 16 -> [(0, 1), (3, 2)]
// 17 -> [(0, 1), (1, 1), (3, 2)]
// 18 -> [(0, 2), (1, 2), (3, 1)]
// 19 -> [(0, 2), (2, 2), (3, 2)]
// 20 -> [(0, 2), (2, 1), (3, 1)]
// 21 -> [(0, 2), (2, 1), (4, 2)]
// 22 -> [(0, 2), (1, 2), (4, 2)]
// 23 -> [(0, 2), (1, 2), (3, 2), (4, 2)]
// 24 -> [(0, 2), (2, 2), (4, 2)]
// 25 -> [(0, 2), (2, 2), (3, 2), (4, 2)]
// 26 -> [(0, 2), (3, 2), (4, 2)]
// 24 -> [(0, 2), (2, 2), (3, 2), (4, 2)]
// 25 -> [(0, 2), (3, 2), (4, 2)]
// 26 -> [(0, 2), (1, 2), (2, 2), (4, 2)]
// 27 -> [(0, 2), (1, 2), (2, 2), (3, 2), (4, 2)]
// 28 -> [(0, 2), (4, 2)]
// 29 -> [(0, 2), (1, 2), (2, 2), (4, 2)]
// 29 -> [(0, 2), (2, 2), (4, 2)]
public Lev2ParametricDescription(int w) {
super(w, 2, new int[] {0,2,1,1,0,-1,0,0,-1,0,-1,-1,-2,-1,-1,-2,-1,-1,-2,-1,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2});
super(w, 2, new int[] {0,2,1,0,1,-1,0,0,-1,0,-1,-1,-1,-1,-1,-2,-1,-1,-2,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2});
}
}

View File

@ -0,0 +1,264 @@
package org.apache.lucene.util.automaton;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// The following code was generated with the moman/finenight pkg
// This package is available under the MIT License, see NOTICE.txt
// for more details.
import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
/** Parametric description for generating a Levenshtein automaton of degree 2,
with transpositions as primitive edits */
class Lev2TParametricDescription extends ParametricDescription {
@Override
int transition(int absState, int position, int vector) {
// null absState should never be passed in
assert absState != -1;
// decode absState -> state, offset
int state = absState/(w+1);
int offset = absState%(w+1);
assert offset >= 0;
if (position == w) {
if (state < 3) {
final int loc = vector * 3 + state;
offset += unpack(offsetIncrs0, loc, 1);
state = unpack(toStates0, loc, 2)-1;
}
} else if (position == w-1) {
if (state < 5) {
final int loc = vector * 5 + state;
offset += unpack(offsetIncrs1, loc, 1);
state = unpack(toStates1, loc, 3)-1;
}
} else if (position == w-2) {
if (state < 13) {
final int loc = vector * 13 + state;
offset += unpack(offsetIncrs2, loc, 2);
state = unpack(toStates2, loc, 4)-1;
}
} else if (position == w-3) {
if (state < 28) {
final int loc = vector * 28 + state;
offset += unpack(offsetIncrs3, loc, 2);
state = unpack(toStates3, loc, 5)-1;
}
} else if (position == w-4) {
if (state < 45) {
final int loc = vector * 45 + state;
offset += unpack(offsetIncrs4, loc, 3);
state = unpack(toStates4, loc, 6)-1;
}
} else {
if (state < 45) {
final int loc = vector * 45 + state;
offset += unpack(offsetIncrs5, loc, 3);
state = unpack(toStates5, loc, 6)-1;
}
}
if (state == -1) {
// null state
return -1;
} else {
// translate back to abs
return state*(w+1)+offset;
}
}
// 1 vectors; 3 states per vector; array length = 3
private final static long[] toStates0 = new long[] /*2 bits per value */ {
0x23L
};
private final static long[] offsetIncrs0 = new long[] /*1 bits per value */ {
0x0L
};
// 2 vectors; 5 states per vector; array length = 10
private final static long[] toStates1 = new long[] /*3 bits per value */ {
0x13688b44L
};
private final static long[] offsetIncrs1 = new long[] /*1 bits per value */ {
0x3e0L
};
// 4 vectors; 13 states per vector; array length = 52
private final static long[] toStates2 = new long[] /*4 bits per value */ {
0x60dbb0b05200b504L,0x5233217627062227L,0x2355543214323235L,0x4354L
};
private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ {
0x555080a800002000L,0x5555555555L
};
// 8 vectors; 28 states per vector; array length = 224
private final static long[] toStates3 = new long[] /*5 bits per value */ {
0xe701c02940059404L,0xa010162000a50000L,0xb02c8c40a1416288L,0xa821032310858c0L,
0x314423980d28b201L,0x5281e528847788e0L,0xa23980d308c2280eL,0x1e3294b1a962278cL,
0x8c41309e2288e528L,0x11444409021aca21L,0x11a4624886b1086bL,0x2a6258941d6240c4L,
0x5024a50b489074adL,0x14821aca520c411aL,0x5888b5890b594a44L,0x941d6520c411a465L,
0x8b589075ad6a62d4L,0x1a5055a4L
};
private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ {
0x30c30200002000L,0x2a0030f3c3fc333cL,0x233a00328282a820L,0x5555555532b283a8L,
0x5555555555555555L,0x5555555555555555L,0x5555555555555555L
};
// 16 vectors; 45 states per vector; array length = 720
private final static long[] toStates4 = new long[] /*6 bits per value */ {
0x3801450002c5004L,0xc500014b00000e38L,0x51451401402L,0x0L,
0x518000b14010000L,0x9f1c20828e20230L,0x219f0df0830a70c2L,0x8200008208208200L,
0x805050160800800L,0x3082098602602643L,0x4564014250508064L,0x850051420000831L,
0x4140582085002082L,0x456180980990c201L,0x8316d0c50a01051L,0x21451420050df0e0L,
0xd14214014508214L,0x3c21c01850821c60L,0x1cb1403cb142087L,0x800821451851822cL,
0x20020820800020L,0xd006182087180345L,0xcb0a81cb24976b09L,0x8b1a60e624709d1L,
0x249082082249089L,0xc31421c600d2c024L,0x3c31451515454423L,0x31853c22c21cb140L,
0x4514500b2c208214L,0x8718034508b0051L,0xb2cb45515108f0c5L,0xe824715d1cb0a810L,
0x1422cb14908b0e60L,0x30812c22c02cb145L,0x842022020cb1420cL,0x5c20ce0820ce0850L,
0x208208208b0d70c2L,0x4208508214214208L,0x920834050830c20L,0xc6134dc613653592L,
0xd309341c6dc4db4dL,0x6424d90854d34d34L,0x92072c22030814c2L,0x4220724b24a30930L,
0x2470d72025c920e2L,0x92c92d70975c9082L,0xcb0880c204924e08L,0x45739728c24c2481L,
0xc6da4db5da6174daL,0x4b5d35d75d30971dL,0x1030815c93825ce2L,0x51442051020cb145L,
0xc538210e2c220e2cL,0x851421452cb0d70L,0x204b085085145142L,0x921560834051440cL,
0x4d660e4da60e6595L,0x94d914e41c6dc658L,0x826426591454d365L,0x2892072c51030813L,
0xe2c22072cb2ca30bL,0x452c70d720538910L,0x8b2cb2d708e3891L,0x81cb1440c204b24eL,
0xda44e38e28c2ca24L,0x1dc6da6585d660e4L,0xe2cb5d338e5d914eL,0x38938238L
};
private final static long[] offsetIncrs4 = new long[] /*3 bits per value */ {
0x3002000000080000L,0x20c060L,0x8149000004000000L,0x4024924110824824L,
0xdb6030c360002082L,0x6c36c06c301b0d80L,0xb01861b0000db0dbL,0x1b7036209188e06dL,
0x800920006d86db7L,0x4920c2402402490L,0x49000208249009L,0x4908128128124804L,
0x34800104124a44a2L,0xc30930900d24020cL,0x40009a0924c24d24L,0x4984a069201061aL,
0x494d049271269262L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
0x2492492492492492L,0x249249249249L
};
// 32 vectors; 45 states per vector; array length = 1440
private final static long[] toStates5 = new long[] /*6 bits per value */ {
0x3801450002c5004L,0xc500014b00000e38L,0x51451401402L,0x0L,
0x514000b14010000L,0x550000038e00e0L,0x264518500600b180L,0x8208208208208208L,
0x2c50040820820L,0x70820a38808c0146L,0xc37c20c29c30827cL,0x20820820800867L,
0xb140102002002080L,0x828e202300518000L,0x830a70c209f1c20L,0x51451450853df0dfL,
0x1614214214508214L,0x6026026430805050L,0x2505080643082098L,0x4200008314564014L,
0x850020820850051L,0x80990c2014140582L,0x8201920208261809L,0x892051990060941L,
0x22492492c22cb242L,0x430805050162492cL,0x8041451586026026L,0x37c38020c5b43142L,
0x4208508514508014L,0x141405850850051L,0x51456180980990c2L,0xe008316d0c50a010L,
0x2c52cb2c508b21f0L,0x600d2c92c22cb249L,0x873c21c01850821cL,0x2c01cb1403cb1420L,
0x2080082145185182L,0x4500200208208000L,0x870061420871803L,0x740500f5050821cfL,
0x934d964618609000L,0x4c24d34d30824d30L,0x1860821c600d642L,0xc2a072c925dac274L,
0x2c69839891c27472L,0x9242082089242242L,0x8208718034b00900L,0x1cb24976b09d0061L,
0x60e624709d1cb0a8L,0xd31455d71574ce3eL,0x1c600d3825c25d74L,0x51515454423c3142L,
0xc22c21cb1403c314L,0xb2c20821431853L,0x34508b005145145L,0x5515108f0c508718L,
0x8740500f2051454L,0xe2534d920618f090L,0x493826596592c238L,0x4423c31421c600d6L,
0x72c2a042cb2d1545L,0x422c3983a091c574L,0xb2c514508b2c52L,0xf0c508718034b08bL,
0xa810b2cb45515108L,0x2260e824715d1cb0L,0xe6592c538e2d74ceL,0x420c308138938238L,
0x850842022020cb1L,0x70c25c20ce0820ceL,0x4208208208208b0dL,0xc20420850821421L,
0x21080880832c5083L,0xa50838820838c214L,0xaaaaaaaaa9c39430L,0x1aaa7eaa9fa9faaaL,
0x824820d01420c308L,0x7184d37184d94d64L,0x34c24d071b7136d3L,0x990936421534d34dL,
0x834050830c20530L,0x34dc613653592092L,0xa479c6dc4db4dc61L,0x920a9f924924924aL,
0x72c220308192a82aL,0x724b24a30930920L,0xd72025c920e2422L,0x92d70975c9082247L,
0x880c204924e0892cL,0x2c928c24c2481cb0L,0x80a5248889088749L,0x6a861b2aaac74394L,
0x81b2ca6ab27b278L,0xa3093092072c2203L,0xd76985d36915ce5cL,0x5d74c25c771b6936L,
0x724e0973892d74d7L,0x4c2481cb0880c205L,0x6174da45739728c2L,0x4aa175c6da4db5daL,
0x6a869b2786486186L,0xcb14510308186caL,0x220e2c5144205102L,0xcb0d70c538210e2cL,
0x1451420851421452L,0x51440c204b085085L,0xcb1451081440832cL,0x94316208488b0888L,
0xfaaa7dfa9f7e79c3L,0x30819ea7ea7df7dL,0x6564855820d01451L,0x9613598393698399L,
0xd965364539071b71L,0x4e0990996451534L,0x21560834051440c2L,0xd660e4da60e65959L,
0x9207e979c6dc6584L,0xa82a8207df924820L,0x892072c5103081a6L,0x2c22072cb2ca30b2L,
0x52c70d720538910eL,0x8b2cb2d708e38914L,0x1cb1440c204b24e0L,0x874b2cb28c2ca248L,
0x4394816224488b08L,0x9e786aa69b1f7e77L,0x51030819eca6a9e7L,0x8e38a30b2892072cL,
0x6996175983936913L,0x74ce39764538771bL,0xc204e24e08e38b2dL,0x28c2ca2481cb1440L,
0x85d660e4da44e38eL,0x698607e975c6da65L,0xa6ca6aa699e7864aL
};
private final static long[] offsetIncrs5 = new long[] /*3 bits per value */ {
0x3002000000080000L,0x20c060L,0x100000004000000L,0xdb6db6db50603018L,
0xa480000200002db6L,0x1249208841241240L,0x4000010000104120L,0x2492c42092092052L,
0xc30d800096592d9L,0xb01b0c06c36036d8L,0x186c00036c36db0dL,0xad860361b01b6c06L,
0x360001b75b6dd6ddL,0xc412311c0db6030cL,0xdb0db6e36e06L,0x9188e06db01861bL,
0x6dd6db71b72b62L,0x4024024900800920L,0x20824900904920c2L,0x1201248040049000L,
0x5524ad4aa4906120L,0x4092402002480015L,0x9252251248409409L,0x4920100124000820L,
0x29128924204a04a0L,0x900830d200055549L,0x934930c24c24034L,0x418690002682493L,
0x9a49861261201a48L,0xc348001355249d4L,0x24c40930940d2402L,0x1a40009a0924e24dL,
0x6204984a06920106L,0x92494d5492712692L,0x4924924924924924L,0x2492492492492492L,
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x24924924L
};
// state map
// 0 -> [(0, 0)]
// 1 -> [(0, 2)]
// 2 -> [(0, 1)]
// 3 -> [(0, 1), (1, 1)]
// 4 -> [(0, 2), (1, 2)]
// 5 -> [t(0, 2), (0, 2), (1, 2), (2, 2)]
// 6 -> [(0, 2), (2, 1)]
// 7 -> [(0, 1), (2, 2)]
// 8 -> [(0, 2), (2, 2)]
// 9 -> [(0, 1), (1, 1), (2, 1)]
// 10 -> [(0, 2), (1, 2), (2, 2)]
// 11 -> [(0, 1), (2, 1)]
// 12 -> [t(0, 1), (0, 1), (1, 1), (2, 1)]
// 13 -> [(0, 2), (1, 2), (2, 2), (3, 2)]
// 14 -> [t(0, 2), (0, 2), (1, 2), (2, 2), (3, 2)]
// 15 -> [(0, 2), t(1, 2), (1, 2), (2, 2), (3, 2)]
// 16 -> [(0, 2), (2, 1), (3, 1)]
// 17 -> [(0, 1), t(1, 2), (2, 2), (3, 2)]
// 18 -> [(0, 2), (3, 2)]
// 19 -> [(0, 2), (1, 2), t(1, 2), (2, 2), (3, 2)]
// 20 -> [t(0, 2), (0, 2), (1, 2), (3, 1)]
// 21 -> [(0, 1), (1, 1), (3, 2)]
// 22 -> [(0, 2), (2, 2), (3, 2)]
// 23 -> [(0, 2), (1, 2), (3, 1)]
// 24 -> [(0, 2), (1, 2), (3, 2)]
// 25 -> [(0, 1), (2, 2), (3, 2)]
// 26 -> [(0, 2), (3, 1)]
// 27 -> [(0, 1), (3, 2)]
// 28 -> [(0, 2), (2, 1), (4, 2)]
// 29 -> [(0, 2), t(1, 2), (1, 2), (2, 2), (3, 2), (4, 2)]
// 30 -> [(0, 2), (1, 2), (4, 2)]
// 31 -> [(0, 2), (1, 2), (3, 2), (4, 2)]
// 32 -> [(0, 2), (2, 2), (3, 2), (4, 2)]
// 33 -> [(0, 2), (1, 2), t(2, 2), (2, 2), (3, 2), (4, 2)]
// 34 -> [(0, 2), (1, 2), (2, 2), t(2, 2), (3, 2), (4, 2)]
// 35 -> [(0, 2), (3, 2), (4, 2)]
// 36 -> [(0, 2), t(2, 2), (2, 2), (3, 2), (4, 2)]
// 37 -> [t(0, 2), (0, 2), (1, 2), (2, 2), (4, 2)]
// 38 -> [(0, 2), (1, 2), (2, 2), (4, 2)]
// 39 -> [t(0, 2), (0, 2), (1, 2), (2, 2), (3, 2), (4, 2)]
// 40 -> [(0, 2), (1, 2), (2, 2), (3, 2), (4, 2)]
// 41 -> [(0, 2), (4, 2)]
// 42 -> [t(0, 2), (0, 2), (1, 2), (2, 2), t(2, 2), (3, 2), (4, 2)]
// 43 -> [(0, 2), (2, 2), (4, 2)]
// 44 -> [(0, 2), (1, 2), t(1, 2), (2, 2), (3, 2), (4, 2)]
public Lev2TParametricDescription(int w) {
super(w, 2, new int[] {0,2,1,0,1,0,-1,0,0,-1,0,-1,-1,-1,-1,-1,-2,-1,-1,-1,-2,-1,-1,-2,-1,-1,-2,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2});
}
}

View File

@ -47,8 +47,9 @@ public class LevenshteinAutomata {
/**
* Create a new LevenshteinAutomata for some input String.
* Optionally count transpositions as a primitive edit.
*/
public LevenshteinAutomata(String input) {
public LevenshteinAutomata(String input, boolean withTranspositions) {
this.input = input;
int length = Character.codePointCount(input, 0, input.length());
word = new int[length];
@ -88,8 +89,8 @@ public class LevenshteinAutomata {
descriptions = new ParametricDescription[] {
null, /* for n=0, we do not need to go through the trouble */
new Lev1ParametricDescription(word.length),
new Lev2ParametricDescription(word.length),
withTranspositions ? new Lev1TParametricDescription(word.length) : new Lev1ParametricDescription(word.length),
withTranspositions ? new Lev2TParametricDescription(word.length) : new Lev2ParametricDescription(word.length),
};
}

View File

@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Note, this file is known to work with rev 115 of the moman
# Note, this file is known to work with rev 120 of the moman
# repository (http://bitbucket.org/jpbarrette/moman/overview)
#
# See also: http://sites.google.com/site/rrettesite/moman
@ -95,9 +95,9 @@ def charVarNumber(charVar):
def main():
if len(sys.argv) != 2:
if len(sys.argv) != 3:
print
print 'Usage: python -u %s N' % sys.argv[0]
print 'Usage: python -u %s N <True/False>' % sys.argv[0]
print
print 'NOTE: the resulting .java file is created in the current working dir!'
print
@ -105,7 +105,9 @@ def main():
n = int(sys.argv[1])
tables = genTransitions(n)
transpose = (sys.argv[2] == "True")
tables = genTransitions(n, transpose)
stateMap = {}
@ -142,8 +144,13 @@ def main():
w('')
w('import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;')
w('')
w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n)
className = 'Lev%dParametricDescription' % n
if transpose:
w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n)
w(' with transpositions as primitive edits */')
className = 'Lev%dTParametricDescription' % n
else:
w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n)
className = 'Lev%dParametricDescription' % n
w('class %s extends ParametricDescription {' % className)
@ -201,9 +208,6 @@ def main():
byAction = {}
for s, (toS, offset) in l:
state = str(s)
if state == '[]':
# don't waste code on the null state
continue
toState = str(toS)
if state not in stateMap:
@ -213,7 +217,7 @@ def main():
byFromState[stateMap[state]] = (1+stateMap[toState], offset)
fromStateDesc = ', '.join([str(x) for x in eval(s)])
fromStateDesc = s[1:len(s)-1]
toStateDesc = ', '.join([str(x) for x in toS])
tup = (stateMap[toState], toStateDesc, offset)
@ -222,10 +226,10 @@ def main():
byAction[tup].append((fromStateDesc, stateMap[state]))
if numCasesPerVector is None:
numCasesPerVector = len(l)-1
numCasesPerVector = len(l)
else:
# we require this to be uniform... empirically it seems to be!
assert numCasesPerVector == len(l)-1
assert numCasesPerVector == len(l)
if MODE == 'array':
@ -320,7 +324,10 @@ def main():
minErrors = []
for i in xrange(len(stateMap2)-1):
w('// %s -> %s' % (i, stateMap2[i]))
v = eval(stateMap2[i])
# we replace t-notation as it's not relevant here
st = stateMap2[i].replace('t', '')
v = eval(st)
minError = min([-i+e for i, e in v])
c = len(v)
sum += c

View File

@ -41,7 +41,7 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
// LUCENE-3094
public void testNoWastedStates() throws Exception {
AutomatonTestUtil.assertNoDetachedStates(new LevenshteinAutomata("abc").toAutomaton(1));
AutomatonTestUtil.assertNoDetachedStates(new LevenshteinAutomata("abc", false).toAutomaton(1));
}
/**
@ -64,31 +64,46 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
* up to some maximum distance.
*/
private void assertLev(String s, int maxDistance) {
LevenshteinAutomata builder = new LevenshteinAutomata(s);
LevenshteinAutomata builder = new LevenshteinAutomata(s, false);
LevenshteinAutomata tbuilder = new LevenshteinAutomata(s, true);
Automaton automata[] = new Automaton[maxDistance + 1];
Automaton tautomata[] = new Automaton[maxDistance + 1];
for (int n = 0; n < automata.length; n++) {
automata[n] = builder.toAutomaton(n);
tautomata[n] = tbuilder.toAutomaton(n);
assertNotNull(automata[n]);
assertNotNull(tautomata[n]);
assertTrue(automata[n].isDeterministic());
assertTrue(tautomata[n].isDeterministic());
assertTrue(SpecialOperations.isFinite(automata[n]));
assertTrue(SpecialOperations.isFinite(tautomata[n]));
AutomatonTestUtil.assertNoDetachedStates(automata[n]);
AutomatonTestUtil.assertNoDetachedStates(tautomata[n]);
// check that the dfa for n-1 accepts a subset of the dfa for n
if (n > 0) {
assertTrue(automata[n-1].subsetOf(automata[n]));
assertTrue(automata[n-1].subsetOf(tautomata[n]));
assertTrue(tautomata[n-1].subsetOf(automata[n]));
assertTrue(tautomata[n-1].subsetOf(tautomata[n]));
assertNotSame(automata[n-1], automata[n]);
}
// check that Lev(N) is a subset of LevT(N)
assertTrue(automata[n].subsetOf(tautomata[n]));
// special checks for specific n
switch(n) {
case 0:
// easy, matches the string itself
assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString(s), automata[0]));
assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString(s), tautomata[0]));
break;
case 1:
// generate a lev1 naively, and check the accepted lang is the same.
assertTrue(BasicOperations.sameLanguage(naiveLev1(s), automata[1]));
assertTrue(BasicOperations.sameLanguage(naiveLev1T(s), tautomata[1]));
break;
default:
assertBruteForce(s, automata[n], n);
assertBruteForceT(s, tautomata[n], n);
break;
}
}
@ -110,6 +125,17 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
return a;
}
/**
* Return an automaton that accepts all 1-character insertions, deletions,
* substitutions, and transpositions of s.
*/
private Automaton naiveLev1T(String s) {
Automaton a = naiveLev1(s);
a = BasicOperations.union(a, transpositionsOf(s));
MinimizationOperations.minimize(a);
return a;
}
/**
* Return an automaton that accepts all 1-character insertions of s (inserting
* one character)
@ -170,6 +196,29 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
return a;
}
/**
* Return an automaton that accepts all transpositions of s
* (transposing two adjacent characters)
*/
private Automaton transpositionsOf(String s) {
if (s.length() < 2)
return BasicAutomata.makeEmpty();
List<Automaton> list = new ArrayList<Automaton>();
for (int i = 0; i < s.length()-1; i++) {
StringBuilder sb = new StringBuilder();
sb.append(s.substring(0, i));
sb.append(s.charAt(i+1));
sb.append(s.charAt(i));
sb.append(s.substring(i+2, s.length()));
String st = sb.toString();
if (!st.equals(s))
list.add(BasicAutomata.makeString(st));
}
Automaton a = BasicOperations.union(list);
MinimizationOperations.minimize(a);
return a;
}
private void assertBruteForce(String input, Automaton dfa, int distance) {
CharacterRunAutomaton ra = new CharacterRunAutomaton(dfa);
int maxLen = input.length() + distance + 1;
@ -185,6 +234,21 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
}
}
private void assertBruteForceT(String input, Automaton dfa, int distance) {
CharacterRunAutomaton ra = new CharacterRunAutomaton(dfa);
int maxLen = input.length() + distance + 1;
int maxNum = (int) Math.pow(2, maxLen);
for (int i = 0; i < maxNum; i++) {
String encoded = Integer.toString(i, 2);
boolean accepts = ra.run(encoded);
if (accepts) {
assertTrue(getTDistance(input, encoded) <= distance);
} else {
assertTrue(getTDistance(input, encoded) > distance);
}
}
}
//*****************************
// Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
//*****************************
@ -260,4 +324,58 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
// actually has the most recent cost counts
return Math.abs(p[n]);
}
private int getTDistance(String target, String other) {
char[] sa;
int n;
int d[][]; // cost array
sa = target.toCharArray();
n = sa.length;
final int m = other.length();
d = new int[n+1][m+1];
if (n == 0 || m == 0) {
if (n == m) {
return 0;
}
else {
return Math.max(n, m);
}
}
// indexes into strings s and t
int i; // iterates through s
int j; // iterates through t
char t_j; // jth character of t
int cost; // cost
for (i = 0; i<=n; i++) {
d[i][0] = i;
}
for (j = 0; j<=m; j++) {
d[0][j] = j;
}
for (j = 1; j<=m; j++) {
t_j = other.charAt(j-1);
for (i=1; i<=n; i++) {
cost = sa[i-1]==t_j ? 0 : 1;
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
// transposition
if (i > 1 && j > 1 && target.charAt(i-1) == other.charAt(j-2) && target.charAt(i-2) == other.charAt(j-1)) {
d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
}
}
}
// d[n][m] now holds the Damerau-Levenshtein (optimal string alignment) distance
return Math.abs(d[n][m]);
}
}
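The naive automata above pin down exactly what the transposition option adds: only swaps of two adjacent characters become single edits. A small hedged sketch against the public API; the class name is illustrative and this is not part of the test file:

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

class AdjacentSwapSketch {
  public static void main(String[] args) {
    CharacterRunAutomaton levT1 =
        new CharacterRunAutomaton(new LevenshteinAutomata("abc", true).toAutomaton(1));
    System.out.println(levT1.run("bac"));  // true: one adjacent transposition
    System.out.println(levT1.run("acb"));  // true: one adjacent transposition
    System.out.println(levT1.run("cba"));  // false: swapping the outer characters still costs 2 edits
  }
}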

View File

@ -56,20 +56,11 @@ import org.apache.lucene.util.automaton.LevenshteinAutomata;
* @lucene.experimental
*/
public class DirectSpellChecker {
/** The default StringDistance, Levenshtein distance implemented internally
/** The default StringDistance, Damerau-Levenshtein distance implemented internally
* via {@link LevenshteinAutomata}.
* <p>
* Note: this is the fastest distance metric, because Levenshtein is used
* Note: this is the fastest distance metric, because Damerau-Levenshtein is used
* to draw candidates from the term dictionary: this just re-uses the scoring.
* <p>
* Note also that this metric differs in subtle ways from {@link LevensteinDistance}:
* <ul>
* <li> This metric treats full unicode codepoints as characters, but
* LevenshteinDistance calculates based on UTF-16 code units.
* <li> This metric scales raw edit distances into a floating point score
* differently than LevenshteinDistance: the scaling is based upon the
* shortest of the two terms instead of the longest.
* </ul>
*/
public static final StringDistance INTERNAL_LEVENSHTEIN = new LuceneLevenshteinDistance();
@ -277,8 +268,8 @@ public class DirectSpellChecker {
* Set the string distance metric.
* The default is {@link #INTERNAL_LEVENSHTEIN}
* <p>
* Note: because this spellchecker draws its candidates from the
* term dictionary using Levenshtein, it works best with an edit-distance-like
* Note: because this spellchecker draws its candidates from the term
* dictionary using Damerau-Levenshtein, it works best with an edit-distance-like
* string metric. If you use a different metric than the default,
* you might want to consider increasing {@link #setMaxInspections(int)}
* to draw more candidates for your metric to rank.
@ -401,7 +392,7 @@ public class DirectSpellChecker {
if (terms == null) {
return Collections.emptyList();
}
FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance-1));
FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance-1), true);
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
BytesRef queryTerm = new BytesRef(term.text());
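Since DirectSpellChecker now always draws candidates with transpositions enabled, a non-default StringDistance only re-ranks those candidates; the javadoc above suggests raising maxInspections in that case. A hedged configuration sketch, with the field name and distance choice purely illustrative:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.search.spell.SuggestWord;

class SpellCheckerSketch {
  static SuggestWord[] suggest(IndexReader reader) throws Exception {
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    // the default metric is INTERNAL_LEVENSHTEIN (Damerau-Levenshtein, re-used from the automaton);
    // with a different metric, draw more candidates for it to re-rank
    spellChecker.setDistance(new JaroWinklerDistance());
    spellChecker.setMaxInspections(10);
    return spellChecker.suggestSimilar(new Term("numbers", "fvie"), 1, reader,
                                       SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
  }
}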

View File

@ -20,16 +20,22 @@ package org.apache.lucene.search.spell;
import org.apache.lucene.util.IntsRef;
/**
* Levenshtein implemented in a consistent way as Lucene's FuzzyTermsEnum.
* Damerau-Levenshtein (optimal string alignment) implemented in a consistent
* way as Lucene's FuzzyTermsEnum with the transpositions option enabled.
*
* Note also that this metric differs in subtle ways from {@link LevensteinDistance}:
* Notes:
* <ul>
* <li> This metric treats full unicode codepoints as characters, but
* LevenshteinDistance calculates based on UTF-16 code units.
* <li> This metric treats full unicode codepoints as characters
* <li> This metric scales raw edit distances into a floating point score
* differently than LevenshteinDistance: the scaling is based upon the
* shortest of the two terms instead of the longest.
* based upon the shortest of the two terms
* <li> Transpositions of two adjacent codepoints are treated as primitive
* edits.
* <li> Edits are applied in parallel: for example, "ab" and "bca" have
* distance 3.
* </ul>
*
* NOTE: this class is not particularly efficient. It is only intended
* for merging results from multiple DirectSpellCheckers.
*/
public final class LuceneLevenshteinDistance implements StringDistance {
@ -38,28 +44,24 @@ public final class LuceneLevenshteinDistance implements StringDistance {
IntsRef targetPoints;
IntsRef otherPoints;
int n;
int p[]; //'previous' cost array, horizontally
int d[]; // cost array, horizontally
int _d[]; //placeholder to assist in swapping p and d
int d[][]; // cost array
// cheaper to do this up front once
targetPoints = toIntsRef(target);
otherPoints = toIntsRef(other);
n = targetPoints.length;
p = new int[n+1];
d = new int[n+1];
final int m = otherPoints.length;
d = new int[n+1][m+1];
if (n == 0 || m == 0) {
if (n == m) {
return 1;
}
else {
return 0;
}
else {
return Math.max(n, m);
}
}
// indexes into strings s and t
int i; // iterates through s
int j; // iterates through t
@ -68,29 +70,29 @@ public final class LuceneLevenshteinDistance implements StringDistance {
int cost; // cost
for (i = 0; i <= n; i++) {
p[i] = i;
for (i = 0; i<=n; i++) {
d[i][0] = i;
}
for (j = 0; j<=m; j++) {
d[0][j] = j;
}
for (j = 1; j <= m; j++) {
t_j = otherPoints.ints[j - 1];
d[0] = j;
for (j = 1; j<=m; j++) {
t_j = otherPoints.ints[j-1];
for (i=1; i <= n; i++) {
cost = targetPoints.ints[i - 1] == t_j ? 0 : 1;
for (i=1; i<=n; i++) {
cost = targetPoints.ints[i-1]==t_j ? 0 : 1;
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
// transposition
if (i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {
d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
}
}
// copy current distance counts to 'previous row' distance counts
_d = p;
p = d;
d = _d;
}
// our last action in the above loop was to switch d and p, so p now
// actually has the most recent cost counts
return 1.0f - ((float) p[n] / Math.min(m, n));
return 1.0f - ((float) d[n][m] / Math.min(m, n));
}
private static IntsRef toIntsRef(String s) {
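Stepping back from the diff: the returned score scales the raw Damerau-Levenshtein distance by the shorter of the two terms, as described in the class javadoc above. A minimal sketch, with the demo class name illustrative only:

import org.apache.lucene.search.spell.LuceneLevenshteinDistance;
import org.apache.lucene.search.spell.StringDistance;

class DistanceScoreSketch {
  public static void main(String[] args) {
    StringDistance dist = new LuceneLevenshteinDistance();
    // "fvie" -> "five" is one transposition; the shorter term has length 4,
    // so the score is 1 - 1/4 = 0.75 (classic Levenshtein would count two substitutions).
    System.out.println(dist.getDistance("fvie", "five"));
  }
}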

View File

@ -214,4 +214,54 @@ public class TestDirectSpellChecker extends LuceneTestCase {
writer.close();
dir.close();
}
// simple test that transpositions work, we suggest five for fvie with ed=1
public void testTransposition() throws Exception {
DirectSpellChecker spellChecker = new DirectSpellChecker();
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, dir,
new MockAnalyzer(random, MockTokenizer.SIMPLE, true));
for (int i = 0; i < 20; i++) {
Document doc = new Document();
doc.add(newField("numbers", English.intToEnglish(i), TextField.TYPE_UNSTORED));
writer.addDocument(doc);
}
IndexReader ir = writer.getReader();
SuggestWord[] similar = spellChecker.suggestSimilar(new Term(
"numbers", "fvie"), 1, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(1, similar.length);
assertEquals("five", similar[0].string);
ir.close();
writer.close();
dir.close();
}
// simple test that transpositions work, we suggest seventeen for seevntene with ed=2
public void testTransposition2() throws Exception {
DirectSpellChecker spellChecker = new DirectSpellChecker();
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, dir,
new MockAnalyzer(random, MockTokenizer.SIMPLE, true));
for (int i = 0; i < 20; i++) {
Document doc = new Document();
doc.add(newField("numbers", English.intToEnglish(i), TextField.TYPE_UNSTORED));
writer.addDocument(doc);
}
IndexReader ir = writer.getReader();
SuggestWord[] similar = spellChecker.suggestSimilar(new Term(
"numbers", "seevntene"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(1, similar.length);
assertEquals("seventeen", similar[0].string);
ir.close();
writer.close();
dir.close();
}
}