mirror of https://github.com/apache/lucene.git
LUCENE-3662: extend LevenshteinAutomata to support transpositions as a primitive edit
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1224817 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
17438c9eac
commit
b096910a82
|
@ -396,6 +396,10 @@ New features
|
||||||
and FuzzyQuery with finite-state methods. Adds RegexpQuery.
|
and FuzzyQuery with finite-state methods. Adds RegexpQuery.
|
||||||
(Robert Muir, Mike McCandless, Uwe Schindler, Mark Miller)
|
(Robert Muir, Mike McCandless, Uwe Schindler, Mark Miller)
|
||||||
|
|
||||||
|
* LUCENE-3662: Add support for Levenshtein distance with transpositions
|
||||||
|
to LevenshteinAutomata, FuzzyTermsEnum, and DirectSpellChecker.
|
||||||
|
(Jean-Philippe Barrette-LaPierre, Robert Muir)
|
||||||
|
|
||||||
* LUCENE-2321: Cutover to a more RAM efficient packed-ints based
|
* LUCENE-2321: Cutover to a more RAM efficient packed-ints based
|
||||||
representation for the in-memory terms dict index. (Mike
|
representation for the in-memory terms dict index. (Mike
|
||||||
McCandless)
|
McCandless)
|
||||||
|
|
|
@ -480,7 +480,11 @@
|
||||||
<sequential>
|
<sequential>
|
||||||
<exec dir="src/java/org/apache/lucene/util/automaton"
|
<exec dir="src/java/org/apache/lucene/util/automaton"
|
||||||
executable="${python.exe}" failonerror="true">
|
executable="${python.exe}" failonerror="true">
|
||||||
<arg line="createLevAutomata.py @{n}"/>
|
<arg line="createLevAutomata.py @{n} True"/>
|
||||||
|
</exec>
|
||||||
|
<exec dir="src/java/org/apache/lucene/util/automaton"
|
||||||
|
executable="${python.exe}" failonerror="true">
|
||||||
|
<arg line="createLevAutomata.py @{n} False"/>
|
||||||
</exec>
|
</exec>
|
||||||
</sequential>
|
</sequential>
|
||||||
</macrodef>
|
</macrodef>
|
||||||
|
|
|
@ -194,7 +194,7 @@
|
||||||
|
|
||||||
<property name="hg.exe" value="hg" />
|
<property name="hg.exe" value="hg" />
|
||||||
<property name="moman.url" value="https://bitbucket.org/jpbarrette/moman" />
|
<property name="moman.url" value="https://bitbucket.org/jpbarrette/moman" />
|
||||||
<property name="moman.rev" value="115" />
|
<property name="moman.rev" value="120" />
|
||||||
<property name="python.exe" value="python" />
|
<property name="python.exe" value="python" />
|
||||||
|
|
||||||
<property name="gpg.exe" value="gpg" />
|
<property name="gpg.exe" value="gpg" />
|
||||||
|
|
|
@ -211,7 +211,7 @@ public class FuzzyLikeThisQuery extends Query
|
||||||
AttributeSource atts = new AttributeSource();
|
AttributeSource atts = new AttributeSource();
|
||||||
MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
||||||
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
||||||
FuzzyTermsEnum fe = new FuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()), atts, startTerm, f.minSimilarity, f.prefixLength);
|
FuzzyTermsEnum fe = new FuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()), atts, startTerm, f.minSimilarity, f.prefixLength, false);
|
||||||
//store the df so all variants use same idf
|
//store the df so all variants use same idf
|
||||||
int df = reader.docFreq(startTerm);
|
int df = reader.docFreq(startTerm);
|
||||||
int numVariants=0;
|
int numVariants=0;
|
||||||
|
|
|
@ -141,7 +141,10 @@ public class FuzzyQuery extends MultiTermQuery {
|
||||||
if (!termLongEnough) { // can only match if it's exact
|
if (!termLongEnough) { // can only match if it's exact
|
||||||
return new SingleTermsEnum(terms.iterator(null), term.bytes());
|
return new SingleTermsEnum(terms.iterator(null), term.bytes());
|
||||||
}
|
}
|
||||||
return new FuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength);
|
// TODO: should we expose the transpositions option to this query?
|
||||||
|
// maybe move the old/slowish stuff (lev without transpositions, n > 2, etc) all to contrib,
|
||||||
|
// deprecate it, and just have a faster/simpler/better one in core?
|
||||||
|
return new FuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -80,6 +80,8 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
||||||
private final int termText[];
|
private final int termText[];
|
||||||
private final int realPrefixLength;
|
private final int realPrefixLength;
|
||||||
|
|
||||||
|
private final boolean transpositions;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
|
* Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
|
||||||
* length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity >
|
* length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity >
|
||||||
|
@ -98,7 +100,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term,
|
public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term,
|
||||||
final float minSimilarity, final int prefixLength) throws IOException {
|
final float minSimilarity, final int prefixLength, boolean transpositions) throws IOException {
|
||||||
if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity)
|
if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity)
|
||||||
throw new IllegalArgumentException("fractional edit distances are not allowed");
|
throw new IllegalArgumentException("fractional edit distances are not allowed");
|
||||||
if (minSimilarity < 0.0f)
|
if (minSimilarity < 0.0f)
|
||||||
|
@ -130,6 +132,11 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
||||||
maxEdits = initialMaxDistance(this.minSimilarity, termLength);
|
maxEdits = initialMaxDistance(this.minSimilarity, termLength);
|
||||||
raw = false;
|
raw = false;
|
||||||
}
|
}
|
||||||
|
if (transpositions && maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
|
||||||
|
throw new UnsupportedOperationException("with transpositions enabled, distances > "
|
||||||
|
+ LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + " are not supported ");
|
||||||
|
}
|
||||||
|
this.transpositions = transpositions;
|
||||||
this.scale_factor = 1.0f / (1.0f - this.minSimilarity);
|
this.scale_factor = 1.0f / (1.0f - this.minSimilarity);
|
||||||
|
|
||||||
this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
||||||
|
@ -162,7 +169,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
||||||
if (runAutomata.size() <= maxDistance &&
|
if (runAutomata.size() <= maxDistance &&
|
||||||
maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
|
maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
|
||||||
LevenshteinAutomata builder =
|
LevenshteinAutomata builder =
|
||||||
new LevenshteinAutomata(UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength));
|
new LevenshteinAutomata(UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength), transpositions);
|
||||||
|
|
||||||
for (int i = runAutomata.size(); i <= maxDistance; i++) {
|
for (int i = runAutomata.size(); i <= maxDistance; i++) {
|
||||||
Automaton a = builder.toAutomaton(i);
|
Automaton a = builder.toAutomaton(i);
|
||||||
|
|
|
@ -89,26 +89,26 @@ class Lev1ParametricDescription extends ParametricDescription {
|
||||||
|
|
||||||
// 4 vectors; 5 states per vector; array length = 20
|
// 4 vectors; 5 states per vector; array length = 20
|
||||||
private final static long[] toStates2 = new long[] /*3 bits per value */ {
|
private final static long[] toStates2 = new long[] /*3 bits per value */ {
|
||||||
0x4da292442420003L
|
0x69a292450428003L
|
||||||
};
|
};
|
||||||
private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ {
|
private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ {
|
||||||
0x5555528000L
|
0x5555588000L
|
||||||
};
|
};
|
||||||
|
|
||||||
// 8 vectors; 5 states per vector; array length = 40
|
// 8 vectors; 5 states per vector; array length = 40
|
||||||
private final static long[] toStates3 = new long[] /*3 bits per value */ {
|
private final static long[] toStates3 = new long[] /*3 bits per value */ {
|
||||||
0x14d0812112018003L,0xb1a29b46d48a49L
|
0x1690a82152018003L,0xb1a2d346448a49L
|
||||||
};
|
};
|
||||||
private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ {
|
private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ {
|
||||||
0x555555e80a0f0000L,0x5555L
|
0x555555b8220f0000L,0x5555L
|
||||||
};
|
};
|
||||||
|
|
||||||
// state map
|
// state map
|
||||||
// 0 -> [(0, 0)]
|
// 0 -> [(0, 0)]
|
||||||
// 1 -> [(0, 1)]
|
// 1 -> [(0, 1)]
|
||||||
// 2 -> [(0, 1), (1, 1)]
|
// 2 -> [(0, 1), (1, 1)]
|
||||||
// 3 -> [(0, 1), (1, 1), (2, 1)]
|
// 3 -> [(0, 1), (2, 1)]
|
||||||
// 4 -> [(0, 1), (2, 1)]
|
// 4 -> [(0, 1), (1, 1), (2, 1)]
|
||||||
|
|
||||||
|
|
||||||
public Lev1ParametricDescription(int w) {
|
public Lev1ParametricDescription(int w) {
|
||||||
|
|
|
@ -0,0 +1,119 @@
|
||||||
|
package org.apache.lucene.util.automaton;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// The following code was generated with the moman/finenight pkg
|
||||||
|
// This package is available under the MIT License, see NOTICE.txt
|
||||||
|
// for more details.
|
||||||
|
|
||||||
|
import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
|
||||||
|
|
||||||
|
/** Parametric description for generating a Levenshtein automaton of degree 1,
|
||||||
|
with transpositions as primitive edits */
|
||||||
|
class Lev1TParametricDescription extends ParametricDescription {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
int transition(int absState, int position, int vector) {
|
||||||
|
// null absState should never be passed in
|
||||||
|
assert absState != -1;
|
||||||
|
|
||||||
|
// decode absState -> state, offset
|
||||||
|
int state = absState/(w+1);
|
||||||
|
int offset = absState%(w+1);
|
||||||
|
assert offset >= 0;
|
||||||
|
|
||||||
|
if (position == w) {
|
||||||
|
if (state < 2) {
|
||||||
|
final int loc = vector * 2 + state;
|
||||||
|
offset += unpack(offsetIncrs0, loc, 1);
|
||||||
|
state = unpack(toStates0, loc, 2)-1;
|
||||||
|
}
|
||||||
|
} else if (position == w-1) {
|
||||||
|
if (state < 3) {
|
||||||
|
final int loc = vector * 3 + state;
|
||||||
|
offset += unpack(offsetIncrs1, loc, 1);
|
||||||
|
state = unpack(toStates1, loc, 2)-1;
|
||||||
|
}
|
||||||
|
} else if (position == w-2) {
|
||||||
|
if (state < 6) {
|
||||||
|
final int loc = vector * 6 + state;
|
||||||
|
offset += unpack(offsetIncrs2, loc, 2);
|
||||||
|
state = unpack(toStates2, loc, 3)-1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (state < 6) {
|
||||||
|
final int loc = vector * 6 + state;
|
||||||
|
offset += unpack(offsetIncrs3, loc, 2);
|
||||||
|
state = unpack(toStates3, loc, 3)-1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (state == -1) {
|
||||||
|
// null state
|
||||||
|
return -1;
|
||||||
|
} else {
|
||||||
|
// translate back to abs
|
||||||
|
return state*(w+1)+offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 1 vectors; 2 states per vector; array length = 2
|
||||||
|
private final static long[] toStates0 = new long[] /*2 bits per value */ {
|
||||||
|
0x2L
|
||||||
|
};
|
||||||
|
private final static long[] offsetIncrs0 = new long[] /*1 bits per value */ {
|
||||||
|
0x0L
|
||||||
|
};
|
||||||
|
|
||||||
|
// 2 vectors; 3 states per vector; array length = 6
|
||||||
|
private final static long[] toStates1 = new long[] /*2 bits per value */ {
|
||||||
|
0xa43L
|
||||||
|
};
|
||||||
|
private final static long[] offsetIncrs1 = new long[] /*1 bits per value */ {
|
||||||
|
0x38L
|
||||||
|
};
|
||||||
|
|
||||||
|
// 4 vectors; 6 states per vector; array length = 24
|
||||||
|
private final static long[] toStates2 = new long[] /*3 bits per value */ {
|
||||||
|
0x3453491482140003L,0x6dL
|
||||||
|
};
|
||||||
|
private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ {
|
||||||
|
0x555555a20000L
|
||||||
|
};
|
||||||
|
|
||||||
|
// 8 vectors; 6 states per vector; array length = 48
|
||||||
|
private final static long[] toStates3 = new long[] /*3 bits per value */ {
|
||||||
|
0x21520854900c0003L,0x5b4d19a24534916dL,0xda34L
|
||||||
|
};
|
||||||
|
private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ {
|
||||||
|
0x5555ae0a20fc0000L,0x55555555L
|
||||||
|
};
|
||||||
|
|
||||||
|
// state map
|
||||||
|
// 0 -> [(0, 0)]
|
||||||
|
// 1 -> [(0, 1)]
|
||||||
|
// 2 -> [(0, 1), (1, 1)]
|
||||||
|
// 3 -> [(0, 1), (2, 1)]
|
||||||
|
// 4 -> [t(0, 1), (0, 1), (1, 1), (2, 1)]
|
||||||
|
// 5 -> [(0, 1), (1, 1), (2, 1)]
|
||||||
|
|
||||||
|
|
||||||
|
public Lev1TParametricDescription(int w) {
|
||||||
|
super(w, 1, new int[] {0,1,0,-1,-1,-1});
|
||||||
|
}
|
||||||
|
}
|
|
@ -93,7 +93,7 @@ class Lev2ParametricDescription extends ParametricDescription {
|
||||||
|
|
||||||
// 2 vectors; 5 states per vector; array length = 10
|
// 2 vectors; 5 states per vector; array length = 10
|
||||||
private final static long[] toStates1 = new long[] /*3 bits per value */ {
|
private final static long[] toStates1 = new long[] /*3 bits per value */ {
|
||||||
0x1a68c105L
|
0x13688b44L
|
||||||
};
|
};
|
||||||
private final static long[] offsetIncrs1 = new long[] /*1 bits per value */ {
|
private final static long[] offsetIncrs1 = new long[] /*1 bits per value */ {
|
||||||
0x3e0L
|
0x3e0L
|
||||||
|
@ -101,41 +101,41 @@ class Lev2ParametricDescription extends ParametricDescription {
|
||||||
|
|
||||||
// 4 vectors; 11 states per vector; array length = 44
|
// 4 vectors; 11 states per vector; array length = 44
|
||||||
private final static long[] toStates2 = new long[] /*4 bits per value */ {
|
private final static long[] toStates2 = new long[] /*4 bits per value */ {
|
||||||
0x6280b80804280405L,0x2323432321608282L,0x523434543213L
|
0x26a09a0a0520a504L,0x2323523321a260a2L,0x354235543213L
|
||||||
};
|
};
|
||||||
private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ {
|
private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ {
|
||||||
0x5555502220000800L,0x555555L
|
0x5555520280000800L,0x555555L
|
||||||
};
|
};
|
||||||
|
|
||||||
// 8 vectors; 21 states per vector; array length = 168
|
// 8 vectors; 21 states per vector; array length = 168
|
||||||
private final static long[] toStates3 = new long[] /*5 bits per value */ {
|
private final static long[] toStates3 = new long[] /*5 bits per value */ {
|
||||||
0x40300c0108801005L,0x80202a8208801000L,0x4021006280a0288dL,0x30482184802d8414L,
|
0x380e014a051404L,0xe28245009451140L,0x8a26880098a6268cL,0x180a288ca0246213L,
|
||||||
0x5990240880010460L,0x191a28118330900L,0x310c413204c1104L,0x8625084811c4710dL,
|
0x494053284a1080e1L,0x510265a89c311940L,0x4218c41188a6509cL,0x6340c4211c4710dL,
|
||||||
0xa92a398e2188231aL,0x104e351c4a508ca4L,0x21208511c8341483L,0xe6290620946a1910L,
|
0xa168398471882a12L,0x104c841c683a0425L,0x3294472904351483L,0xe6290620a84a20d0L,
|
||||||
0xd47221423216a4a0L,0x28L
|
0x1441a0ea2896a4a0L,0x32L
|
||||||
};
|
};
|
||||||
private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ {
|
private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ {
|
||||||
0x33300030c2000800L,0x32828088800c3cfL,0x5555550cace32320L,0x5555555555555555L,
|
0x33300230c0000800L,0x220ca080a00fc330L,0x555555f832823380L,0x5555555555555555L,
|
||||||
0x5555555555555555L,0x5555L
|
0x5555555555555555L,0x5555L
|
||||||
};
|
};
|
||||||
|
|
||||||
// 16 vectors; 30 states per vector; array length = 480
|
// 16 vectors; 30 states per vector; array length = 480
|
||||||
private final static long[] toStates4 = new long[] /*5 bits per value */ {
|
private final static long[] toStates4 = new long[] /*5 bits per value */ {
|
||||||
0x80300c0108801005L,0x88210802000L,0x44200401400000L,0x7ae3b88621185c07L,
|
0x380e014a051404L,0xaa015452940L,0x55014501000000L,0x1843ddc771085c07L,
|
||||||
0x101500042100404L,0x20803140501446cL,0x40100420006c2122L,0x490140511b004054L,
|
0x7141200040108405L,0x52b44004c5313460L,0x401080200063115cL,0x85314c4d181c5048L,
|
||||||
0x8401f2e3c086411L,0x120861200b100822L,0x641102400081180cL,0x4802c40100001088L,
|
0x1440190a3e5c7828L,0x28a232809100a21L,0xa028ca2a84203846L,0xca0240010800108aL,
|
||||||
0x8c21195607048418L,0x1421014245bc3f2L,0x23450230661200b1L,0x2108664118240803L,
|
0xc7b4205c1580a508L,0x1021090251846b6L,0x4cb513862328090L,0x210863128ca2b8a2L,
|
||||||
0x8c1984802c802004L,0xbc3e28c41150d140L,0xc4120102209421dL,0x7884c11c4710d031L,
|
0x4e188ca024402940L,0xa6b6c7c520532d4L,0x8c41101451150219L,0xa0c4211c4710d421L,
|
||||||
0x210842109031bc62L,0xd21484360c431044L,0x9c265293a3a6e741L,0x1cc710c41109ce70L,
|
0x2108421094e15063L,0x8f13c43708631044L,0x18274d908c611631L,0x1cc238c411098263L,
|
||||||
0x1bce27a846525495L,0x3105425094a108c7L,0x6f735e95254731c4L,0x9ee7a9c234a9393aL,
|
0x450e3a1d0212d0b4L,0x31050242048108c6L,0xfa318b42d07308eL,0xa8865182356907c6L,
|
||||||
0x144720d0520c4150L,0x211051bc646084c2L,0x3614831048220842L,0x93a460e742351488L,
|
0x1ca410d4520c4140L,0x2954e13883a0ca51L,0x3714831044229442L,0x93946116b58f2c84L,
|
||||||
0xc4120a2e70a24656L,0x284642d4941cc520L,0x4094a210c51bce46L,0xb525073148310502L,
|
0xc41109a5631a574dL,0x1d4512d4941cc520L,0x52848294c643883aL,0xb525073148310502L,
|
||||||
0x24356939460f7358L,0x4098e7aaL
|
0xa5356939460f7358L,0x409ca651L
|
||||||
};
|
};
|
||||||
private final static long[] offsetIncrs4 = new long[] /*3 bits per value */ {
|
private final static long[] offsetIncrs4 = new long[] /*3 bits per value */ {
|
||||||
0xc0602000010000L,0xa000040000000001L,0x248204041248L,0xb0180c06c3618618L,
|
0x20c0600000010000L,0x2000040000000001L,0x209204a40209L,0x301b6c0618018618L,
|
||||||
0x238d861860001861L,0x41040061c6e06041L,0x4004900c2402400L,0x409489001041001L,
|
0x207206186000186cL,0x1200061b8e06dc0L,0x480492080612010L,0xa20204a040048000L,
|
||||||
0x4184184004148124L,0x1041b4980c24c3L,0xd26040938d061061L,0x2492492492494146L,
|
0x1061a0000129124L,0x1848349b680612L,0xd26da0204a041868L,0x2492492492496128L,
|
||||||
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
|
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
|
||||||
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
|
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
|
||||||
0x2492492492492492L,0x9249249249249249L,0x24924924L
|
0x2492492492492492L,0x9249249249249249L,0x24924924L
|
||||||
|
@ -143,33 +143,33 @@ class Lev2ParametricDescription extends ParametricDescription {
|
||||||
|
|
||||||
// 32 vectors; 30 states per vector; array length = 960
|
// 32 vectors; 30 states per vector; array length = 960
|
||||||
private final static long[] toStates5 = new long[] /*5 bits per value */ {
|
private final static long[] toStates5 = new long[] /*5 bits per value */ {
|
||||||
0x80300c0108801005L,0x88210802000L,0x42200401400000L,0xa088201000300c03L,
|
0x380e014a051404L,0xaa015452940L,0x8052814501000000L,0xb80a515450000e03L,
|
||||||
0x100510842108428L,0x2188461701c01108L,0x108401011eb8eeL,0x85c0700442004014L,
|
0x5140410842108426L,0x71dc421701c01540L,0x100421014610f7L,0x85c0700550145010L,
|
||||||
0x88267ae3b886211L,0x1446c01015108842L,0xc212202080314050L,0x405440100420006L,
|
0x94a271843ddc7710L,0x1346071412108a22L,0x3115c52b44004c53L,0xc504840108020006L,
|
||||||
0x10201c50140511b0L,0x942528423b08888L,0x240501446c010155L,0x21007cb8f0219045L,
|
0x54d1001314c4d181L,0x9081204239c4a71L,0x14c5313460714124L,0x51006428f971e0a2L,
|
||||||
0x511b004054402088L,0x2e3c086411490140L,0x200b50904428823fL,0x400081180c120861L,
|
0x4d181c5048402884L,0xa3e5c782885314cL,0x2809409482a8a239L,0x2a84203846028a23L,
|
||||||
0x100001088641102L,0x46030482184802c4L,0x9ce8990840980030L,0x21061200b709c210L,
|
0x10800108aa028caL,0xe1180a288ca0240L,0x98c6b80e3294a108L,0x2942328091098c10L,
|
||||||
0xf0fca308465581c1L,0x802c405084050916L,0xc211956070484184L,0x9e4209ee65bc3f28L,
|
0x11adb1ed08170560L,0xa024004084240946L,0x7b4205c1580a508cL,0xa8c2968c71846b6cL,
|
||||||
0x3450230661200b70L,0x1086641182408032L,0xc1984802c8020042L,0x86098201c8d1408L,
|
0x4cb5138623280910L,0x10863128ca2b8a20L,0xe188ca0244029402L,0x4e3294e288132d44L,
|
||||||
0xb88a22529ce399L,0x1045434502306612L,0x4088250876f0f8a3L,0xd1408c1984802c80L,
|
0x809409ad1218c39cL,0xf14814cb51386232L,0x514454086429adb1L,0x32d44e188ca02440L,
|
||||||
0xee3dbc3e28c41150L,0xd0310c4188984429L,0xbc627884c11c4710L,0x1044210842109031L,
|
0x8c390a6b6c7c5205L,0xd4218c41409cd2aaL,0x5063a0c4211c4710L,0x10442108421094e1L,
|
||||||
0x21704711c4340c43L,0xbdef7bdf0c7a18b4L,0x85210d8310c41ef7L,0x994a4e8e9b9d074L,
|
0x31084711c4350863L,0xbdef7bddf05918f2L,0xc4f10dc218c41ef7L,0x9d3642318458c63L,
|
||||||
0x60c4310442739c27L,0x3a3a6e741d214843L,0x41ef77bdf77de529L,0x8465254951cc710cL,
|
0x70863104426098c6L,0x8c6116318f13c43L,0x41ef75dd6b5de4d9L,0xd0212d0b41cc238cL,
|
||||||
0x94a108c71bce27aL,0x5254731c43105425L,0xdb1c7a38b4a15949L,0xc710c41cf73dce7bL,
|
0x2048108c6450e3a1L,0x42d07308e3105024L,0xdb591938f274084bL,0xc238c41f77deefbbL,
|
||||||
0xe4e9bdcd7a54951cL,0x5427b9ea708d2a4L,0x735e95254731c431L,0xbd677db4a9393a6fL,
|
0x1f183e8c62d0b41cL,0x502a2194608d5a4L,0xa318b42d07308e31L,0xed675db56907c60fL,
|
||||||
0x4720d0520c41cf75L,0x1051bc646084c214L,0x1483104822084221L,0x193821708511c834L,
|
0xa410d4520c41f773L,0x54e13883a0ca511cL,0x1483104422944229L,0x20f2329447290435L,
|
||||||
0x1bf6fdef6f7f147aL,0xd08d45220d8520c4L,0x9c289195a4e91839L,0x488361483104828bL,
|
0x1ef6f7ef6f7df05cL,0xad63cb210dc520c4L,0x58c695d364e51845L,0xc843714831044269L,
|
||||||
0xe5693a460e742351L,0x520c41bf71bdf717L,0xe46284642d4941ccL,0x5024094a210c51bcL,
|
0xe4d93946116b58f2L,0x520c41ef717d6b17L,0x83a1d4512d4941ccL,0x50252848294c6438L,
|
||||||
0x590b525073148310L,0xce6f7b147a3938a1L,0x941cc520c41f77ddL,0xd5a4e5183dcd62d4L,
|
0x144b525073148310L,0xefaf7b591c20f275L,0x941cc520c41f777bL,0xd5a4e5183dcd62d4L,
|
||||||
0x48310502639ea890L,0x460f7358b5250731L,0xf779bd6717b56939L
|
0x4831050272994694L,0x460f7358b5250731L,0xf779bd6717b56939L
|
||||||
};
|
};
|
||||||
private final static long[] offsetIncrs5 = new long[] /*3 bits per value */ {
|
private final static long[] offsetIncrs5 = new long[] /*3 bits per value */ {
|
||||||
0xc0602000010000L,0x8000040000000001L,0xb6db6d4030180L,0x810104922800010L,
|
0x20c0600000010000L,0x40000000001L,0xb6db6d4830180L,0x4812900824800010L,
|
||||||
0x248a000040000092L,0x618000b649654041L,0x861b0180c06c3618L,0x301b0d861860001L,
|
0x2092000040000082L,0x618000b659254a40L,0x86c301b6c0618018L,0xdb01860061860001L,
|
||||||
0x61861800075d6ed6L,0x1871b8181048e3L,0xe56041238d861860L,0x40240041040075c6L,
|
0x81861800075baed6L,0x186e381b70081cL,0xe56dc02072061860L,0x61201001200075b8L,
|
||||||
0x4100104004900c2L,0x55b5240309009001L,0x1025224004104005L,0x10410010520490L,
|
0x480000480492080L,0x52b5248201848040L,0x880812810012000bL,0x4004800004a4492L,
|
||||||
0x55495240409489L,0x4980c24c34184184L,0x30d061061001041bL,0x184005556d260309L,
|
0xb529124a20204aL,0x49b68061201061a0L,0x8480418680018483L,0x1a000752ad26da01L,
|
||||||
0x51b4981024e34184L,0x40938d0610610010L,0x492492495546d260L,0x2492492492492492L,
|
0x4a349b6808128106L,0xa0204a0418680018L,0x492492497528d26dL,0x2492492492492492L,
|
||||||
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
|
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
|
||||||
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
|
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
|
||||||
0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,
|
0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,
|
||||||
|
@ -182,36 +182,36 @@ class Lev2ParametricDescription extends ParametricDescription {
|
||||||
// 0 -> [(0, 0)]
|
// 0 -> [(0, 0)]
|
||||||
// 1 -> [(0, 2)]
|
// 1 -> [(0, 2)]
|
||||||
// 2 -> [(0, 1)]
|
// 2 -> [(0, 1)]
|
||||||
// 3 -> [(0, 2), (1, 2)]
|
// 3 -> [(0, 1), (1, 1)]
|
||||||
// 4 -> [(0, 1), (1, 1)]
|
// 4 -> [(0, 2), (1, 2)]
|
||||||
// 5 -> [(0, 2), (2, 1)]
|
// 5 -> [(0, 2), (2, 1)]
|
||||||
// 6 -> [(0, 1), (2, 2)]
|
// 6 -> [(0, 1), (2, 2)]
|
||||||
// 7 -> [(0, 2), (1, 2), (2, 2)]
|
// 7 -> [(0, 2), (2, 2)]
|
||||||
// 8 -> [(0, 1), (2, 1)]
|
// 8 -> [(0, 1), (1, 1), (2, 1)]
|
||||||
// 9 -> [(0, 2), (2, 2)]
|
// 9 -> [(0, 2), (1, 2), (2, 2)]
|
||||||
// 10 -> [(0, 1), (1, 1), (2, 1)]
|
// 10 -> [(0, 1), (2, 1)]
|
||||||
// 11 -> [(0, 2), (1, 2), (2, 2), (3, 2)]
|
// 11 -> [(0, 2), (3, 2)]
|
||||||
// 12 -> [(0, 2), (2, 1), (3, 1)]
|
// 12 -> [(0, 2), (1, 2), (3, 2)]
|
||||||
// 13 -> [(0, 2), (3, 2)]
|
// 13 -> [(0, 2), (1, 2), (2, 2), (3, 2)]
|
||||||
// 14 -> [(0, 2), (2, 2), (3, 2)]
|
// 14 -> [(0, 1), (2, 2), (3, 2)]
|
||||||
// 15 -> [(0, 2), (1, 2), (3, 1)]
|
// 15 -> [(0, 2), (3, 1)]
|
||||||
// 16 -> [(0, 2), (1, 2), (3, 2)]
|
// 16 -> [(0, 1), (3, 2)]
|
||||||
// 17 -> [(0, 1), (2, 2), (3, 2)]
|
// 17 -> [(0, 1), (1, 1), (3, 2)]
|
||||||
// 18 -> [(0, 2), (3, 1)]
|
// 18 -> [(0, 2), (1, 2), (3, 1)]
|
||||||
// 19 -> [(0, 1), (3, 2)]
|
// 19 -> [(0, 2), (2, 2), (3, 2)]
|
||||||
// 20 -> [(0, 1), (1, 1), (3, 2)]
|
// 20 -> [(0, 2), (2, 1), (3, 1)]
|
||||||
// 21 -> [(0, 2), (2, 1), (4, 2)]
|
// 21 -> [(0, 2), (2, 1), (4, 2)]
|
||||||
// 22 -> [(0, 2), (1, 2), (4, 2)]
|
// 22 -> [(0, 2), (1, 2), (4, 2)]
|
||||||
// 23 -> [(0, 2), (1, 2), (3, 2), (4, 2)]
|
// 23 -> [(0, 2), (1, 2), (3, 2), (4, 2)]
|
||||||
// 24 -> [(0, 2), (2, 2), (4, 2)]
|
// 24 -> [(0, 2), (2, 2), (3, 2), (4, 2)]
|
||||||
// 25 -> [(0, 2), (2, 2), (3, 2), (4, 2)]
|
// 25 -> [(0, 2), (3, 2), (4, 2)]
|
||||||
// 26 -> [(0, 2), (3, 2), (4, 2)]
|
// 26 -> [(0, 2), (1, 2), (2, 2), (4, 2)]
|
||||||
// 27 -> [(0, 2), (1, 2), (2, 2), (3, 2), (4, 2)]
|
// 27 -> [(0, 2), (1, 2), (2, 2), (3, 2), (4, 2)]
|
||||||
// 28 -> [(0, 2), (4, 2)]
|
// 28 -> [(0, 2), (4, 2)]
|
||||||
// 29 -> [(0, 2), (1, 2), (2, 2), (4, 2)]
|
// 29 -> [(0, 2), (2, 2), (4, 2)]
|
||||||
|
|
||||||
|
|
||||||
public Lev2ParametricDescription(int w) {
|
public Lev2ParametricDescription(int w) {
|
||||||
super(w, 2, new int[] {0,2,1,1,0,-1,0,0,-1,0,-1,-1,-2,-1,-1,-2,-1,-1,-2,-1,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2});
|
super(w, 2, new int[] {0,2,1,0,1,-1,0,0,-1,0,-1,-1,-1,-1,-1,-2,-1,-1,-2,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,264 @@
|
||||||
|
package org.apache.lucene.util.automaton;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// The following code was generated with the moman/finenight pkg
|
||||||
|
// This package is available under the MIT License, see NOTICE.txt
|
||||||
|
// for more details.
|
||||||
|
|
||||||
|
import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
|
||||||
|
|
||||||
|
/** Parametric description for generating a Levenshtein automaton of degree 2,
|
||||||
|
with transpositions as primitive edits */
|
||||||
|
class Lev2TParametricDescription extends ParametricDescription {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
int transition(int absState, int position, int vector) {
|
||||||
|
// null absState should never be passed in
|
||||||
|
assert absState != -1;
|
||||||
|
|
||||||
|
// decode absState -> state, offset
|
||||||
|
int state = absState/(w+1);
|
||||||
|
int offset = absState%(w+1);
|
||||||
|
assert offset >= 0;
|
||||||
|
|
||||||
|
if (position == w) {
|
||||||
|
if (state < 3) {
|
||||||
|
final int loc = vector * 3 + state;
|
||||||
|
offset += unpack(offsetIncrs0, loc, 1);
|
||||||
|
state = unpack(toStates0, loc, 2)-1;
|
||||||
|
}
|
||||||
|
} else if (position == w-1) {
|
||||||
|
if (state < 5) {
|
||||||
|
final int loc = vector * 5 + state;
|
||||||
|
offset += unpack(offsetIncrs1, loc, 1);
|
||||||
|
state = unpack(toStates1, loc, 3)-1;
|
||||||
|
}
|
||||||
|
} else if (position == w-2) {
|
||||||
|
if (state < 13) {
|
||||||
|
final int loc = vector * 13 + state;
|
||||||
|
offset += unpack(offsetIncrs2, loc, 2);
|
||||||
|
state = unpack(toStates2, loc, 4)-1;
|
||||||
|
}
|
||||||
|
} else if (position == w-3) {
|
||||||
|
if (state < 28) {
|
||||||
|
final int loc = vector * 28 + state;
|
||||||
|
offset += unpack(offsetIncrs3, loc, 2);
|
||||||
|
state = unpack(toStates3, loc, 5)-1;
|
||||||
|
}
|
||||||
|
} else if (position == w-4) {
|
||||||
|
if (state < 45) {
|
||||||
|
final int loc = vector * 45 + state;
|
||||||
|
offset += unpack(offsetIncrs4, loc, 3);
|
||||||
|
state = unpack(toStates4, loc, 6)-1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (state < 45) {
|
||||||
|
final int loc = vector * 45 + state;
|
||||||
|
offset += unpack(offsetIncrs5, loc, 3);
|
||||||
|
state = unpack(toStates5, loc, 6)-1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (state == -1) {
|
||||||
|
// null state
|
||||||
|
return -1;
|
||||||
|
} else {
|
||||||
|
// translate back to abs
|
||||||
|
return state*(w+1)+offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 1 vectors; 3 states per vector; array length = 3
|
||||||
|
private final static long[] toStates0 = new long[] /*2 bits per value */ {
|
||||||
|
0x23L
|
||||||
|
};
|
||||||
|
private final static long[] offsetIncrs0 = new long[] /*1 bits per value */ {
|
||||||
|
0x0L
|
||||||
|
};
|
||||||
|
|
||||||
|
// 2 vectors; 5 states per vector; array length = 10
|
||||||
|
private final static long[] toStates1 = new long[] /*3 bits per value */ {
|
||||||
|
0x13688b44L
|
||||||
|
};
|
||||||
|
private final static long[] offsetIncrs1 = new long[] /*1 bits per value */ {
|
||||||
|
0x3e0L
|
||||||
|
};
|
||||||
|
|
||||||
|
// 4 vectors; 13 states per vector; array length = 52
|
||||||
|
private final static long[] toStates2 = new long[] /*4 bits per value */ {
|
||||||
|
0x60dbb0b05200b504L,0x5233217627062227L,0x2355543214323235L,0x4354L
|
||||||
|
};
|
||||||
|
private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ {
|
||||||
|
0x555080a800002000L,0x5555555555L
|
||||||
|
};
|
||||||
|
|
||||||
|
// 8 vectors; 28 states per vector; array length = 224
|
||||||
|
private final static long[] toStates3 = new long[] /*5 bits per value */ {
|
||||||
|
0xe701c02940059404L,0xa010162000a50000L,0xb02c8c40a1416288L,0xa821032310858c0L,
|
||||||
|
0x314423980d28b201L,0x5281e528847788e0L,0xa23980d308c2280eL,0x1e3294b1a962278cL,
|
||||||
|
0x8c41309e2288e528L,0x11444409021aca21L,0x11a4624886b1086bL,0x2a6258941d6240c4L,
|
||||||
|
0x5024a50b489074adL,0x14821aca520c411aL,0x5888b5890b594a44L,0x941d6520c411a465L,
|
||||||
|
0x8b589075ad6a62d4L,0x1a5055a4L
|
||||||
|
};
|
||||||
|
private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ {
|
||||||
|
0x30c30200002000L,0x2a0030f3c3fc333cL,0x233a00328282a820L,0x5555555532b283a8L,
|
||||||
|
0x5555555555555555L,0x5555555555555555L,0x5555555555555555L
|
||||||
|
};
|
||||||
|
|
||||||
|
// 16 vectors; 45 states per vector; array length = 720
|
||||||
|
private final static long[] toStates4 = new long[] /*6 bits per value */ {
|
||||||
|
0x3801450002c5004L,0xc500014b00000e38L,0x51451401402L,0x0L,
|
||||||
|
0x518000b14010000L,0x9f1c20828e20230L,0x219f0df0830a70c2L,0x8200008208208200L,
|
||||||
|
0x805050160800800L,0x3082098602602643L,0x4564014250508064L,0x850051420000831L,
|
||||||
|
0x4140582085002082L,0x456180980990c201L,0x8316d0c50a01051L,0x21451420050df0e0L,
|
||||||
|
0xd14214014508214L,0x3c21c01850821c60L,0x1cb1403cb142087L,0x800821451851822cL,
|
||||||
|
0x20020820800020L,0xd006182087180345L,0xcb0a81cb24976b09L,0x8b1a60e624709d1L,
|
||||||
|
0x249082082249089L,0xc31421c600d2c024L,0x3c31451515454423L,0x31853c22c21cb140L,
|
||||||
|
0x4514500b2c208214L,0x8718034508b0051L,0xb2cb45515108f0c5L,0xe824715d1cb0a810L,
|
||||||
|
0x1422cb14908b0e60L,0x30812c22c02cb145L,0x842022020cb1420cL,0x5c20ce0820ce0850L,
|
||||||
|
0x208208208b0d70c2L,0x4208508214214208L,0x920834050830c20L,0xc6134dc613653592L,
|
||||||
|
0xd309341c6dc4db4dL,0x6424d90854d34d34L,0x92072c22030814c2L,0x4220724b24a30930L,
|
||||||
|
0x2470d72025c920e2L,0x92c92d70975c9082L,0xcb0880c204924e08L,0x45739728c24c2481L,
|
||||||
|
0xc6da4db5da6174daL,0x4b5d35d75d30971dL,0x1030815c93825ce2L,0x51442051020cb145L,
|
||||||
|
0xc538210e2c220e2cL,0x851421452cb0d70L,0x204b085085145142L,0x921560834051440cL,
|
||||||
|
0x4d660e4da60e6595L,0x94d914e41c6dc658L,0x826426591454d365L,0x2892072c51030813L,
|
||||||
|
0xe2c22072cb2ca30bL,0x452c70d720538910L,0x8b2cb2d708e3891L,0x81cb1440c204b24eL,
|
||||||
|
0xda44e38e28c2ca24L,0x1dc6da6585d660e4L,0xe2cb5d338e5d914eL,0x38938238L
|
||||||
|
};
|
||||||
|
private final static long[] offsetIncrs4 = new long[] /*3 bits per value */ {
|
||||||
|
0x3002000000080000L,0x20c060L,0x8149000004000000L,0x4024924110824824L,
|
||||||
|
0xdb6030c360002082L,0x6c36c06c301b0d80L,0xb01861b0000db0dbL,0x1b7036209188e06dL,
|
||||||
|
0x800920006d86db7L,0x4920c2402402490L,0x49000208249009L,0x4908128128124804L,
|
||||||
|
0x34800104124a44a2L,0xc30930900d24020cL,0x40009a0924c24d24L,0x4984a069201061aL,
|
||||||
|
0x494d049271269262L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
|
||||||
|
0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,
|
||||||
|
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
|
||||||
|
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
|
||||||
|
0x2492492492492492L,0x249249249249L
|
||||||
|
};
|
||||||
|
|
||||||
|
// 32 vectors; 45 states per vector; array length = 1440
|
||||||
|
private final static long[] toStates5 = new long[] /*6 bits per value */ {
|
||||||
|
0x3801450002c5004L,0xc500014b00000e38L,0x51451401402L,0x0L,
|
||||||
|
0x514000b14010000L,0x550000038e00e0L,0x264518500600b180L,0x8208208208208208L,
|
||||||
|
0x2c50040820820L,0x70820a38808c0146L,0xc37c20c29c30827cL,0x20820820800867L,
|
||||||
|
0xb140102002002080L,0x828e202300518000L,0x830a70c209f1c20L,0x51451450853df0dfL,
|
||||||
|
0x1614214214508214L,0x6026026430805050L,0x2505080643082098L,0x4200008314564014L,
|
||||||
|
0x850020820850051L,0x80990c2014140582L,0x8201920208261809L,0x892051990060941L,
|
||||||
|
0x22492492c22cb242L,0x430805050162492cL,0x8041451586026026L,0x37c38020c5b43142L,
|
||||||
|
0x4208508514508014L,0x141405850850051L,0x51456180980990c2L,0xe008316d0c50a010L,
|
||||||
|
0x2c52cb2c508b21f0L,0x600d2c92c22cb249L,0x873c21c01850821cL,0x2c01cb1403cb1420L,
|
||||||
|
0x2080082145185182L,0x4500200208208000L,0x870061420871803L,0x740500f5050821cfL,
|
||||||
|
0x934d964618609000L,0x4c24d34d30824d30L,0x1860821c600d642L,0xc2a072c925dac274L,
|
||||||
|
0x2c69839891c27472L,0x9242082089242242L,0x8208718034b00900L,0x1cb24976b09d0061L,
|
||||||
|
0x60e624709d1cb0a8L,0xd31455d71574ce3eL,0x1c600d3825c25d74L,0x51515454423c3142L,
|
||||||
|
0xc22c21cb1403c314L,0xb2c20821431853L,0x34508b005145145L,0x5515108f0c508718L,
|
||||||
|
0x8740500f2051454L,0xe2534d920618f090L,0x493826596592c238L,0x4423c31421c600d6L,
|
||||||
|
0x72c2a042cb2d1545L,0x422c3983a091c574L,0xb2c514508b2c52L,0xf0c508718034b08bL,
|
||||||
|
0xa810b2cb45515108L,0x2260e824715d1cb0L,0xe6592c538e2d74ceL,0x420c308138938238L,
|
||||||
|
0x850842022020cb1L,0x70c25c20ce0820ceL,0x4208208208208b0dL,0xc20420850821421L,
|
||||||
|
0x21080880832c5083L,0xa50838820838c214L,0xaaaaaaaaa9c39430L,0x1aaa7eaa9fa9faaaL,
|
||||||
|
0x824820d01420c308L,0x7184d37184d94d64L,0x34c24d071b7136d3L,0x990936421534d34dL,
|
||||||
|
0x834050830c20530L,0x34dc613653592092L,0xa479c6dc4db4dc61L,0x920a9f924924924aL,
|
||||||
|
0x72c220308192a82aL,0x724b24a30930920L,0xd72025c920e2422L,0x92d70975c9082247L,
|
||||||
|
0x880c204924e0892cL,0x2c928c24c2481cb0L,0x80a5248889088749L,0x6a861b2aaac74394L,
|
||||||
|
0x81b2ca6ab27b278L,0xa3093092072c2203L,0xd76985d36915ce5cL,0x5d74c25c771b6936L,
|
||||||
|
0x724e0973892d74d7L,0x4c2481cb0880c205L,0x6174da45739728c2L,0x4aa175c6da4db5daL,
|
||||||
|
0x6a869b2786486186L,0xcb14510308186caL,0x220e2c5144205102L,0xcb0d70c538210e2cL,
|
||||||
|
0x1451420851421452L,0x51440c204b085085L,0xcb1451081440832cL,0x94316208488b0888L,
|
||||||
|
0xfaaa7dfa9f7e79c3L,0x30819ea7ea7df7dL,0x6564855820d01451L,0x9613598393698399L,
|
||||||
|
0xd965364539071b71L,0x4e0990996451534L,0x21560834051440c2L,0xd660e4da60e65959L,
|
||||||
|
0x9207e979c6dc6584L,0xa82a8207df924820L,0x892072c5103081a6L,0x2c22072cb2ca30b2L,
|
||||||
|
0x52c70d720538910eL,0x8b2cb2d708e38914L,0x1cb1440c204b24e0L,0x874b2cb28c2ca248L,
|
||||||
|
0x4394816224488b08L,0x9e786aa69b1f7e77L,0x51030819eca6a9e7L,0x8e38a30b2892072cL,
|
||||||
|
0x6996175983936913L,0x74ce39764538771bL,0xc204e24e08e38b2dL,0x28c2ca2481cb1440L,
|
||||||
|
0x85d660e4da44e38eL,0x698607e975c6da65L,0xa6ca6aa699e7864aL
|
||||||
|
};
|
||||||
|
private final static long[] offsetIncrs5 = new long[] /*3 bits per value */ {
|
||||||
|
0x3002000000080000L,0x20c060L,0x100000004000000L,0xdb6db6db50603018L,
|
||||||
|
0xa480000200002db6L,0x1249208841241240L,0x4000010000104120L,0x2492c42092092052L,
|
||||||
|
0xc30d800096592d9L,0xb01b0c06c36036d8L,0x186c00036c36db0dL,0xad860361b01b6c06L,
|
||||||
|
0x360001b75b6dd6ddL,0xc412311c0db6030cL,0xdb0db6e36e06L,0x9188e06db01861bL,
|
||||||
|
0x6dd6db71b72b62L,0x4024024900800920L,0x20824900904920c2L,0x1201248040049000L,
|
||||||
|
0x5524ad4aa4906120L,0x4092402002480015L,0x9252251248409409L,0x4920100124000820L,
|
||||||
|
0x29128924204a04a0L,0x900830d200055549L,0x934930c24c24034L,0x418690002682493L,
|
||||||
|
0x9a49861261201a48L,0xc348001355249d4L,0x24c40930940d2402L,0x1a40009a0924e24dL,
|
||||||
|
0x6204984a06920106L,0x92494d5492712692L,0x4924924924924924L,0x2492492492492492L,
|
||||||
|
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
|
||||||
|
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
|
||||||
|
0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,
|
||||||
|
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
|
||||||
|
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
|
||||||
|
0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,
|
||||||
|
0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
|
||||||
|
0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x24924924L
|
||||||
|
};
|
||||||
|
|
||||||
|
// state map
|
||||||
|
// 0 -> [(0, 0)]
|
||||||
|
// 1 -> [(0, 2)]
|
||||||
|
// 2 -> [(0, 1)]
|
||||||
|
// 3 -> [(0, 1), (1, 1)]
|
||||||
|
// 4 -> [(0, 2), (1, 2)]
|
||||||
|
// 5 -> [t(0, 2), (0, 2), (1, 2), (2, 2)]
|
||||||
|
// 6 -> [(0, 2), (2, 1)]
|
||||||
|
// 7 -> [(0, 1), (2, 2)]
|
||||||
|
// 8 -> [(0, 2), (2, 2)]
|
||||||
|
// 9 -> [(0, 1), (1, 1), (2, 1)]
|
||||||
|
// 10 -> [(0, 2), (1, 2), (2, 2)]
|
||||||
|
// 11 -> [(0, 1), (2, 1)]
|
||||||
|
// 12 -> [t(0, 1), (0, 1), (1, 1), (2, 1)]
|
||||||
|
// 13 -> [(0, 2), (1, 2), (2, 2), (3, 2)]
|
||||||
|
// 14 -> [t(0, 2), (0, 2), (1, 2), (2, 2), (3, 2)]
|
||||||
|
// 15 -> [(0, 2), t(1, 2), (1, 2), (2, 2), (3, 2)]
|
||||||
|
// 16 -> [(0, 2), (2, 1), (3, 1)]
|
||||||
|
// 17 -> [(0, 1), t(1, 2), (2, 2), (3, 2)]
|
||||||
|
// 18 -> [(0, 2), (3, 2)]
|
||||||
|
// 19 -> [(0, 2), (1, 2), t(1, 2), (2, 2), (3, 2)]
|
||||||
|
// 20 -> [t(0, 2), (0, 2), (1, 2), (3, 1)]
|
||||||
|
// 21 -> [(0, 1), (1, 1), (3, 2)]
|
||||||
|
// 22 -> [(0, 2), (2, 2), (3, 2)]
|
||||||
|
// 23 -> [(0, 2), (1, 2), (3, 1)]
|
||||||
|
// 24 -> [(0, 2), (1, 2), (3, 2)]
|
||||||
|
// 25 -> [(0, 1), (2, 2), (3, 2)]
|
||||||
|
// 26 -> [(0, 2), (3, 1)]
|
||||||
|
// 27 -> [(0, 1), (3, 2)]
|
||||||
|
// 28 -> [(0, 2), (2, 1), (4, 2)]
|
||||||
|
// 29 -> [(0, 2), t(1, 2), (1, 2), (2, 2), (3, 2), (4, 2)]
|
||||||
|
// 30 -> [(0, 2), (1, 2), (4, 2)]
|
||||||
|
// 31 -> [(0, 2), (1, 2), (3, 2), (4, 2)]
|
||||||
|
// 32 -> [(0, 2), (2, 2), (3, 2), (4, 2)]
|
||||||
|
// 33 -> [(0, 2), (1, 2), t(2, 2), (2, 2), (3, 2), (4, 2)]
|
||||||
|
// 34 -> [(0, 2), (1, 2), (2, 2), t(2, 2), (3, 2), (4, 2)]
|
||||||
|
// 35 -> [(0, 2), (3, 2), (4, 2)]
|
||||||
|
// 36 -> [(0, 2), t(2, 2), (2, 2), (3, 2), (4, 2)]
|
||||||
|
// 37 -> [t(0, 2), (0, 2), (1, 2), (2, 2), (4, 2)]
|
||||||
|
// 38 -> [(0, 2), (1, 2), (2, 2), (4, 2)]
|
||||||
|
// 39 -> [t(0, 2), (0, 2), (1, 2), (2, 2), (3, 2), (4, 2)]
|
||||||
|
// 40 -> [(0, 2), (1, 2), (2, 2), (3, 2), (4, 2)]
|
||||||
|
// 41 -> [(0, 2), (4, 2)]
|
||||||
|
// 42 -> [t(0, 2), (0, 2), (1, 2), (2, 2), t(2, 2), (3, 2), (4, 2)]
|
||||||
|
// 43 -> [(0, 2), (2, 2), (4, 2)]
|
||||||
|
// 44 -> [(0, 2), (1, 2), t(1, 2), (2, 2), (3, 2), (4, 2)]
|
||||||
|
|
||||||
|
|
||||||
|
// Creates the description for an input word of length w (callers pass word.length).
// The literal 2 is presumably the maximum edit distance n these tables were generated for — confirm
// against the ParametricDescription constructor.
// The int[] carries one value per entry of the state map listed above; each value equals
// min(e - i) over that state's (i, e) positions (e.g. state 3 -> [(0, 1), (1, 1)] gives 0,
// state 6 -> [(0, 2), (2, 1)] gives -1), which is the generator script's minErrors computation.
public Lev2TParametricDescription(int w) {
|
||||||
|
super(w, 2, new int[] {0,2,1,0,1,0,-1,0,0,-1,0,-1,-1,-1,-1,-1,-2,-1,-1,-1,-2,-1,-1,-2,-1,-1,-2,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2});
|
||||||
|
}
|
||||||
|
}
|
|
@ -47,8 +47,9 @@ public class LevenshteinAutomata {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new LevenshteinAutomata for some input String.
|
* Create a new LevenshteinAutomata for some input String.
|
||||||
|
* Optionally count transpositions as a primitive edit.
|
||||||
*/
|
*/
|
||||||
public LevenshteinAutomata(String input) {
|
public LevenshteinAutomata(String input, boolean withTranspositions) {
|
||||||
this.input = input;
|
this.input = input;
|
||||||
int length = Character.codePointCount(input, 0, input.length());
|
int length = Character.codePointCount(input, 0, input.length());
|
||||||
word = new int[length];
|
word = new int[length];
|
||||||
|
@ -88,8 +89,8 @@ public class LevenshteinAutomata {
|
||||||
|
|
||||||
descriptions = new ParametricDescription[] {
|
descriptions = new ParametricDescription[] {
|
||||||
null, /* for n=0, we do not need to go through the trouble */
|
null, /* for n=0, we do not need to go through the trouble */
|
||||||
new Lev1ParametricDescription(word.length),
|
withTranspositions ? new Lev1TParametricDescription(word.length) : new Lev1ParametricDescription(word.length),
|
||||||
new Lev2ParametricDescription(word.length),
|
withTranspositions ? new Lev2TParametricDescription(word.length) : new Lev2ParametricDescription(word.length),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -13,7 +13,7 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
# Note, this file is known to work with rev 115 of the moman
|
# Note, this file is known to work with rev 120 of the moman
|
||||||
# repository (http://bitbucket.org/jpbarrette/moman/overview)
|
# repository (http://bitbucket.org/jpbarrette/moman/overview)
|
||||||
#
|
#
|
||||||
# See also: http://sites.google.com/site/rrettesite/moman
|
# See also: http://sites.google.com/site/rrettesite/moman
|
||||||
|
@ -95,9 +95,9 @@ def charVarNumber(charVar):
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
||||||
if len(sys.argv) != 2:
|
if len(sys.argv) != 3:
|
||||||
print
|
print
|
||||||
print 'Usage: python -u %s N' % sys.argv[0]
|
print 'Usage: python -u %s N <True/False>' % sys.argv[0]
|
||||||
print
|
print
|
||||||
print 'NOTE: the resulting .java file is created in the current working dir!'
|
print 'NOTE: the resulting .java file is created in the current working dir!'
|
||||||
print
|
print
|
||||||
|
@ -105,7 +105,9 @@ def main():
|
||||||
|
|
||||||
n = int(sys.argv[1])
|
n = int(sys.argv[1])
|
||||||
|
|
||||||
tables = genTransitions(n)
|
transpose = (sys.argv[2] == "True")
|
||||||
|
|
||||||
|
tables = genTransitions(n, transpose)
|
||||||
|
|
||||||
stateMap = {}
|
stateMap = {}
|
||||||
|
|
||||||
|
@ -142,6 +144,11 @@ def main():
|
||||||
w('')
|
w('')
|
||||||
w('import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;')
|
w('import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;')
|
||||||
w('')
|
w('')
|
||||||
|
if transpose:
|
||||||
|
w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n)
|
||||||
|
w(' with transpositions as primitive edits */')
|
||||||
|
className = 'Lev%dTParametricDescription' % n
|
||||||
|
else:
|
||||||
w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n)
|
w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n)
|
||||||
className = 'Lev%dParametricDescription' % n
|
className = 'Lev%dParametricDescription' % n
|
||||||
|
|
||||||
|
@ -201,9 +208,6 @@ def main():
|
||||||
byAction = {}
|
byAction = {}
|
||||||
for s, (toS, offset) in l:
|
for s, (toS, offset) in l:
|
||||||
state = str(s)
|
state = str(s)
|
||||||
if state == '[]':
|
|
||||||
# don't waste code on the null state
|
|
||||||
continue
|
|
||||||
|
|
||||||
toState = str(toS)
|
toState = str(toS)
|
||||||
if state not in stateMap:
|
if state not in stateMap:
|
||||||
|
@ -213,7 +217,7 @@ def main():
|
||||||
|
|
||||||
byFromState[stateMap[state]] = (1+stateMap[toState], offset)
|
byFromState[stateMap[state]] = (1+stateMap[toState], offset)
|
||||||
|
|
||||||
fromStateDesc = ', '.join([str(x) for x in eval(s)])
|
fromStateDesc = s[1:len(s)-1]
|
||||||
toStateDesc = ', '.join([str(x) for x in toS])
|
toStateDesc = ', '.join([str(x) for x in toS])
|
||||||
|
|
||||||
tup = (stateMap[toState], toStateDesc, offset)
|
tup = (stateMap[toState], toStateDesc, offset)
|
||||||
|
@ -222,10 +226,10 @@ def main():
|
||||||
byAction[tup].append((fromStateDesc, stateMap[state]))
|
byAction[tup].append((fromStateDesc, stateMap[state]))
|
||||||
|
|
||||||
if numCasesPerVector is None:
|
if numCasesPerVector is None:
|
||||||
numCasesPerVector = len(l)-1
|
numCasesPerVector = len(l)
|
||||||
else:
|
else:
|
||||||
# we require this to be uniform... empirically it seems to be!
|
# we require this to be uniform... empirically it seems to be!
|
||||||
assert numCasesPerVector == len(l)-1
|
assert numCasesPerVector == len(l)
|
||||||
|
|
||||||
if MODE == 'array':
|
if MODE == 'array':
|
||||||
|
|
||||||
|
@ -320,7 +324,10 @@ def main():
|
||||||
minErrors = []
|
minErrors = []
|
||||||
for i in xrange(len(stateMap2)-1):
|
for i in xrange(len(stateMap2)-1):
|
||||||
w('// %s -> %s' % (i, stateMap2[i]))
|
w('// %s -> %s' % (i, stateMap2[i]))
|
||||||
v = eval(stateMap2[i])
|
# we strip the t-notation as it's not relevant here
|
||||||
|
st = stateMap2[i].replace('t', '')
|
||||||
|
|
||||||
|
v = eval(st)
|
||||||
minError = min([-i+e for i, e in v])
|
minError = min([-i+e for i, e in v])
|
||||||
c = len(v)
|
c = len(v)
|
||||||
sum += c
|
sum += c
|
||||||
|
|
|
@ -41,7 +41,7 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
||||||
|
|
||||||
// LUCENE-3094
|
// LUCENE-3094
|
||||||
public void testNoWastedStates() throws Exception {
|
public void testNoWastedStates() throws Exception {
|
||||||
AutomatonTestUtil.assertNoDetachedStates(new LevenshteinAutomata("abc").toAutomaton(1));
|
AutomatonTestUtil.assertNoDetachedStates(new LevenshteinAutomata("abc", false).toAutomaton(1));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -64,31 +64,46 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
||||||
* up to some maximum distance.
|
* up to some maximum distance.
|
||||||
*/
|
*/
|
||||||
private void assertLev(String s, int maxDistance) {
|
private void assertLev(String s, int maxDistance) {
|
||||||
LevenshteinAutomata builder = new LevenshteinAutomata(s);
|
LevenshteinAutomata builder = new LevenshteinAutomata(s, false);
|
||||||
|
LevenshteinAutomata tbuilder = new LevenshteinAutomata(s, true);
|
||||||
Automaton automata[] = new Automaton[maxDistance + 1];
|
Automaton automata[] = new Automaton[maxDistance + 1];
|
||||||
|
Automaton tautomata[] = new Automaton[maxDistance + 1];
|
||||||
for (int n = 0; n < automata.length; n++) {
|
for (int n = 0; n < automata.length; n++) {
|
||||||
automata[n] = builder.toAutomaton(n);
|
automata[n] = builder.toAutomaton(n);
|
||||||
|
tautomata[n] = tbuilder.toAutomaton(n);
|
||||||
assertNotNull(automata[n]);
|
assertNotNull(automata[n]);
|
||||||
|
assertNotNull(tautomata[n]);
|
||||||
assertTrue(automata[n].isDeterministic());
|
assertTrue(automata[n].isDeterministic());
|
||||||
|
assertTrue(tautomata[n].isDeterministic());
|
||||||
assertTrue(SpecialOperations.isFinite(automata[n]));
|
assertTrue(SpecialOperations.isFinite(automata[n]));
|
||||||
|
assertTrue(SpecialOperations.isFinite(tautomata[n]));
|
||||||
AutomatonTestUtil.assertNoDetachedStates(automata[n]);
|
AutomatonTestUtil.assertNoDetachedStates(automata[n]);
|
||||||
|
AutomatonTestUtil.assertNoDetachedStates(tautomata[n]);
|
||||||
// check that the dfa for n-1 accepts a subset of the dfa for n
|
// check that the dfa for n-1 accepts a subset of the dfa for n
|
||||||
if (n > 0) {
|
if (n > 0) {
|
||||||
assertTrue(automata[n-1].subsetOf(automata[n]));
|
assertTrue(automata[n-1].subsetOf(automata[n]));
|
||||||
|
assertTrue(automata[n-1].subsetOf(tautomata[n]));
|
||||||
|
assertTrue(tautomata[n-1].subsetOf(automata[n]));
|
||||||
|
assertTrue(tautomata[n-1].subsetOf(tautomata[n]));
|
||||||
assertNotSame(automata[n-1], automata[n]);
|
assertNotSame(automata[n-1], automata[n]);
|
||||||
}
|
}
|
||||||
|
// check that Lev(N) is a subset of LevT(N)
|
||||||
|
assertTrue(automata[n].subsetOf(tautomata[n]));
|
||||||
// special checks for specific n
|
// special checks for specific n
|
||||||
switch(n) {
|
switch(n) {
|
||||||
case 0:
|
case 0:
|
||||||
// easy, matches the string itself
|
// easy, matches the string itself
|
||||||
assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString(s), automata[0]));
|
assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString(s), automata[0]));
|
||||||
|
assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString(s), tautomata[0]));
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
// generate a lev1 naively, and check the accepted lang is the same.
|
// generate a lev1 naively, and check the accepted lang is the same.
|
||||||
assertTrue(BasicOperations.sameLanguage(naiveLev1(s), automata[1]));
|
assertTrue(BasicOperations.sameLanguage(naiveLev1(s), automata[1]));
|
||||||
|
assertTrue(BasicOperations.sameLanguage(naiveLev1T(s), tautomata[1]));
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
assertBruteForce(s, automata[n], n);
|
assertBruteForce(s, automata[n], n);
|
||||||
|
assertBruteForceT(s, tautomata[n], n);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -110,6 +125,17 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return an automaton that accepts all 1-character insertions, deletions,
|
||||||
|
* substitutions, and transpositions of s.
|
||||||
|
*/
|
||||||
|
private Automaton naiveLev1T(String s) {
|
||||||
|
// start from the distance-1 language built without transpositions
Automaton a = naiveLev1(s);
|
||||||
|
// add every string obtained by swapping one pair of adjacent characters of s
a = BasicOperations.union(a, transpositionsOf(s));
|
||||||
|
// the return value of minimize is not used; presumably it minimizes a in place — confirm
MinimizationOperations.minimize(a);
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return an automaton that accepts all 1-character insertions of s (inserting
|
* Return an automaton that accepts all 1-character insertions of s (inserting
|
||||||
* one character)
|
* one character)
|
||||||
|
@ -170,6 +196,29 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return an automaton that accepts all transpositions of s
|
||||||
|
* (transposing two adjacent characters)
|
||||||
|
*/
|
||||||
|
private Automaton transpositionsOf(String s) {
|
||||||
|
// fewer than two chars: there is no adjacent pair to swap, so the language is empty
if (s.length() < 2)
|
||||||
|
return BasicAutomata.makeEmpty();
|
||||||
|
List<Automaton> list = new ArrayList<Automaton>();
|
||||||
|
// i is the index of the left character of the pair being swapped
for (int i = 0; i < s.length()-1; i++) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
// unchanged prefix before the pair
sb.append(s.substring(0, i));
|
||||||
|
// the pair in swapped order; charAt works on UTF-16 code units, not code points
sb.append(s.charAt(i+1));
|
||||||
|
sb.append(s.charAt(i));
|
||||||
|
// unchanged suffix after the pair
sb.append(s.substring(i+2, s.length()));
|
||||||
|
String st = sb.toString();
|
||||||
|
// swapping two equal characters reproduces s itself; such candidates are left out
if (!st.equals(s))
|
||||||
|
list.add(BasicAutomata.makeString(st));
|
||||||
|
}
|
||||||
|
// NOTE(review): list is empty when all adjacent characters are equal (e.g. "aa"); this relies on
// union of an empty collection producing an automaton that accepts nothing — confirm
Automaton a = BasicOperations.union(list);
|
||||||
|
// the return value of minimize is not used; presumably it minimizes a in place — confirm
MinimizationOperations.minimize(a);
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
private void assertBruteForce(String input, Automaton dfa, int distance) {
|
private void assertBruteForce(String input, Automaton dfa, int distance) {
|
||||||
CharacterRunAutomaton ra = new CharacterRunAutomaton(dfa);
|
CharacterRunAutomaton ra = new CharacterRunAutomaton(dfa);
|
||||||
int maxLen = input.length() + distance + 1;
|
int maxLen = input.length() + distance + 1;
|
||||||
|
@ -185,6 +234,21 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Brute-force check of dfa against getTDistance: for every integer i in [0, 2^maxLen) the
 * base-2 string of i is run through the automaton; accepted strings must be within
 * distance of input, rejected strings must be farther away.
 */
private void assertBruteForceT(String input, Automaton dfa, int distance) {
|
||||||
|
CharacterRunAutomaton ra = new CharacterRunAutomaton(dfa);
|
||||||
|
// candidates are at most one character longer than input.length() + distance
int maxLen = input.length() + distance + 1;
|
||||||
|
// the (int) cast of a double clamps to Integer.MAX_VALUE, so this only equals 2^maxLen for maxLen <= 30
int maxNum = (int) Math.pow(2, maxLen);
|
||||||
|
for (int i = 0; i < maxNum; i++) {
|
||||||
|
// Integer.toString(i, 2) has no leading zeros, so only "0" and strings starting with '1' are tried
String encoded = Integer.toString(i, 2);
|
||||||
|
boolean accepts = ra.run(encoded);
|
||||||
|
if (accepts) {
|
||||||
|
assertTrue(getTDistance(input, encoded) <= distance);
|
||||||
|
} else {
|
||||||
|
assertTrue(getTDistance(input, encoded) > distance);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//*****************************
|
//*****************************
|
||||||
// Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
|
// Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
|
||||||
//*****************************
|
//*****************************
|
||||||
|
@ -260,4 +324,58 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
||||||
// actually has the most recent cost counts
|
// actually has the most recent cost counts
|
||||||
return Math.abs(p[n]);
|
return Math.abs(p[n]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reference edit distance between target and other in which insertion, deletion,
 * substitution and the swap of two adjacent characters are each a single edit.
 * Works on UTF-16 chars and fills a full (n+1) x (m+1) cost table.
 *
 * @param target first string (rows of the table)
 * @param other second string (columns of the table)
 * @return the value of the last table cell, d[n][m]; if either string is empty, the length of the other
 */
private int getTDistance(String target, String other) {
|
||||||
|
char[] sa;
|
||||||
|
int n;
|
||||||
|
int d[][]; // cost array: d[i][j] = distance between the first i chars of target and the first j chars of other
|
||||||
|
|
||||||
|
sa = target.toCharArray();
|
||||||
|
n = sa.length;
|
||||||
|
final int m = other.length();
|
||||||
|
d = new int[n+1][m+1];
|
||||||
|
|
||||||
|
// an empty string is max(n, m) edits away from the other one (0 when both are empty)
if (n == 0 || m == 0) {
|
||||||
|
if (n == m) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return Math.max(n, m);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// indexes into strings target and other
|
||||||
|
int i; // iterates through target
|
||||||
|
int j; // iterates through other
|
||||||
|
|
||||||
|
char t_j; // jth character of other
|
||||||
|
|
||||||
|
int cost; // substitution cost: 0 if the current characters match, else 1
|
||||||
|
|
||||||
|
// first column: turning the first i chars of target into the empty string takes i deletions
for (i = 0; i<=n; i++) {
|
||||||
|
d[i][0] = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
// first row: turning the empty string into the first j chars of other takes j insertions
for (j = 0; j<=m; j++) {
|
||||||
|
d[0][j] = j;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (j = 1; j<=m; j++) {
|
||||||
|
t_j = other.charAt(j-1);
|
||||||
|
|
||||||
|
for (i=1; i<=n; i++) {
|
||||||
|
cost = sa[i-1]==t_j ? 0 : 1;
|
||||||
|
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
|
||||||
|
d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
|
||||||
|
// transposition: the last two chars of each prefix are the same pair in swapped order.
// cost is reused as the price of the swap; it can only be 0 here when all four chars are equal.
|
||||||
|
if (i > 1 && j > 1 && target.charAt(i-1) == other.charAt(j-2) && target.charAt(i-2) == other.charAt(j-1)) {
|
||||||
|
d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// the bottom-right cell holds the distance between the complete strings;
|
||||||
|
// table entries are built only from non-negative values, so Math.abs does not change the result
|
||||||
|
return Math.abs(d[n][m]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -56,20 +56,11 @@ import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public class DirectSpellChecker {
|
public class DirectSpellChecker {
|
||||||
/** The default StringDistance, Levenshtein distance implemented internally
|
/** The default StringDistance, Damerau-Levenshtein distance implemented internally
|
||||||
* via {@link LevenshteinAutomata}.
|
* via {@link LevenshteinAutomata}.
|
||||||
* <p>
|
* <p>
|
||||||
* Note: this is the fastest distance metric, because Levenshtein is used
|
* Note: this is the fastest distance metric, because Damerau-Levenshtein is used
|
||||||
* to draw candidates from the term dictionary: this just re-uses the scoring.
|
* to draw candidates from the term dictionary: this just re-uses the scoring.
|
||||||
* <p>
|
|
||||||
* Note also that this metric differs in subtle ways from {@link LevensteinDistance}:
|
|
||||||
* <ul>
|
|
||||||
* <li> This metric treats full unicode codepoints as characters, but
|
|
||||||
* LevenshteinDistance calculates based on UTF-16 code units.
|
|
||||||
* <li> This metric scales raw edit distances into a floating point score
|
|
||||||
* differently than LevenshteinDistance: the scaling is based upon the
|
|
||||||
* shortest of the two terms instead of the longest.
|
|
||||||
* </ul>
|
|
||||||
*/
|
*/
|
||||||
public static final StringDistance INTERNAL_LEVENSHTEIN = new LuceneLevenshteinDistance();
|
public static final StringDistance INTERNAL_LEVENSHTEIN = new LuceneLevenshteinDistance();
|
||||||
|
|
||||||
|
@ -277,8 +268,8 @@ public class DirectSpellChecker {
|
||||||
* Set the string distance metric.
|
* Set the string distance metric.
|
||||||
* The default is {@link #INTERNAL_LEVENSHTEIN}
|
* The default is {@link #INTERNAL_LEVENSHTEIN}
|
||||||
* <p>
|
* <p>
|
||||||
* Note: because this spellchecker draws its candidates from the
|
* Note: because this spellchecker draws its candidates from the term
|
||||||
* term dictionary using Levenshtein, it works best with an edit-distance-like
|
* dictionary using Damerau-Levenshtein, it works best with an edit-distance-like
|
||||||
* string metric. If you use a different metric than the default,
|
* string metric. If you use a different metric than the default,
|
||||||
* you might want to consider increasing {@link #setMaxInspections(int)}
|
* you might want to consider increasing {@link #setMaxInspections(int)}
|
||||||
* to draw more candidates for your metric to rank.
|
* to draw more candidates for your metric to rank.
|
||||||
|
@ -401,7 +392,7 @@ public class DirectSpellChecker {
|
||||||
if (terms == null) {
|
if (terms == null) {
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance-1));
|
FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance-1), true);
|
||||||
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
|
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
|
||||||
|
|
||||||
BytesRef queryTerm = new BytesRef(term.text());
|
BytesRef queryTerm = new BytesRef(term.text());
|
||||||
|
|
|
@ -20,16 +20,22 @@ package org.apache.lucene.search.spell;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Levenshtein implemented in a consistent way as Lucene's FuzzyTermsEnum.
|
* Damerau-Levenshtein (optimal string alignment) implemented in a way
|
||||||
|
* consistent with Lucene's FuzzyTermsEnum with the transpositions option enabled.
|
||||||
*
|
*
|
||||||
* Note also that this metric differs in subtle ways from {@link LevensteinDistance}:
|
* Notes:
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li> This metric treats full unicode codepoints as characters, but
|
* <li> This metric treats full unicode codepoints as characters
|
||||||
* LevenshteinDistance calculates based on UTF-16 code units.
|
|
||||||
* <li> This metric scales raw edit distances into a floating point score
|
* <li> This metric scales raw edit distances into a floating point score
|
||||||
* differently than LevenshteinDistance: the scaling is based upon the
|
* based upon the shortest of the two terms
|
||||||
* shortest of the two terms instead of the longest.
|
* <li> Transpositions of two adjacent codepoints are treated as primitive
|
||||||
|
* edits.
|
||||||
|
* <li> Edits are applied in parallel: for example, "ab" and "bca" have
|
||||||
|
* distance 3.
|
||||||
* </ul>
|
* </ul>
|
||||||
|
*
|
||||||
|
* NOTE: this class is not particularly efficient. It is only intended
|
||||||
|
* for merging results from multiple DirectSpellCheckers.
|
||||||
*/
|
*/
|
||||||
public final class LuceneLevenshteinDistance implements StringDistance {
|
public final class LuceneLevenshteinDistance implements StringDistance {
|
||||||
|
|
||||||
|
@ -38,27 +44,23 @@ public final class LuceneLevenshteinDistance implements StringDistance {
|
||||||
IntsRef targetPoints;
|
IntsRef targetPoints;
|
||||||
IntsRef otherPoints;
|
IntsRef otherPoints;
|
||||||
int n;
|
int n;
|
||||||
int p[]; //'previous' cost array, horizontally
|
int d[][]; // cost array
|
||||||
int d[]; // cost array, horizontally
|
|
||||||
int _d[]; //placeholder to assist in swapping p and d
|
|
||||||
|
|
||||||
// cheaper to do this up front once
|
// cheaper to do this up front once
|
||||||
targetPoints = toIntsRef(target);
|
targetPoints = toIntsRef(target);
|
||||||
otherPoints = toIntsRef(other);
|
otherPoints = toIntsRef(other);
|
||||||
n = targetPoints.length;
|
n = targetPoints.length;
|
||||||
p = new int[n+1];
|
|
||||||
d = new int[n+1];
|
|
||||||
|
|
||||||
final int m = otherPoints.length;
|
final int m = otherPoints.length;
|
||||||
|
d = new int[n+1][m+1];
|
||||||
|
|
||||||
if (n == 0 || m == 0) {
|
if (n == 0 || m == 0) {
|
||||||
if (n == m) {
|
if (n == m) {
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
return Math.max(n, m);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// indexes into strings s and t
|
// indexes into strings s and t
|
||||||
int i; // iterates through s
|
int i; // iterates through s
|
||||||
|
@ -69,28 +71,28 @@ public final class LuceneLevenshteinDistance implements StringDistance {
|
||||||
int cost; // cost
|
int cost; // cost
|
||||||
|
|
||||||
for (i = 0; i<=n; i++) {
|
for (i = 0; i<=n; i++) {
|
||||||
p[i] = i;
|
d[i][0] = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (j = 0; j<=m; j++) {
|
||||||
|
d[0][j] = j;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (j = 1; j<=m; j++) {
|
for (j = 1; j<=m; j++) {
|
||||||
t_j = otherPoints.ints[j-1];
|
t_j = otherPoints.ints[j-1];
|
||||||
d[0] = j;
|
|
||||||
|
|
||||||
for (i=1; i<=n; i++) {
|
for (i=1; i<=n; i++) {
|
||||||
cost = targetPoints.ints[i-1]==t_j ? 0 : 1;
|
cost = targetPoints.ints[i-1]==t_j ? 0 : 1;
|
||||||
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
|
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
|
||||||
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
|
d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
|
||||||
|
// transposition
|
||||||
|
if (i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {
|
||||||
|
d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy current distance counts to 'previous row' distance counts
|
return 1.0f - ((float) d[n][m] / Math.min(m, n));
|
||||||
_d = p;
|
|
||||||
p = d;
|
|
||||||
d = _d;
|
|
||||||
}
|
|
||||||
|
|
||||||
// our last action in the above loop was to switch d and p, so p now
|
|
||||||
// actually has the most recent cost counts
|
|
||||||
return 1.0f - ((float) p[n] / Math.min(m, n));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static IntsRef toIntsRef(String s) {
|
private static IntsRef toIntsRef(String s) {
|
||||||
|
|
|
@ -214,4 +214,54 @@ public class TestDirectSpellChecker extends LuceneTestCase {
|
||||||
writer.close();
|
writer.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// simple test that transpositions work, we suggest five for fvie with ed=1
|
||||||
|
public void testTransposition() throws Exception {
|
||||||
|
DirectSpellChecker spellChecker = new DirectSpellChecker();
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random, dir,
|
||||||
|
new MockAnalyzer(random, MockTokenizer.SIMPLE, true));
|
||||||
|
|
||||||
|
for (int i = 0; i < 20; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newField("numbers", English.intToEnglish(i), TextField.TYPE_UNSTORED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
IndexReader ir = writer.getReader();
|
||||||
|
|
||||||
|
SuggestWord[] similar = spellChecker.suggestSimilar(new Term(
|
||||||
|
"numbers", "fvie"), 1, ir,
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
|
assertEquals(1, similar.length);
|
||||||
|
assertEquals("five", similar[0].string);
|
||||||
|
ir.close();
|
||||||
|
writer.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
// simple test that transpositions work, we suggest seventeen for seevntene with ed=2
|
||||||
|
public void testTransposition2() throws Exception {
|
||||||
|
DirectSpellChecker spellChecker = new DirectSpellChecker();
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random, dir,
|
||||||
|
new MockAnalyzer(random, MockTokenizer.SIMPLE, true));
|
||||||
|
|
||||||
|
for (int i = 0; i < 20; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newField("numbers", English.intToEnglish(i), TextField.TYPE_UNSTORED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
IndexReader ir = writer.getReader();
|
||||||
|
|
||||||
|
SuggestWord[] similar = spellChecker.suggestSimilar(new Term(
|
||||||
|
"numbers", "seevntene"), 2, ir,
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
|
assertEquals(1, similar.length);
|
||||||
|
assertEquals("seventeen", similar[0].string);
|
||||||
|
ir.close();
|
||||||
|
writer.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue