LUCENE-10646: Add some comment on LevenshteinAutomata (#1016)

* add comments on Lev & prettify toDot

* use the auto-generation script to add the comments

* update checksum

* update checksum

* restore toDot

* add removeDeadStates in levAutomata

Co-authored-by: tangdonghai <tangdonghai@meituan.com>
This commit is contained in:
tang donghai 2022-08-07 22:01:30 +08:00 committed by GitHub
parent bd0718f071
commit b08e34722d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 182 additions and 25 deletions

View File

@ -27,6 +27,73 @@ MODE = 'array'
PACKED = True
WORD = 64
LOG2_WORD = int(math.log(WORD) / math.log(2))
HEADER_COMMENT = '''/*
Parametric transitions for LEV1.
char vector State 0 State 1 State 2 State 3 State 4
(0,0) (2, 0) (-1, 0) (-1, 0) (-1, 0) (-1, 0)
(0,1) (3, 0) (-1, 0) (1, 2) (1, 2) (-1, 0)
(1,0) (0, 1) (1, 1) (1, 1) (1, 1) (1, 1)
(1,1) (0, 1) (1, 1) (2, 1) (2, 1) (1, 1)
char vector is the characteristic vectors in the paper.
entry (i,j) in the table means next transitions state is i, next offset is j + currentOffset if we meet the according char vector.
When i = -1,it means an empty state.
We store this table in toState and offsetIncr.
toState = [ i+1 | for entry in entries].
offsetIncrs = [j | for entry in entries].
*/'''
STATE0_COMMENT = '''/*
* 1 vectors; 2 states per vector; array length = 2
* Parametric transitions for LEV1 (position = w)
*
* char vector State 0 State 1
*
* () (1, 0) (-1, 0)
*
*/'''
STATE1_COMMENT = '''/*
* 2 vectors; 3 states per vector; array length = 6
* Parametric transitions for LEV1 (position = w-1)
*
* char vector State 0 State 1 State 2
*
* (0) (2, 0) (-1, 0) (-1, 0)
* (1) (0, 1) (1, 1) (1, 1)
*
*/'''
STATE2_COMMENT = '''/*
* 4 vectors; 5 states per vector; array length = 20
* Parametric transitions for LEV1 ( position == w-2 )
*
* char vector State 0 State 1 State 2 State 3 State 4
*
* (0,0) (2, 0) (-1, 0) (-1, 0) (-1, 0) (-1, 0)
* (0,1) (3, 0) (-1, 0) (1, 2) (1, 2) (-1, 0)
* (1,0) (0, 1) (1, 1) (1, 1) (1, 1) (1, 1)
* (1,1) (0, 1) (1, 1) (2, 1) (2, 1) (1, 1)
*
*/'''
STATE3_COMMENT = '''/*
* 8 vectors; 5 states per vector; array length = 40
* Parametric transitions for LEV1 (0 <= position <= w-3 )
*
* char vector State 0 State 1 State 2 State 3 State 4
*
* (0,0,0) (2, 0) (-1, 0) (-1, 0) (-1, 0) (-1, 0)
* (0,0,1) (2, 0) (-1, 0) (-1, 0) (1, 3) (1, 3)
* (0,1,0) (3, 0) (-1, 0) (1, 2) (1, 2) (-1, 0)
* (0,1,1) (3, 0) (-1, 0) (1, 2) (2, 2) (1, 3)
* (1,0,0) (0, 1) (1, 1) (1, 1) (1, 1) (1, 1)
* (1,0,1) (0, 1) (1, 1) (1, 1) (4, 1) (4, 1)
* (1,1,0) (0, 1) (1, 1) (2, 1) (2, 1) (1, 1)
* (1,1,1) (0, 1) (1, 1) (2, 1) (3, 1) (4, 1)
*
*/'''
STATE_COMMENT = [STATE0_COMMENT, STATE1_COMMENT, STATE2_COMMENT, STATE3_COMMENT]
# MODE = 'switch'
class LineOutput:
@ -144,16 +211,27 @@ def main():
w('// The following code was generated with the moman/finenight pkg')
w('// This package is available under the MIT License, see NOTICE.txt')
w('// for more details.')
w('// This source file is auto-generated, Please do not modify it directly.')
w('// You should modify the gradle/generation/moman/createAutomata.py instead.')
w('')
w('import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;')
w('')
if not transpose and n == 1:
w(HEADER_COMMENT)
w('')
if transpose:
w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n)
w(' with transpositions as primitive edits */')
w(' with transpositions as primitive edits.')
className = 'Lev%dTParametricDescription' % n
else:
w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n)
w('/** Parametric description for generating a Levenshtein automaton of degree %s.' % n)
className = 'Lev%dParametricDescription' % n
if not transpose and n == 1:
w('*/')
else:
w(' The comment in Lev1ParametricDescription may be helpful for you to understand this class.')
w(' @see Lev1ParametricDescription')
w('*/')
w('class %s extends ParametricDescription {' % className)
@ -300,8 +378,11 @@ def main():
for i, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines):
w('')
w.outdent()
w('// %d vectors; %d states per vector; array length = %d' % \
if transpose or n == 2:
w('// %d vectors; %d states per vector; array length = %d' % \
(numVectors, numCasesPerVector, numVectors * numCasesPerVector))
else:
w(STATE_COMMENT[i])
w.indent()
if PACKED:
# pack in python
@ -417,7 +498,7 @@ def main():
for sub, repl in subs:
s = s.replace(sub, repl)
open(fileOut, 'w').write(s)
open(fileOut, 'w', encoding='utf-8').write(s)
print('Wrote %s [%d lines; %.1f KB]' % \
(fileOut, len(w.l), os.path.getsize(fileOut) / 1024.))

View File

@ -1,8 +1,8 @@
{
"gradle/generation/moman/createLevAutomata.py": "b265f527a6ab7c0778f4b2e95de5232931795cad",
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java": "8a07d087eba9db1bc228b9dbc4e3b9294dac8478",
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev1TParametricDescription.java": "a328606a8933fe2f989bf3dbed84aa34fb4113ed",
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java": "0d839846eb3cbe0ef62576ab33d63a97c28a8b45",
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev2TParametricDescription.java": "7c29a828a20f084c4998179fd6a4ee9aa909c1ce",
"gradle/generation/moman/createLevAutomata.py": "d2ebb025aa51a3896f2f9435a756c7d733811b05",
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java": "efa1e1aff2c75f32d5f6b35adb5cb346c50c9fe5",
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev1TParametricDescription.java": "cda745cca726a46ec66b5a4ce1c983414d075f7d",
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java": "6e0c90405874800d65e8344b4549cd749e009e90",
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev2TParametricDescription.java": "291f6d9b23cf7f8fd963d968eaf6569df806c741",
"property:source": "https://github.com/jpbarrette/moman/archive/497c90e34e412b6494db6dabf0d95db8034bd325.zip"
}

View File

@ -16,9 +16,6 @@
*/
package org.apache.lucene.util.automaton;
// import java.io.IOException;
// import java.io.PrintWriter;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashSet;
@ -576,8 +573,6 @@ public class Automaton implements Accountable, TransitionAccessor {
* visualizing the automaton.
*/
public String toDot() {
// TODO: breadth first search so we can get layered output...
StringBuilder b = new StringBuilder();
b.append("digraph Automaton {\n");
b.append(" rankdir = LR\n");

View File

@ -19,10 +19,30 @@ package org.apache.lucene.util.automaton;
// The following code was generated with the moman/finenight pkg
// This package is available under the MIT License, see NOTICE.txt
// for more details.
// This source file is auto-generated; please do not modify it directly.
// Modify gradle/generation/moman/createLevAutomata.py instead.
import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
/** Parametric description for generating a Levenshtein automaton of degree 1 */
/*
Parametric transitions for LEV1.
char vector State 0 State 1 State 2 State 3 State 4
(0,0) (2, 0) (-1, 0) (-1, 0) (-1, 0) (-1, 0)
(0,1) (3, 0) (-1, 0) (1, 2) (1, 2) (-1, 0)
(1,0) (0, 1) (1, 1) (1, 1) (1, 1) (1, 1)
(1,1) (0, 1) (1, 1) (2, 1) (2, 1) (1, 1)
"char vector" is the characteristic vector described in the paper.
An entry (i,j) in the table means that, when the corresponding char vector is seen, the next state is i and the next offset is currentOffset + j.
i = -1 denotes the empty state.
This table is stored in the toStates and offsetIncrs arrays:
toStates = [i + 1 for each entry (i,j)].
offsetIncrs = [j for each entry (i,j)].
*/
/** Parametric description for generating a Levenshtein automaton of degree 1. */
class Lev1ParametricDescription extends ParametricDescription {
@Override
@ -70,19 +90,62 @@ class Lev1ParametricDescription extends ParametricDescription {
}
}
// 1 vectors; 2 states per vector; array length = 2
/*
* 1 vectors; 2 states per vector; array length = 2
* Parametric transitions for LEV1 (position = w)
*
* char vector State 0 State 1
*
* () (1, 0) (-1, 0)
*
*/
private static final long[] toStates0 = new long[] /*2 bits per value */ {0x2L};
private static final long[] offsetIncrs0 = new long[] /*1 bits per value */ {0x0L};
// 2 vectors; 3 states per vector; array length = 6
/*
* 2 vectors; 3 states per vector; array length = 6
* Parametric transitions for LEV1 (position = w-1)
*
* char vector State 0 State 1 State 2
*
* (0) (2, 0) (-1, 0) (-1, 0)
* (1) (0, 1) (1, 1) (1, 1)
*
*/
private static final long[] toStates1 = new long[] /*2 bits per value */ {0xa43L};
private static final long[] offsetIncrs1 = new long[] /*1 bits per value */ {0x38L};
// 4 vectors; 5 states per vector; array length = 20
/*
* 4 vectors; 5 states per vector; array length = 20
* Parametric transitions for LEV1 ( position == w-2 )
*
* char vector State 0 State 1 State 2 State 3 State 4
*
* (0,0) (2, 0) (-1, 0) (-1, 0) (-1, 0) (-1, 0)
* (0,1) (3, 0) (-1, 0) (1, 2) (1, 2) (-1, 0)
* (1,0) (0, 1) (1, 1) (1, 1) (1, 1) (1, 1)
* (1,1) (0, 1) (1, 1) (2, 1) (2, 1) (1, 1)
*
*/
private static final long[] toStates2 = new long[] /*3 bits per value */ {0x4da292442420003L};
private static final long[] offsetIncrs2 = new long[] /*2 bits per value */ {0x5555528000L};
// 8 vectors; 5 states per vector; array length = 40
/*
* 8 vectors; 5 states per vector; array length = 40
* Parametric transitions for LEV1 (0 <= position <= w-3 )
*
* char vector State 0 State 1 State 2 State 3 State 4
*
* (0,0,0) (2, 0) (-1, 0) (-1, 0) (-1, 0) (-1, 0)
* (0,0,1) (2, 0) (-1, 0) (-1, 0) (1, 3) (1, 3)
* (0,1,0) (3, 0) (-1, 0) (1, 2) (1, 2) (-1, 0)
* (0,1,1) (3, 0) (-1, 0) (1, 2) (2, 2) (1, 3)
* (1,0,0) (0, 1) (1, 1) (1, 1) (1, 1) (1, 1)
* (1,0,1) (0, 1) (1, 1) (1, 1) (4, 1) (4, 1)
* (1,1,0) (0, 1) (1, 1) (2, 1) (2, 1) (1, 1)
* (1,1,1) (0, 1) (1, 1) (2, 1) (3, 1) (4, 1)
*
*/
private static final long[] toStates3 =
new long[] /*3 bits per value */ {0x14d0812112018003L, 0xb1a29b46d48a49L};
private static final long[] offsetIncrs3 =

View File

@ -19,12 +19,17 @@ package org.apache.lucene.util.automaton;
// The following code was generated with the moman/finenight pkg
// This package is available under the MIT License, see NOTICE.txt
// for more details.
// This source file is auto-generated; please do not modify it directly.
// Modify gradle/generation/moman/createLevAutomata.py instead.
import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
/**
* Parametric description for generating a Levenshtein automaton of degree 1, with transpositions as
* primitive edits
* primitive edits. The comment in Lev1ParametricDescription may be helpful for you to understand
* this class.
*
* @see Lev1ParametricDescription
*/
class Lev1TParametricDescription extends ParametricDescription {

View File

@ -19,10 +19,17 @@ package org.apache.lucene.util.automaton;
// The following code was generated with the moman/finenight pkg
// This package is available under the MIT License, see NOTICE.txt
// for more details.
// This source file is auto-generated; please do not modify it directly.
// Modify gradle/generation/moman/createLevAutomata.py instead.
import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
/** Parametric description for generating a Levenshtein automaton of degree 2 */
/**
* Parametric description for generating a Levenshtein automaton of degree 2. The comment in
* Lev1ParametricDescription may be helpful for you to understand this class.
*
* @see Lev1ParametricDescription
*/
class Lev2ParametricDescription extends ParametricDescription {
@Override

View File

@ -19,12 +19,17 @@ package org.apache.lucene.util.automaton;
// The following code was generated with the moman/finenight pkg
// This package is available under the MIT License, see NOTICE.txt
// for more details.
// This source file is auto-generated; please do not modify it directly.
// Modify gradle/generation/moman/createLevAutomata.py instead.
import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
/**
* Parametric description for generating a Levenshtein automaton of degree 2, with transpositions as
* primitive edits
* primitive edits. The comment in Lev1ParametricDescription may be helpful for you to understand
* this class.
*
* @see Lev1ParametricDescription
*/
class Lev2TParametricDescription extends ParametricDescription {

View File

@ -217,8 +217,9 @@ public class LevenshteinAutomata {
}
a.finishState();
assert a.isDeterministic();
return a;
Automaton automaton = Operations.removeDeadStates(a);
assert automaton.isDeterministic();
return automaton;
}
/**