mirror of https://github.com/apache/lucene.git
LUCENE-10646: Add some comment on LevenshteinAutomata (#1016)
* add Comment on Lev & pretty the toDot * use auto generate scripts to add comment * update checksum * update checksum * restore toDot * add removeDeadStates in levAutomata Co-authored-by: tangdonghai <tangdonghai@meituan.com>
This commit is contained in:
parent
bd0718f071
commit
b08e34722d
|
@ -27,6 +27,73 @@ MODE = 'array'
|
|||
PACKED = True
|
||||
WORD = 64
|
||||
LOG2_WORD = int(math.log(WORD) / math.log(2))
|
||||
HEADER_COMMENT = '''/*
|
||||
Parametric transitions for LEV1.
|
||||
┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
|
||||
┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃
|
||||
┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
|
||||
│ (0,0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │
|
||||
│ (0,1) │ (3, 0) │ (-1, 0) │ (1, 2) │ (1, 2) │ (-1, 0) │
|
||||
│ (1,0) │ (0, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ (1, 1) │
|
||||
│ (1,1) │ (0, 1) │ (1, 1) │ (2, 1) │ (2, 1) │ (1, 1) │
|
||||
└─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘
|
||||
char vector is the characteristic vectors in the paper.
|
||||
entry (i,j) in the table means next transitions state is i, next offset is j + currentOffset if we meet the according char vector.
|
||||
When i = -1,it means an empty state.
|
||||
We store this table in toState and offsetIncr.
|
||||
toState = [ i+1 | for entry in entries].
|
||||
offsetIncrs = [j | for entry in entries].
|
||||
*/'''
|
||||
|
||||
STATE0_COMMENT = '''/*
|
||||
* 1 vectors; 2 states per vector; array length = 2
|
||||
* Parametric transitions for LEV1 (position = w)
|
||||
* ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
|
||||
* ┃ char vector ┃ State 0 ┃ State 1 ┃
|
||||
* ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
|
||||
* │ () │ (1, 0) │ (-1, 0) │
|
||||
* └─────────────┴─────────┴─────────┘
|
||||
*/'''
|
||||
|
||||
STATE1_COMMENT = '''/*
|
||||
* 2 vectors; 3 states per vector; array length = 6
|
||||
* Parametric transitions for LEV1 (position = w-1)
|
||||
* ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
|
||||
* ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃
|
||||
* ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
|
||||
* │ (0) │ (2, 0) │ (-1, 0) │ (-1, 0) │
|
||||
* │ (1) │ (0, 1) │ (1, 1) │ (1, 1) │
|
||||
* └─────────────┴─────────┴─────────┴─────────┘
|
||||
*/'''
|
||||
STATE2_COMMENT = '''/*
|
||||
* 4 vectors; 5 states per vector; array length = 20
|
||||
* Parametric transitions for LEV1 ( position == w-2 )
|
||||
* ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
|
||||
* ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃
|
||||
* ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
|
||||
* │ (0,0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │
|
||||
* │ (0,1) │ (3, 0) │ (-1, 0) │ (1, 2) │ (1, 2) │ (-1, 0) │
|
||||
* │ (1,0) │ (0, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ (1, 1) │
|
||||
* │ (1,1) │ (0, 1) │ (1, 1) │ (2, 1) │ (2, 1) │ (1, 1) │
|
||||
* └─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘
|
||||
*/'''
|
||||
STATE3_COMMENT = '''/*
|
||||
* 8 vectors; 5 states per vector; array length = 40
|
||||
* Parametric transitions for LEV1 (0 <= position <= w-3 )
|
||||
* ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
|
||||
* ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃
|
||||
* ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
|
||||
* │ (0,0,0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │
|
||||
* │ (0,0,1) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (1, 3) │ (1, 3) │
|
||||
* │ (0,1,0) │ (3, 0) │ (-1, 0) │ (1, 2) │ (1, 2) │ (-1, 0) │
|
||||
* │ (0,1,1) │ (3, 0) │ (-1, 0) │ (1, 2) │ (2, 2) │ (1, 3) │
|
||||
* │ (1,0,0) │ (0, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ (1, 1) │
|
||||
* │ (1,0,1) │ (0, 1) │ (1, 1) │ (1, 1) │ (4, 1) │ (4, 1) │
|
||||
* │ (1,1,0) │ (0, 1) │ (1, 1) │ (2, 1) │ (2, 1) │ (1, 1) │
|
||||
* │ (1,1,1) │ (0, 1) │ (1, 1) │ (2, 1) │ (3, 1) │ (4, 1) │
|
||||
* └─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘
|
||||
*/'''
|
||||
STATE_COMMENT = [STATE0_COMMENT, STATE1_COMMENT, STATE2_COMMENT, STATE3_COMMENT]
|
||||
# MODE = 'switch'
|
||||
|
||||
class LineOutput:
|
||||
|
@ -144,16 +211,27 @@ def main():
|
|||
w('// The following code was generated with the moman/finenight pkg')
|
||||
w('// This package is available under the MIT License, see NOTICE.txt')
|
||||
w('// for more details.')
|
||||
w('// This source file is auto-generated, Please do not modify it directly.')
|
||||
w('// You should modify the gradle/generation/moman/createAutomata.py instead.')
|
||||
w('')
|
||||
w('import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;')
|
||||
w('')
|
||||
if not transpose and n == 1:
|
||||
w(HEADER_COMMENT)
|
||||
w('')
|
||||
if transpose:
|
||||
w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n)
|
||||
w(' with transpositions as primitive edits */')
|
||||
w(' with transpositions as primitive edits.')
|
||||
className = 'Lev%dTParametricDescription' % n
|
||||
else:
|
||||
w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n)
|
||||
w('/** Parametric description for generating a Levenshtein automaton of degree %s.' % n)
|
||||
className = 'Lev%dParametricDescription' % n
|
||||
if not transpose and n == 1:
|
||||
w('*/')
|
||||
else:
|
||||
w(' The comment in Lev1ParametricDescription may be helpful for you to understand this class.')
|
||||
w(' @see Lev1ParametricDescription')
|
||||
w('*/')
|
||||
|
||||
w('class %s extends ParametricDescription {' % className)
|
||||
|
||||
|
@ -300,8 +378,11 @@ def main():
|
|||
for i, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines):
|
||||
w('')
|
||||
w.outdent()
|
||||
w('// %d vectors; %d states per vector; array length = %d' % \
|
||||
if transpose or n == 2:
|
||||
w('// %d vectors; %d states per vector; array length = %d' % \
|
||||
(numVectors, numCasesPerVector, numVectors * numCasesPerVector))
|
||||
else:
|
||||
w(STATE_COMMENT[i])
|
||||
w.indent()
|
||||
if PACKED:
|
||||
# pack in python
|
||||
|
@ -417,7 +498,7 @@ def main():
|
|||
for sub, repl in subs:
|
||||
s = s.replace(sub, repl)
|
||||
|
||||
open(fileOut, 'w').write(s)
|
||||
open(fileOut, 'w', encoding='utf-8').write(s)
|
||||
|
||||
print('Wrote %s [%d lines; %.1f KB]' % \
|
||||
(fileOut, len(w.l), os.path.getsize(fileOut) / 1024.))
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
{
|
||||
"gradle/generation/moman/createLevAutomata.py": "b265f527a6ab7c0778f4b2e95de5232931795cad",
|
||||
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java": "8a07d087eba9db1bc228b9dbc4e3b9294dac8478",
|
||||
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev1TParametricDescription.java": "a328606a8933fe2f989bf3dbed84aa34fb4113ed",
|
||||
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java": "0d839846eb3cbe0ef62576ab33d63a97c28a8b45",
|
||||
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev2TParametricDescription.java": "7c29a828a20f084c4998179fd6a4ee9aa909c1ce",
|
||||
"gradle/generation/moman/createLevAutomata.py": "d2ebb025aa51a3896f2f9435a756c7d733811b05",
|
||||
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java": "efa1e1aff2c75f32d5f6b35adb5cb346c50c9fe5",
|
||||
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev1TParametricDescription.java": "cda745cca726a46ec66b5a4ce1c983414d075f7d",
|
||||
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java": "6e0c90405874800d65e8344b4549cd749e009e90",
|
||||
"lucene/core/src/java/org/apache/lucene/util/automaton/Lev2TParametricDescription.java": "291f6d9b23cf7f8fd963d968eaf6569df806c741",
|
||||
"property:source": "https://github.com/jpbarrette/moman/archive/497c90e34e412b6494db6dabf0d95db8034bd325.zip"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,9 +16,6 @@
|
|||
*/
|
||||
package org.apache.lucene.util.automaton;
|
||||
|
||||
// import java.io.IOException;
|
||||
// import java.io.PrintWriter;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.HashSet;
|
||||
|
@ -576,8 +573,6 @@ public class Automaton implements Accountable, TransitionAccessor {
|
|||
* visualizing the automaton.
|
||||
*/
|
||||
public String toDot() {
|
||||
// TODO: breadth first search so we can get layered output...
|
||||
|
||||
StringBuilder b = new StringBuilder();
|
||||
b.append("digraph Automaton {\n");
|
||||
b.append(" rankdir = LR\n");
|
||||
|
|
|
@ -19,10 +19,30 @@ package org.apache.lucene.util.automaton;
|
|||
// The following code was generated with the moman/finenight pkg
|
||||
// This package is available under the MIT License, see NOTICE.txt
|
||||
// for more details.
|
||||
// This source file is auto-generated, Please do not modify it directly.
|
||||
// You should modify gradle/generation/moman/createLevAutomata.py instead.
|
||||
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
|
||||
|
||||
/** Parametric description for generating a Levenshtein automaton of degree 1 */
|
||||
/*
|
||||
Parametric transitions for LEV1.
|
||||
┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
|
||||
┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃
|
||||
┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
|
||||
│ (0,0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │
|
||||
│ (0,1) │ (3, 0) │ (-1, 0) │ (1, 2) │ (1, 2) │ (-1, 0) │
|
||||
│ (1,0) │ (0, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ (1, 1) │
|
||||
│ (1,1) │ (0, 1) │ (1, 1) │ (2, 1) │ (2, 1) │ (1, 1) │
|
||||
└─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘
|
||||
"char vector" is the characteristic vector described in the paper.
|
||||
An entry (i,j) in the table means that, when we meet the corresponding char vector, the next transition state is i and the next offset is j + currentOffset.
|
||||
When i = -1, it means an empty state.
|
||||
We store this table in the toStates and offsetIncrs arrays.
|
||||
toStates = [i + 1 for (i, j) in entries].
|
||||
offsetIncrs = [j for (i, j) in entries].
|
||||
*/
|
||||
|
||||
/** Parametric description for generating a Levenshtein automaton of degree 1. */
|
||||
class Lev1ParametricDescription extends ParametricDescription {
|
||||
|
||||
@Override
|
||||
|
@ -70,19 +90,62 @@ class Lev1ParametricDescription extends ParametricDescription {
|
|||
}
|
||||
}
|
||||
|
||||
// 1 vectors; 2 states per vector; array length = 2
|
||||
/*
|
||||
* 1 vectors; 2 states per vector; array length = 2
|
||||
* Parametric transitions for LEV1 (position = w)
|
||||
* ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
|
||||
* ┃ char vector ┃ State 0 ┃ State 1 ┃
|
||||
* ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
|
||||
* │ () │ (1, 0) │ (-1, 0) │
|
||||
* └─────────────┴─────────┴─────────┘
|
||||
*/
|
||||
private static final long[] toStates0 = new long[] /*2 bits per value */ {0x2L};
|
||||
private static final long[] offsetIncrs0 = new long[] /*1 bits per value */ {0x0L};
|
||||
|
||||
// 2 vectors; 3 states per vector; array length = 6
|
||||
/*
|
||||
* 2 vectors; 3 states per vector; array length = 6
|
||||
* Parametric transitions for LEV1 (position = w-1)
|
||||
* ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
|
||||
* ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃
|
||||
* ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
|
||||
* │ (0) │ (2, 0) │ (-1, 0) │ (-1, 0) │
|
||||
* │ (1) │ (0, 1) │ (1, 1) │ (1, 1) │
|
||||
* └─────────────┴─────────┴─────────┴─────────┘
|
||||
*/
|
||||
private static final long[] toStates1 = new long[] /*2 bits per value */ {0xa43L};
|
||||
private static final long[] offsetIncrs1 = new long[] /*1 bits per value */ {0x38L};
|
||||
|
||||
// 4 vectors; 5 states per vector; array length = 20
|
||||
/*
|
||||
* 4 vectors; 5 states per vector; array length = 20
|
||||
* Parametric transitions for LEV1 ( position == w-2 )
|
||||
* ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
|
||||
* ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃
|
||||
* ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
|
||||
* │ (0,0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │
|
||||
* │ (0,1) │ (3, 0) │ (-1, 0) │ (1, 2) │ (1, 2) │ (-1, 0) │
|
||||
* │ (1,0) │ (0, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ (1, 1) │
|
||||
* │ (1,1) │ (0, 1) │ (1, 1) │ (2, 1) │ (2, 1) │ (1, 1) │
|
||||
* └─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘
|
||||
*/
|
||||
private static final long[] toStates2 = new long[] /*3 bits per value */ {0x4da292442420003L};
|
||||
private static final long[] offsetIncrs2 = new long[] /*2 bits per value */ {0x5555528000L};
|
||||
|
||||
// 8 vectors; 5 states per vector; array length = 40
|
||||
/*
|
||||
* 8 vectors; 5 states per vector; array length = 40
|
||||
* Parametric transitions for LEV1 (0 <= position <= w-3 )
|
||||
* ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
|
||||
* ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃
|
||||
* ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
|
||||
* │ (0,0,0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │
|
||||
* │ (0,0,1) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (1, 3) │ (1, 3) │
|
||||
* │ (0,1,0) │ (3, 0) │ (-1, 0) │ (1, 2) │ (1, 2) │ (-1, 0) │
|
||||
* │ (0,1,1) │ (3, 0) │ (-1, 0) │ (1, 2) │ (2, 2) │ (1, 3) │
|
||||
* │ (1,0,0) │ (0, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ (1, 1) │
|
||||
* │ (1,0,1) │ (0, 1) │ (1, 1) │ (1, 1) │ (4, 1) │ (4, 1) │
|
||||
* │ (1,1,0) │ (0, 1) │ (1, 1) │ (2, 1) │ (2, 1) │ (1, 1) │
|
||||
* │ (1,1,1) │ (0, 1) │ (1, 1) │ (2, 1) │ (3, 1) │ (4, 1) │
|
||||
* └─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘
|
||||
*/
|
||||
private static final long[] toStates3 =
|
||||
new long[] /*3 bits per value */ {0x14d0812112018003L, 0xb1a29b46d48a49L};
|
||||
private static final long[] offsetIncrs3 =
|
||||
|
|
|
@ -19,12 +19,17 @@ package org.apache.lucene.util.automaton;
|
|||
// The following code was generated with the moman/finenight pkg
|
||||
// This package is available under the MIT License, see NOTICE.txt
|
||||
// for more details.
|
||||
// This source file is auto-generated, Please do not modify it directly.
|
||||
// You should modify gradle/generation/moman/createLevAutomata.py instead.
|
||||
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
|
||||
|
||||
/**
|
||||
* Parametric description for generating a Levenshtein automaton of degree 1, with transpositions as
|
||||
* primitive edits
|
||||
* primitive edits. The comment in Lev1ParametricDescription may be helpful for you to understand
|
||||
* this class.
|
||||
*
|
||||
* @see Lev1ParametricDescription
|
||||
*/
|
||||
class Lev1TParametricDescription extends ParametricDescription {
|
||||
|
||||
|
|
|
@ -19,10 +19,17 @@ package org.apache.lucene.util.automaton;
|
|||
// The following code was generated with the moman/finenight pkg
|
||||
// This package is available under the MIT License, see NOTICE.txt
|
||||
// for more details.
|
||||
// This source file is auto-generated, Please do not modify it directly.
|
||||
// You should modify gradle/generation/moman/createLevAutomata.py instead.
|
||||
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
|
||||
|
||||
/** Parametric description for generating a Levenshtein automaton of degree 2 */
|
||||
/**
|
||||
* Parametric description for generating a Levenshtein automaton of degree 2. The comment in
|
||||
* Lev1ParametricDescription may be helpful for you to understand this class.
|
||||
*
|
||||
* @see Lev1ParametricDescription
|
||||
*/
|
||||
class Lev2ParametricDescription extends ParametricDescription {
|
||||
|
||||
@Override
|
||||
|
|
|
@ -19,12 +19,17 @@ package org.apache.lucene.util.automaton;
|
|||
// The following code was generated with the moman/finenight pkg
|
||||
// This package is available under the MIT License, see NOTICE.txt
|
||||
// for more details.
|
||||
// This source file is auto-generated, Please do not modify it directly.
|
||||
// You should modify gradle/generation/moman/createLevAutomata.py instead.
|
||||
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
|
||||
|
||||
/**
|
||||
* Parametric description for generating a Levenshtein automaton of degree 2, with transpositions as
|
||||
* primitive edits
|
||||
* primitive edits. The comment in Lev1ParametricDescription may be helpful for you to understand
|
||||
* this class.
|
||||
*
|
||||
* @see Lev1ParametricDescription
|
||||
*/
|
||||
class Lev2TParametricDescription extends ParametricDescription {
|
||||
|
||||
|
|
|
@ -217,8 +217,9 @@ public class LevenshteinAutomata {
|
|||
}
|
||||
|
||||
a.finishState();
|
||||
assert a.isDeterministic();
|
||||
return a;
|
||||
Automaton automaton = Operations.removeDeadStates(a);
|
||||
assert automaton.isDeterministic();
|
||||
return automaton;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
Loading…
Reference in New Issue