From b08e34722df87e86611ba1afe42cbe7dc052f6e4 Mon Sep 17 00:00:00 2001 From: tang donghai Date: Sun, 7 Aug 2022 22:01:30 +0800 Subject: [PATCH] LUCENE-10646: Add some comment on LevenshteinAutomata (#1016) * add Comment on Lev & pretty the toDot * use auto generate scripts to add comment * update checksum * update checksum * restore toDot * add removeDeadStates in levAutomata Co-authored-by: tangdonghai --- gradle/generation/moman/createLevAutomata.py | 89 ++++++++++++++++++- .../src/generated/checksums/utilGenLev.json | 12 +-- .../lucene/util/automaton/Automaton.java | 5 -- .../automaton/Lev1ParametricDescription.java | 73 +++++++++++++-- .../automaton/Lev1TParametricDescription.java | 7 +- .../automaton/Lev2ParametricDescription.java | 9 +- .../automaton/Lev2TParametricDescription.java | 7 +- .../util/automaton/LevenshteinAutomata.java | 5 +- 8 files changed, 182 insertions(+), 25 deletions(-) diff --git a/gradle/generation/moman/createLevAutomata.py b/gradle/generation/moman/createLevAutomata.py index 0e1b30c6566..f620f348e9f 100644 --- a/gradle/generation/moman/createLevAutomata.py +++ b/gradle/generation/moman/createLevAutomata.py @@ -27,6 +27,73 @@ MODE = 'array' PACKED = True WORD = 64 LOG2_WORD = int(math.log(WORD) / math.log(2)) +HEADER_COMMENT = '''/* + Parametric transitions for LEV1. + ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ + ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃ + ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ + │ (0,0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ + │ (0,1) │ (3, 0) │ (-1, 0) │ (1, 2) │ (1, 2) │ (-1, 0) │ + │ (1,0) │ (0, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ + │ (1,1) │ (0, 1) │ (1, 1) │ (2, 1) │ (2, 1) │ (1, 1) │ + └─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘ + char vector is the characteristic vectors in the paper. + entry (i,j) in the table means next transitions state is i, next offset is j + currentOffset if we meet the according char vector. + When i = -1,it means an empty state. + We store this table in toState and offsetIncr. + toState = [ i+1 | for entry in entries]. + offsetIncrs = [j | for entry in entries]. +*/''' + +STATE0_COMMENT = '''/* + * 1 vectors; 2 states per vector; array length = 2 + * Parametric transitions for LEV1 (position = w) + * ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ + * ┃ char vector ┃ State 0 ┃ State 1 ┃ + * ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ + * │ () │ (1, 0) │ (-1, 0) │ + * └─────────────┴─────────┴─────────┘ + */''' + +STATE1_COMMENT = '''/* + * 2 vectors; 3 states per vector; array length = 6 + * Parametric transitions for LEV1 (position = w-1) + * ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ + * ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ + * ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ + * │ (0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ + * │ (1) │ (0, 1) │ (1, 1) │ (1, 1) │ + * └─────────────┴─────────┴─────────┴─────────┘ + */''' +STATE2_COMMENT = '''/* + * 4 vectors; 5 states per vector; array length = 20 + * Parametric transitions for LEV1 ( position == w-2 ) + * ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ + * ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃ + * ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ + * │ (0,0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ + * │ (0,1) │ (3, 0) │ (-1, 0) │ (1, 2) │ (1, 2) │ (-1, 0) │ + * │ (1,0) │ (0, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ + * │ (1,1) │ (0, 1) │ (1, 1) │ (2, 1) │ (2, 1) │ (1, 1) │ + * └─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘ + */''' +STATE3_COMMENT = '''/* + * 8 vectors; 5 states per vector; array length = 40 + * Parametric transitions for LEV1 (0 <= position <= w-3 ) + * ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ + * ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃ + * ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ + * │ (0,0,0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ + * │ (0,0,1) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (1, 3) │ (1, 3) │ + * │ (0,1,0) │ (3, 0) │ (-1, 0) │ (1, 2) │ (1, 2) │ (-1, 0) │ + * │ (0,1,1) │ (3, 0) │ (-1, 0) │ (1, 2) │ (2, 2) │ (1, 3) │ + * │ (1,0,0) │ (0, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ + * │ (1,0,1) │ (0, 1) │ (1, 1) │ (1, 1) │ (4, 1) │ (4, 1) │ + * │ (1,1,0) │ (0, 1) │ (1, 1) │ (2, 1) │ (2, 1) │ (1, 1) │ + * │ (1,1,1) │ (0, 1) │ (1, 1) │ (2, 1) │ (3, 1) │ (4, 1) │ + * └─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘ + */''' +STATE_COMMENT = [STATE0_COMMENT, STATE1_COMMENT, STATE2_COMMENT, STATE3_COMMENT] # MODE = 'switch' class LineOutput: @@ -144,16 +211,27 @@ def main(): w('// The following code was generated with the moman/finenight pkg') w('// This package is available under the MIT License, see NOTICE.txt') w('// for more details.') + w('// This source file is auto-generated, Please do not modify it directly.') + w('// You should modify the gradle/generation/moman/createAutomata.py instead.') w('') w('import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;') w('') + if not transpose and n == 1: + w(HEADER_COMMENT) + w('') if transpose: w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n) - w(' with transpositions as primitive edits */') + w(' with transpositions as primitive edits.') className = 'Lev%dTParametricDescription' % n else: - w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n) + w('/** Parametric description for generating a Levenshtein automaton of degree %s.' % n) className = 'Lev%dParametricDescription' % n + if not transpose and n == 1: + w('*/') + else: + w(' The comment in Lev1ParametricDescription may be helpful for you to understand this class.') + w(' @see Lev1ParametricDescription') + w('*/') w('class %s extends ParametricDescription {' % className) @@ -300,8 +378,11 @@ def main(): for i, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines): w('') w.outdent() - w('// %d vectors; %d states per vector; array length = %d' % \ + if transpose or n == 2: + w('// %d vectors; %d states per vector; array length = %d' % \ (numVectors, numCasesPerVector, numVectors * numCasesPerVector)) + else: + w(STATE_COMMENT[i]) w.indent() if PACKED: # pack in python @@ -417,7 +498,7 @@ def main(): for sub, repl in subs: s = s.replace(sub, repl) - open(fileOut, 'w').write(s) + open(fileOut, 'w', encoding='utf-8').write(s) print('Wrote %s [%d lines; %.1f KB]' % \ (fileOut, len(w.l), os.path.getsize(fileOut) / 1024.)) diff --git a/lucene/core/src/generated/checksums/utilGenLev.json b/lucene/core/src/generated/checksums/utilGenLev.json index 29855fbfcf4..636c3fd6374 100644 --- a/lucene/core/src/generated/checksums/utilGenLev.json +++ b/lucene/core/src/generated/checksums/utilGenLev.json @@ -1,8 +1,8 @@ { - "gradle/generation/moman/createLevAutomata.py": "b265f527a6ab7c0778f4b2e95de5232931795cad", - "lucene/core/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java": "8a07d087eba9db1bc228b9dbc4e3b9294dac8478", - "lucene/core/src/java/org/apache/lucene/util/automaton/Lev1TParametricDescription.java": "a328606a8933fe2f989bf3dbed84aa34fb4113ed", - "lucene/core/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java": "0d839846eb3cbe0ef62576ab33d63a97c28a8b45", - "lucene/core/src/java/org/apache/lucene/util/automaton/Lev2TParametricDescription.java": "7c29a828a20f084c4998179fd6a4ee9aa909c1ce", + "gradle/generation/moman/createLevAutomata.py": "d2ebb025aa51a3896f2f9435a756c7d733811b05", + "lucene/core/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java": "efa1e1aff2c75f32d5f6b35adb5cb346c50c9fe5", + "lucene/core/src/java/org/apache/lucene/util/automaton/Lev1TParametricDescription.java": "cda745cca726a46ec66b5a4ce1c983414d075f7d", + "lucene/core/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java": "6e0c90405874800d65e8344b4549cd749e009e90", + "lucene/core/src/java/org/apache/lucene/util/automaton/Lev2TParametricDescription.java": "291f6d9b23cf7f8fd963d968eaf6569df806c741", "property:source": "https://github.com/jpbarrette/moman/archive/497c90e34e412b6494db6dabf0d95db8034bd325.zip" -} \ No newline at end of file +} diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java index 3986a9c0ca1..e85fe120b16 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java @@ -16,9 +16,6 @@ */ package org.apache.lucene.util.automaton; -// import java.io.IOException; -// import java.io.PrintWriter; - import java.util.Arrays; import java.util.BitSet; import java.util.HashSet; @@ -576,8 +573,6 @@ public class Automaton implements Accountable, TransitionAccessor { * visualizing the automaton. */ public String toDot() { - // TODO: breadth first search so we can get layered output... - StringBuilder b = new StringBuilder(); b.append("digraph Automaton {\n"); b.append(" rankdir = LR\n"); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java index 2e33ad96cdb..744da3cedb1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java @@ -19,10 +19,30 @@ package org.apache.lucene.util.automaton; // The following code was generated with the moman/finenight pkg // This package is available under the MIT License, see NOTICE.txt // for more details. +// This source file is auto-generated, Please do not modify it directly. +// You should modify the gradle/generation/moman/createAutomata.py instead. import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription; -/** Parametric description for generating a Levenshtein automaton of degree 1 */ +/* + Parametric transitions for LEV1. + ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ + ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃ + ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ + │ (0,0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ + │ (0,1) │ (3, 0) │ (-1, 0) │ (1, 2) │ (1, 2) │ (-1, 0) │ + │ (1,0) │ (0, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ + │ (1,1) │ (0, 1) │ (1, 1) │ (2, 1) │ (2, 1) │ (1, 1) │ + └─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘ + char vector is the characteristic vectors in the paper. + entry (i,j) in the table means next transitions state is i, next offset is j + currentOffset if we meet the according char vector. + When i = -1,it means an empty state. + We store this table in toState and offsetIncr. + toState = [ i+1 | for entry in entries]. + offsetIncrs = [j | for entry in entries]. +*/ + +/** Parametric description for generating a Levenshtein automaton of degree 1. */ class Lev1ParametricDescription extends ParametricDescription { @Override @@ -70,19 +90,62 @@ class Lev1ParametricDescription extends ParametricDescription { } } - // 1 vectors; 2 states per vector; array length = 2 + /* + * 1 vectors; 2 states per vector; array length = 2 + * Parametric transitions for LEV1 (position = w) + * ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ + * ┃ char vector ┃ State 0 ┃ State 1 ┃ + * ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ + * │ () │ (1, 0) │ (-1, 0) │ + * └─────────────┴─────────┴─────────┘ + */ private static final long[] toStates0 = new long[] /*2 bits per value */ {0x2L}; private static final long[] offsetIncrs0 = new long[] /*1 bits per value */ {0x0L}; - // 2 vectors; 3 states per vector; array length = 6 + /* + * 2 vectors; 3 states per vector; array length = 6 + * Parametric transitions for LEV1 (position = w-1) + * ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ + * ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ + * ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ + * │ (0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ + * │ (1) │ (0, 1) │ (1, 1) │ (1, 1) │ + * └─────────────┴─────────┴─────────┴─────────┘ + */ private static final long[] toStates1 = new long[] /*2 bits per value */ {0xa43L}; private static final long[] offsetIncrs1 = new long[] /*1 bits per value */ {0x38L}; - // 4 vectors; 5 states per vector; array length = 20 + /* + * 4 vectors; 5 states per vector; array length = 20 + * Parametric transitions for LEV1 ( position == w-2 ) + * ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ + * ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃ + * ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ + * │ (0,0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ + * │ (0,1) │ (3, 0) │ (-1, 0) │ (1, 2) │ (1, 2) │ (-1, 0) │ + * │ (1,0) │ (0, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ + * │ (1,1) │ (0, 1) │ (1, 1) │ (2, 1) │ (2, 1) │ (1, 1) │ + * └─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘ + */ private static final long[] toStates2 = new long[] /*3 bits per value */ {0x4da292442420003L}; private static final long[] offsetIncrs2 = new long[] /*2 bits per value */ {0x5555528000L}; - // 8 vectors; 5 states per vector; array length = 40 + /* + * 8 vectors; 5 states per vector; array length = 40 + * Parametric transitions for LEV1 (0 <= position <= w-3 ) + * ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ + * ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃ + * ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ + * │ (0,0,0) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │ + * │ (0,0,1) │ (2, 0) │ (-1, 0) │ (-1, 0) │ (1, 3) │ (1, 3) │ + * │ (0,1,0) │ (3, 0) │ (-1, 0) │ (1, 2) │ (1, 2) │ (-1, 0) │ + * │ (0,1,1) │ (3, 0) │ (-1, 0) │ (1, 2) │ (2, 2) │ (1, 3) │ + * │ (1,0,0) │ (0, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ (1, 1) │ + * │ (1,0,1) │ (0, 1) │ (1, 1) │ (1, 1) │ (4, 1) │ (4, 1) │ + * │ (1,1,0) │ (0, 1) │ (1, 1) │ (2, 1) │ (2, 1) │ (1, 1) │ + * │ (1,1,1) │ (0, 1) │ (1, 1) │ (2, 1) │ (3, 1) │ (4, 1) │ + * └─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘ + */ private static final long[] toStates3 = new long[] /*3 bits per value */ {0x14d0812112018003L, 0xb1a29b46d48a49L}; private static final long[] offsetIncrs3 = diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Lev1TParametricDescription.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Lev1TParametricDescription.java index f1a1d3ed9ee..02f23dc70bb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Lev1TParametricDescription.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Lev1TParametricDescription.java @@ -19,12 +19,17 @@ package org.apache.lucene.util.automaton; // The following code was generated with the moman/finenight pkg // This package is available under the MIT License, see NOTICE.txt // for more details. +// This source file is auto-generated, Please do not modify it directly. +// You should modify the gradle/generation/moman/createAutomata.py instead. import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription; /** * Parametric description for generating a Levenshtein automaton of degree 1, with transpositions as - * primitive edits + * primitive edits. The comment in Lev1ParametricDescription may be helpful for you to understand + * this class. + * + * @see Lev1ParametricDescription */ class Lev1TParametricDescription extends ParametricDescription { diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java index a95a5f16098..356de179762 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java @@ -19,10 +19,17 @@ package org.apache.lucene.util.automaton; // The following code was generated with the moman/finenight pkg // This package is available under the MIT License, see NOTICE.txt // for more details. +// This source file is auto-generated, Please do not modify it directly. +// You should modify the gradle/generation/moman/createAutomata.py instead. import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription; -/** Parametric description for generating a Levenshtein automaton of degree 2 */ +/** + * Parametric description for generating a Levenshtein automaton of degree 2. The comment in + * Lev1ParametricDescription may be helpful for you to understand this class. + * + * @see Lev1ParametricDescription + */ class Lev2ParametricDescription extends ParametricDescription { @Override diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Lev2TParametricDescription.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Lev2TParametricDescription.java index 81aaf466939..00aeb4557d1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Lev2TParametricDescription.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Lev2TParametricDescription.java @@ -19,12 +19,17 @@ package org.apache.lucene.util.automaton; // The following code was generated with the moman/finenight pkg // This package is available under the MIT License, see NOTICE.txt // for more details. +// This source file is auto-generated, Please do not modify it directly. +// You should modify the gradle/generation/moman/createAutomata.py instead. import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription; /** * Parametric description for generating a Levenshtein automaton of degree 2, with transpositions as - * primitive edits + * primitive edits. The comment in Lev1ParametricDescription may be helpful for you to understand + * this class. + * + * @see Lev1ParametricDescription */ class Lev2TParametricDescription extends ParametricDescription { diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java index 190a5d8ccd2..875c6cd11e1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java @@ -217,8 +217,9 @@ public class LevenshteinAutomata { } a.finishState(); - assert a.isDeterministic(); - return a; + Automaton automaton = Operations.removeDeadStates(a); + assert automaton.isDeterministic(); + return automaton; } /**