# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Note, this file is known to work with rev 120 of the moman
# repository (http://bitbucket.org/jpbarrette/moman/overview)
#
# See also: http://sites.google.com/site/rrettesite/moman

"""Generate a Java ``Lev<N>[T]ParametricDescription`` class.

The transition tables come from moman's ``genTransitions``; this script
renders them as (optionally bit-packed) Java arrays plus a ``transition``
method and writes the result to ``<ClassName>.java`` in the current
working directory.
"""

import math
import os
import sys

# How the generated transition() method is rendered:
#   'array'  - table lookups into toStates*/offsetIncrs* arrays
#   'switch' - nested Java switch statements
MODE = 'array'

# When True (and MODE == 'array') the tables are bit-packed into long[].
PACKED = True

# Number of bits in one packed word (a Java long).
WORD = 64

# log2(WORD), computed exactly from the integer (no float rounding).
LOG2_WORD = WORD.bit_length() - 1

HEADER_COMMENT = '''/*
Parametric transitions for LEV1.
┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃
┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
│ (0,0)       │ (2, 0)  │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │
│ (0,1)       │ (3, 0)  │ (-1, 0) │ (1, 2)  │ (1, 2)  │ (-1, 0) │
│ (1,0)       │ (0, 1)  │ (1, 1)  │ (1, 1)  │ (1, 1)  │ (1, 1)  │
│ (1,1)       │ (0, 1)  │ (1, 1)  │ (2, 1)  │ (2, 1)  │ (1, 1)  │
└─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘
char vector is the characteristic vectors in the paper.
entry (i,j) in the table means next transitions state is i, next offset is j + currentOffset if we meet the according char vector.
When i = -1,it means an empty state.
We store this table in toState and offsetIncr. toState = [ i+1 | for entry in entries]. offsetIncrs = [j | for entry in entries].
*/'''

STATE0_COMMENT = '''/*
 * 1 vectors; 2 states per vector; array length = 2
 * Parametric transitions for LEV1 (position = w)
 * ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
 * ┃ char vector ┃ State 0 ┃ State 1 ┃
 * ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
 * │ ()          │ (1, 0)  │ (-1, 0) │
 * └─────────────┴─────────┴─────────┘
 */'''

STATE1_COMMENT = '''/*
 * 2 vectors; 3 states per vector; array length = 6
 * Parametric transitions for LEV1 (position = w-1)
 * ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
 * ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃
 * ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
 * │ (0)         │ (2, 0)  │ (-1, 0) │ (-1, 0) │
 * │ (1)         │ (0, 1)  │ (1, 1)  │ (1, 1)  │
 * └─────────────┴─────────┴─────────┴─────────┘
 */'''

STATE2_COMMENT = '''/*
 * 4 vectors; 5 states per vector; array length = 20
 * Parametric transitions for LEV1 ( position == w-2 )
 * ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
 * ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃
 * ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
 * │ (0,0)       │ (2, 0)  │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │
 * │ (0,1)       │ (3, 0)  │ (-1, 0) │ (1, 2)  │ (1, 2)  │ (-1, 0) │
 * │ (1,0)       │ (0, 1)  │ (1, 1)  │ (1, 1)  │ (1, 1)  │ (1, 1)  │
 * │ (1,1)       │ (0, 1)  │ (1, 1)  │ (2, 1)  │ (2, 1)  │ (1, 1)  │
 * └─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘
 */'''

STATE3_COMMENT = '''/*
 * 8 vectors; 5 states per vector; array length = 40
 * Parametric transitions for LEV1 (0 <= position <= w-3 )
 * ┏━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
 * ┃ char vector ┃ State 0 ┃ State 1 ┃ State 2 ┃ State 3 ┃ State 4 ┃
 * ┡━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
 * │ (0,0,0)     │ (2, 0)  │ (-1, 0) │ (-1, 0) │ (-1, 0) │ (-1, 0) │
 * │ (0,0,1)     │ (2, 0)  │ (-1, 0) │ (-1, 0) │ (1, 3)  │ (1, 3)  │
 * │ (0,1,0)     │ (3, 0)  │ (-1, 0) │ (1, 2)  │ (1, 2)  │ (-1, 0) │
 * │ (0,1,1)     │ (3, 0)  │ (-1, 0) │ (1, 2)  │ (2, 2)  │ (1, 3)  │
 * │ (1,0,0)     │ (0, 1)  │ (1, 1)  │ (1, 1)  │ (1, 1)  │ (1, 1)  │
 * │ (1,0,1)     │ (0, 1)  │ (1, 1)  │ (1, 1)  │ (4, 1)  │ (4, 1)  │
 * │ (1,1,0)     │ (0, 1)  │ (1, 1)  │ (2, 1)  │ (2, 1)  │ (1, 1)  │
 * │ (1,1,1)     │ (0, 1)  │ (1, 1)  │ (2, 1)  │ (3, 1)  │ (4, 1)  │
 * └─────────────┴─────────┴─────────┴─────────┴─────────┴─────────┘
 */'''

# Indexed by machine number (position == w, w-1, w-2, otherwise).
STATE_COMMENT = [STATE0_COMMENT, STATE1_COMMENT, STATE2_COMMENT, STATE3_COMMENT]


class LineOutput:
    """Collect generated Java lines, auto-indenting by brace nesting.

    Calling the instance appends one entry.  A ``}`` in the text outdents
    before the entry is emitted and a ``{`` indents after it.  Entries
    outside a ``/* ... */`` comment have their leading whitespace replaced
    by the current indent; entries inside a multi-entry comment keep it.
    """

    # Width of one indentation level.  indent(), outdent() and the brace
    # tracking in __call__ must all add/remove exactly this much, otherwise
    # nested braces drift and the "back at start indent" assertions fire.
    INDENT_UNIT = '  '

    def __init__(self, indent=''):
        self.l = []
        self._indent = self.startIndent = indent
        self.inComment = False

    def __call__(self, s, indent=0):
        """Append ``s``; ``indent`` adds extra levels for this entry only."""
        if '}' in s:
            self.outdent()

        if indent != 0:
            levels = len(self._indent) // len(self.INDENT_UNIT) + indent
            prefix = self.INDENT_UNIT * levels
        else:
            prefix = self._indent

        if '/*' in s:
            # A comment opened and closed in the same entry is formatted
            # like ordinary code.
            if '*/' not in s:
                self.inComment = True
        elif '*/' in s:
            # Closing entry of a multi-entry comment: keep its own leading
            # whitespace (e.g. ' */').
            self.inComment = True

        if self.inComment:
            self.l.append(prefix + s)
        else:
            self.l.append(prefix + s.lstrip())

        self.inComment = self.inComment and '*/' not in s

        if '{' in s:
            self.indent()

    def __str__(self):
        # Every '{' must have been matched by a '}' (or explicit outdent).
        assert self._indent == self.startIndent, 'indent %d vs start indent %d' % \
            (len(self._indent), len(self.startIndent))
        return '\n'.join(self.l)

    def indent(self):
        """Increase the current indent by one level."""
        self._indent += self.INDENT_UNIT

    def outdent(self):
        """Decrease the current indent by one level."""
        assert self._indent != self.startIndent
        self._indent = self._indent[:-len(self.INDENT_UNIT)]


def charVarNumber(charVar):
    """
    Maps binary number (eg [1, 0, 1]) to its decimal value (5).

    The first element is the most significant digit; an empty sequence
    maps to 0.
    """
    value = 0
    for digit in charVar:
        value = value * 2 + int(digit)
    return value


def main():
    """Generate ``Lev<N>[T]ParametricDescription.java`` in the working dir.

    Command line: ``N transpose path_to_moman_dir`` where ``transpose`` is
    the literal string ``True`` to enable transpositions (anything else
    disables them).  Exits with status 1 on a wrong argument count.
    """
    if len(sys.argv) != 4:
        print()
        print('Usage: python -u %s N <transpose: True|False> path_to_moman_dir' % sys.argv[0])
        print()
        print('NOTE: the resulting .java file is created in the current working dir!')
        print()
        sys.exit(1)

    n = int(sys.argv[1])
    transpose = (sys.argv[2] == "True")

    sys.path.insert(0, sys.argv[3])

    try:
        from possibleStates import genTransitions
    except ImportError:
        from finenight.possibleStates import genTransitions

    tables = genTransitions(n, transpose)

    stateMap = {}

    # init null state
    stateMap['[]'] = -1

    # init start state
    stateMap['[(0, 0)]'] = 0

    w = LineOutput()

    w('/*')
    w(' * Licensed to the Apache Software Foundation (ASF) under one or more')
    w(' * contributor license agreements. See the NOTICE file distributed with')
    w(' * this work for additional information regarding copyright ownership.')
    w(' * The ASF licenses this file to You under the Apache License, Version 2.0')
    w(' * (the "License"); you may not use this file except in compliance with')
    w(' * the License. You may obtain a copy of the License at')
    w(' *')
    w(' * http://www.apache.org/licenses/LICENSE-2.0')
    w(' *')
    w(' * Unless required by applicable law or agreed to in writing, software')
    w(' * distributed under the License is distributed on an "AS IS" BASIS,')
    w(' * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.')
    w(' * See the License for the specific language governing permissions and')
    w(' * limitations under the License.')
    w(' */')
    w('package org.apache.lucene.util.automaton;')
    w('')
    w('// The following code was generated with the moman/finenight pkg')
    w('// This package is available under the MIT License, see NOTICE.txt')
    w('// for more details.')
    w('// This source file is auto-generated, Please do not modify it directly.')
    w('// You should modify the gradle/generation/moman/createAutomata.py instead.')
    w('')
    w('import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;')
    w('')

    isPlainLev1 = not transpose and n == 1

    if isPlainLev1:
        w(HEADER_COMMENT)
        w('')

    if transpose:
        w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n)
        w(' with transpositions as primitive edits.')
        className = 'Lev%dTParametricDescription' % n
    else:
        w('/** Parametric description for generating a Levenshtein automaton of degree %s.' % n)
        className = 'Lev%dParametricDescription' % n

    if isPlainLev1:
        w('*/')
    else:
        w(' The comment in Lev1ParametricDescription may be helpful for you to understand this class.')
        w(' @see Lev1ParametricDescription')
        w('*/')

    w('class %s extends ParametricDescription {' % className)

    w('')
    w('@Override')
    w('int transition(int absState, int position, int vector) {')

    w(' // null absState should never be passed in')
    w(' assert absState != -1;')

    w('')
    w(' // decode absState -> state, offset')
    w(' int state = absState/(w+1);')
    w(' int offset = absState%(w+1);')
    w(' assert offset >= 0;')
    w('')

    machines = []

    for i, table in enumerate(tables):
        if i == 0:
            w('if (position == w) {')
        elif i == len(tables) - 1:
            w('} else {')
        else:
            w('} else if (position == w-%d) {' % i)

        if i != 0 and MODE == 'switch':
            w('switch(vector) {')

        vectors = sorted(table.items())

        numCasesPerVector = None
        numVectors = len(vectors)

        # Always defined (they are appended to `machines` below in every
        # mode); only filled in 'array' mode.
        toStateArray = []
        toOffsetIncrArray = []

        for charVar, states in vectors:

            # somehow it's a string:
            # NOTE(review): eval() runs on text produced by the moman
            # package; only point this script at a trusted moman checkout.
            charVar = eval(charVar)

            if i != 0 and MODE == 'switch':
                w('case %s: // <%s>' % (charVarNumber(charVar), ','.join([str(x) for x in charVar])))
                w.indent()

            transitions = sorted(states.items())

            byFromState = {}

            # first pass to assign states
            byAction = {}
            for s, (toS, offset) in transitions:
                state = str(s)
                toState = str(toS)

                if state not in stateMap:
                    stateMap[state] = len(stateMap) - 1
                if toState not in stateMap:
                    stateMap[toState] = len(stateMap) - 1

                # stored shifted by one so the null state (-1) becomes 0
                byFromState[stateMap[state]] = (1 + stateMap[toState], offset)

                fromStateDesc = s[1:len(s) - 1]
                toStateDesc = ', '.join([str(x) for x in toS])

                tup = (stateMap[toState], toStateDesc, offset)
                if tup not in byAction:
                    byAction[tup] = []
                byAction[tup].append((fromStateDesc, stateMap[state]))

            if numCasesPerVector is None:
                numCasesPerVector = len(transitions)
            else:
                # we require this to be uniform... empirically it seems to be!
                assert numCasesPerVector == len(transitions)

            if MODE == 'array':
                for fromState in range(numCasesPerVector):
                    toState, offsetIncr = byFromState[fromState]
                    toStateArray.append(toState)
                    toOffsetIncrArray.append(offsetIncr)
            else:
                # render switches
                w('switch(state) { // %s cases' % len(transitions))

                for (toState, toStateDesc, offset), lx in byAction.items():
                    # from-states sharing an action fall through to one body
                    for fromStateDesc, fromState in lx:
                        w('case %s: // %s' % (fromState, fromStateDesc))
                    w.indent()
                    w(' state = %s; // %s' % (toState, toStateDesc))
                    if offset > 0:
                        w(' offset += %s;' % offset)
                    w('break;')
                    w.outdent()

                w('}')
                if i != 0:
                    w('break;')
                    w.outdent()

        if MODE == 'array':
            # strangely state can come in wildly out of bounds....
            w(' if (state < %d) {' % numCasesPerVector)
            w(' final int loc = vector * %d + state;' % numCasesPerVector)
            if PACKED:
                # NBITS* placeholders are substituted once pack() has run.
                w(' offset += unpack(offsetIncrs%d, loc, NBITSOFFSET%d);' % (i, i))
                w(' state = unpack(toStates%d, loc, NBITSSTATES%d)-1;' % (i, i))
            else:
                w(' offset += offsetIncrs%d[loc];' % i)
                w(' state = toStates%d[loc]-1;' % i)
            w(' }')
        elif i != 0:
            w('}')

        machines.append((toStateArray, toOffsetIncrArray, numCasesPerVector, numVectors))

    # ends switch statement for machine
    w('}')

    w('')

    w(' if (state == -1) {')
    w(' // null state')
    w(' return -1;')
    w(' } else {')
    w(' // translate back to abs')
    w(' return state*(w+1)+offset;')
    w(' }')

    # ends transition method
    w('}')

    subs = []
    if MODE == 'array':
        # The array declarations contain '{...}' on one entry, which makes
        # LineOutput outdent before and indent after it; the extra level
        # here compensates for that.
        w.indent()
        for i, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines):
            w('')
            w.outdent()
            if transpose or n == 2:
                w('// %d vectors; %d states per vector; array length = %d' % \
                  (numVectors, numCasesPerVector, numVectors * numCasesPerVector))
            else:
                w(STATE_COMMENT[i])
            w.indent()
            if PACKED:
                # pack in python
                packed, nbits = pack(toStateArray)
                subs.append(('NBITSSTATES%d' % i, str(nbits)))
                w(' private final static long[] toStates%d = new long[] /*%d bits per value */ %s;' % \
                  (i, nbits, renderList([(hex(int(x)) + "L") for x in packed])))

                packed, nbits = pack(toOffsetIncrsArray)
                subs.append(('NBITSOFFSET%d' % i, str(nbits)))
                w(' private final static long[] offsetIncrs%d = new long[] /*%d bits per value */ %s;' % \
                  (i, nbits, renderList([(hex(int(x)) + "L") for x in packed])))
            else:
                w(' private final static int[] toStates%d = new int[] %s;' % \
                  (i, renderList([str(x) for x in toStateArray])))
                w(' private final static int[] offsetIncrs%d = new int[] %s;' % \
                  (i, renderList([str(x) for x in toOffsetIncrsArray])))
        w.outdent()

    stateMap2 = {v: k for k, v in stateMap.items()}
    w('')
    w('// state map')
    minErrors = []
    # stateMap2 also holds the null state under key -1, hence the "- 1".
    for i in range(len(stateMap2) - 1):
        w('// %s -> %s' % (i, stateMap2[i]))
        # we replace t-notation as it's not relevant here
        st = stateMap2[i].replace('t', '')

        # NOTE(review): eval() on a state description that originates from
        # moman; trusted input only.
        positions = eval(st)
        minErrors.append(min([-pos + err for pos, err in positions]))
    w('')

    w('')
    w(' public %s(int w) {' % className)
    w(' super(w, %d, new int[] {%s});' % (n, ','.join([str(x) for x in minErrors])), indent=1)
    w(' }')

    # Disabled: these methods are no longer generated per class.
    if 0:
        w('')
        w('@Override')
        w('public int size() { // this can now move up?')
        w(' return %d*(w+1);' % (len(stateMap2) - 1))
        w('}')

        w('')
        w('@Override')
        w('public int getPosition(int absState) { // this can now move up?')
        w(' return absState % (w+1);')
        w('}')

        w('')
        w('@Override')
        w('public boolean isAccept(int absState) { // this can now move up?')
        w(' // decode absState -> state, offset')
        w(' int state = absState/(w+1);')
        w(' if (true || state < minErrors.length) {')
        w(' int offset = absState%(w+1);')
        w(' assert offset >= 0;')
        w(' return w - offset + minErrors[state] <= %d;' % n)
        w(' } else {')
        w(' return false;')
        w(' }')
        w('}')

    if MODE == 'array' and PACKED:

        # we moved into super class
        if False:
            w('')

            v = 2
            l = []
            for i in range(63):
                l.append(hex(v - 1))
                v *= 2

            w('private final static long[] MASKS = new long[] {%s};' % ','.join(l), indent=1)
            w('')

            # unpack in java
            w('private int unpack(long[] data, int index, int bitsPerValue) {')
            w(' final long bitLoc = bitsPerValue * index;')
            w(' final int dataLoc = (int) (bitLoc >> %d);' % LOG2_WORD)
            w(' final int bitStart = (int) (bitLoc & %d);' % (WORD - 1))
            w(' //System.out.println("index=" + index + " dataLoc=" + dataLoc + " bitStart=" + bitStart + " bitsPerV=" + bitsPerValue);')
            w(' if (bitStart + bitsPerValue <= %d) {' % WORD)
            w(' // not split')
            w(' return (int) ((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]);')
            w(' } else {')
            w(' // split')
            w(' final int part = %d-bitStart;' % WORD)
            w(' return (int) (((data[dataLoc] >> bitStart) & MASKS[part-1]) +')
            w(' ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part));', indent=1)
            w(' }')
            w('}')

    # class
    w('}')
    w('')

    fileOut = '%s.java' % className

    s = str(w)
    for sub, repl in subs:
        s = s.replace(sub, repl)

    # Close the file before stat'ing it so the reported size is final.
    with open(fileOut, 'w', encoding='utf-8') as out:
        out.write(s)

    print('Wrote %s [%d lines; %.1f KB]' % \
          (fileOut, len(w.l), os.path.getsize(fileOut) / 1024.))


def renderList(l):
    """Render strings as a Java array initializer, four items per line."""
    lx = [' ']
    for i, item in enumerate(l):
        if i > 0:
            lx.append(',')
        if i % 4 == 0:
            lx.append('\n ')
        lx.append(item)
    return '{\n%s\n }' % ''.join(lx)


# MASKS[k] has the low k+1 bits set, for 1..63 bits.
MASKS = [(1 << bits) - 1 for bits in range(1, 64)]


# packs into longs; returns long[], numBits
def pack(l):
    """Bit-pack non-negative ints into WORD-bit words.

    Every value uses the same width: the bit length of the largest value
    (at least 1).  Values are laid out low bits first and may straddle two
    words.  Returns ``(packed_words, bitsPerValue)``.
    """
    maxV = max(l)
    # Exact integer bit count; float log ratios misround near powers of 2.
    bitsPerValue = max(1, int(maxV).bit_length())

    bitsLeft = WORD
    pendingValue = 0

    packed = []
    for v in l:
        if pendingValue > 0:
            bitsUsed = pendingValue.bit_length()
            assert bitsUsed <= (WORD - bitsLeft), 'bitsLeft=%s (%s-%s=%s) bitsUsed=%s' % (
                bitsLeft, WORD, bitsLeft, WORD - bitsLeft, bitsUsed)

        if bitsLeft >= bitsPerValue:
            pendingValue += v << (WORD - bitsLeft)
            bitsLeft -= bitsPerValue
            if bitsLeft == 0:
                packed.append(pendingValue)
                bitsLeft = WORD
                pendingValue = 0
        else:
            # split

            # bottom bitsLeft go in current word:
            pendingValue += (v & MASKS[bitsLeft - 1]) << (WORD - bitsLeft)
            packed.append(pendingValue)

            # remaining high bits start the next word
            pendingValue = v >> bitsLeft
            bitsLeft = WORD - (bitsPerValue - bitsLeft)

    if bitsLeft < WORD:
        packed.append(pendingValue)

    # Debug aid: verify(l, packed, bitsPerValue) round-trips the packing.
    return packed, bitsPerValue


def verify(data, packedData, bitsPerValue):
    """Assert that every value in ``data`` unpacks from ``packedData``."""
    for i, expected in enumerate(data):
        assert expected == unpack(packedData, i, bitsPerValue)


def unpack(data, index, bitsPerValue):
    """Return the ``index``-th ``bitsPerValue``-bit value from packed words."""
    bitLoc = bitsPerValue * index
    dataLoc = int(bitLoc >> LOG2_WORD)
    bitStart = int(bitLoc & (WORD - 1))
    if bitStart + bitsPerValue <= WORD:
        # not split
        return int(((data[dataLoc] >> bitStart) & MASKS[bitsPerValue - 1]))
    else:
        # split
        part = WORD - bitStart
        return int((((data[dataLoc] >> bitStart) & MASKS[part - 1]) +
                    ((data[1 + dataLoc] & MASKS[bitsPerValue - part - 1]) << part)))


if __name__ == '__main__':
    # The generator relies on its assert statements for sanity checks.
    if not __debug__:
        print()
        print('ERROR: please run without -O')
        print()
        sys.exit(1)
    main()