lucene/gradle/generation/moman/createLevAutomata.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

585 lines
20 KiB
Python
Raw Normal View History

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Note, this file is known to work with rev 120 of the moman
# repository (http://bitbucket.org/jpbarrette/moman/overview)
#
# See also: http://sites.google.com/site/rrettesite/moman
import math
import os
import sys
# How the generated transition() method looks up the next state:
# 'array' emits lookup tables, any other value emits nested switch statements.
MODE = 'array'
# With MODE == 'array': bit-pack the tables into long[] rather than emitting
# plain int[] arrays.
PACKED = True
# Width in bits of one packed word.
WORD = 64
# Shift amount that converts a bit position into a word index. Computed with
# exact integer arithmetic; truncating a floating-point log ratio is one
# rounding error away from being off by one.
LOG2_WORD = WORD.bit_length() - 1
# Java block comment that main() writes ahead of the class javadoc, but only
# for the non-transposed degree-1 automaton (``not transpose and n == 1``).
# It explains how the toStates/offsetIncrs tables encode the parametric
# transitions. The text is emitted verbatim into the generated file, so any
# edit here changes the generated output.
HEADER_COMMENT = '''/*
Parametric transitions for LEV1.
char vector State 0 State 1 State 2 State 3 State 4
(0,0) (2, 0) (-1, 0) (-1, 0) (-1, 0) (-1, 0)
(0,1) (3, 0) (-1, 0) (1, 2) (1, 2) (-1, 0)
(1,0) (0, 1) (1, 1) (1, 1) (1, 1) (1, 1)
(1,1) (0, 1) (1, 1) (2, 1) (2, 1) (1, 1)
char vector is the characteristic vectors in the paper.
entry (i,j) in the table means next transitions state is i, next offset is j + currentOffset if we meet the according char vector.
When i = -1,it means an empty state.
We store this table in toState and offsetIncr.
toState = [ i+1 | for entry in entries].
offsetIncrs = [j | for entry in entries].
*/'''
# Per-machine Java comments describing the LEV1 transition tables, one per
# position case handled by the generated transition() method. main() emits
# STATE_COMMENT[i] above the i-th pair of table arrays unless
# ``transpose or n == 2``, in which case it writes a one-line size summary
# instead. The strings are emitted verbatim into the generated file.
# NOTE(review): the tables below all describe LEV1; presumably this path is
# only meaningful for n == 1 -- confirm before generating other degrees.
STATE0_COMMENT = '''/*
* 1 vectors; 2 states per vector; array length = 2
* Parametric transitions for LEV1 (position = w)
*
* char vector State 0 State 1
*
* () (1, 0) (-1, 0)
*
*/'''
STATE1_COMMENT = '''/*
* 2 vectors; 3 states per vector; array length = 6
* Parametric transitions for LEV1 (position = w-1)
*
* char vector State 0 State 1 State 2
*
* (0) (2, 0) (-1, 0) (-1, 0)
* (1) (0, 1) (1, 1) (1, 1)
*
*/'''
STATE2_COMMENT = '''/*
* 4 vectors; 5 states per vector; array length = 20
* Parametric transitions for LEV1 ( position == w-2 )
*
* char vector State 0 State 1 State 2 State 3 State 4
*
* (0,0) (2, 0) (-1, 0) (-1, 0) (-1, 0) (-1, 0)
* (0,1) (3, 0) (-1, 0) (1, 2) (1, 2) (-1, 0)
* (1,0) (0, 1) (1, 1) (1, 1) (1, 1) (1, 1)
* (1,1) (0, 1) (1, 1) (2, 1) (2, 1) (1, 1)
*
*/'''
STATE3_COMMENT = '''/*
* 8 vectors; 5 states per vector; array length = 40
* Parametric transitions for LEV1 (0 <= position <= w-3 )
*
* char vector State 0 State 1 State 2 State 3 State 4
*
* (0,0,0) (2, 0) (-1, 0) (-1, 0) (-1, 0) (-1, 0)
* (0,0,1) (2, 0) (-1, 0) (-1, 0) (1, 3) (1, 3)
* (0,1,0) (3, 0) (-1, 0) (1, 2) (1, 2) (-1, 0)
* (0,1,1) (3, 0) (-1, 0) (1, 2) (2, 2) (1, 3)
* (1,0,0) (0, 1) (1, 1) (1, 1) (1, 1) (1, 1)
* (1,0,1) (0, 1) (1, 1) (1, 1) (4, 1) (4, 1)
* (1,1,0) (0, 1) (1, 1) (2, 1) (2, 1) (1, 1)
* (1,1,1) (0, 1) (1, 1) (2, 1) (3, 1) (4, 1)
*
*/'''
# Indexed by machine number, in the order main() renders the machines.
STATE_COMMENT = [STATE0_COMMENT, STATE1_COMMENT, STATE2_COMMENT, STATE3_COMMENT]
# Alternative rendering mode, kept for reference (disabled):
# MODE = 'switch'
class LineOutput:
    """Collects lines of generated source and indents them by brace nesting.

    Calling the instance appends one line. A line containing ``}`` is
    outdented before it is written and a line containing ``{`` indents every
    line after it. Leading whitespace of the given text is stripped unless
    the line is part of a ``/* ... */`` block comment spanning several calls,
    in which case it is kept verbatim.

    Attributes:
        l: the lines emitted so far, in order.
        startIndent: the indentation prefix the instance started with.
        inComment: True while inside a multi-call block comment.
    """

    # One nesting level. Outdenting strips two characters and the explicit
    # ``indent`` argument of __call__ turns the current prefix into a level
    # count by dividing its length by two, so the unit has to be exactly two
    # characters wide or nested braces trip the outdent assertion.
    _UNIT = '  '

    def __init__(self, indent=''):
        self.l = []
        self._indent = self.startIndent = indent
        self.inComment = False

    def __call__(self, s, indent=0):
        """Append ``s`` as a new line.

        Args:
            s: text of the line (it may itself contain newlines).
            indent: when non-zero, the line is written at the current
                nesting level plus this many extra levels instead of at the
                current prefix.

        Raises:
            AssertionError: if ``s`` contains ``}`` while already at the
                starting indentation.
        """
        if '}' in s:
            self.outdent()

        if indent != 0:
            prefix = self._UNIT * (len(self._indent) // len(self._UNIT) + indent)
        else:
            prefix = self._indent

        opens = '/*' in s
        closes = '*/' in s
        # A line that only opens or only closes a block comment belongs to a
        # multi-line comment; a line that does both (or neither) leaves the
        # state alone.
        if opens != closes:
            self.inComment = True

        # Inside a comment the caller's own leading spaces are significant.
        self.l.append(prefix + (s if self.inComment else s.lstrip()))

        # The comment ends with the line that carries the terminator.
        self.inComment = self.inComment and not closes

        if '{' in s:
            self.indent()

    def __str__(self):
        """Return all lines joined by newlines; braces must be balanced."""
        assert self._indent == self.startIndent, 'indent %d vs start indent %d' % \
            (len(self._indent), len(self.startIndent))
        return '\n'.join(self.l)

    def indent(self):
        """Increase the indentation by one level."""
        self._indent += self._UNIT

    def outdent(self):
        """Decrease the indentation by one level.

        Raises:
            AssertionError: if already at the starting indentation.
        """
        assert self._indent != self.startIndent
        self._indent = self._indent[:-len(self._UNIT)]
def charVarNumber(charVar):
    """
    Reads a sequence of binary digits, most significant first, as an
    integer (eg [1, 0, 1] gives 5). An empty sequence gives 0.
    """
    value = 0
    for digit in charVar:
        # Shift what has been read so far one binary place and add the digit.
        value = 2 * value + int(digit)
    return value
def main():
    """Generate the Java parametric-description class for one automaton.

    Reads three command-line arguments from ``sys.argv``:

    * ``N`` -- the degree passed to moman's ``genTransitions``.
    * ``True``/``False`` -- whether transpositions count as primitive edits
      (only the exact string ``True`` enables them).
    * the path of the moman checkout, which is prepended to ``sys.path``.

    Writes ``Lev<N>ParametricDescription.java`` (or
    ``Lev<N>TParametricDescription.java`` with transpositions) to the current
    working directory and prints a one-line summary. Prints usage and exits
    with status 1 when the argument count is wrong.
    """
    if len(sys.argv) != 4:
        print()
        print('Usage: python -u %s N <True/False> path_to_moman_dir' % sys.argv[0])
        print()
        print('NOTE: the resulting .java file is created in the current working dir!')
        print()
        sys.exit(1)

    n = int(sys.argv[1])
    transpose = (sys.argv[2] == "True")

    # moman is imported straight from the checkout given on the command line;
    # the module lives either at the top level or inside the finenight package.
    sys.path.insert(0, sys.argv[3])
    try:
        from possibleStates import genTransitions
    except ImportError:
        from finenight.possibleStates import genTransitions

    tables = genTransitions(n, transpose)

    # String form of a moman state -> dense integer id.
    stateMap = {}
    # init null state
    stateMap['[]'] = -1
    # init start state
    stateMap['[(0, 0)]'] = 0

    w = LineOutput()

    w('/*')
    w(' * Licensed to the Apache Software Foundation (ASF) under one or more')
    w(' * contributor license agreements. See the NOTICE file distributed with')
    w(' * this work for additional information regarding copyright ownership.')
    w(' * The ASF licenses this file to You under the Apache License, Version 2.0')
    w(' * (the "License"); you may not use this file except in compliance with')
    w(' * the License. You may obtain a copy of the License at')
    w(' *')
    w(' * http://www.apache.org/licenses/LICENSE-2.0')
    w(' *')
    w(' * Unless required by applicable law or agreed to in writing, software')
    w(' * distributed under the License is distributed on an "AS IS" BASIS,')
    w(' * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.')
    w(' * See the License for the specific language governing permissions and')
    w(' * limitations under the License.')
    w(' */')
    w('package org.apache.lucene.util.automaton;')
    w('')
    w('// The following code was generated with the moman/finenight pkg')
    w('// This package is available under the MIT License, see NOTICE.txt')
    w('// for more details.')
    w('// This source file is auto-generated, Please do not modify it directly.')
    # NOTE(review): the file name in the next line may not match this
    # script's actual name; it is emitted verbatim into the generated
    # sources, so it is deliberately left unchanged here.
    w('// You should modify the gradle/generation/moman/createAutomata.py instead.')
    w('')
    w('import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;')
    w('')
    if not transpose and n == 1:
        w(HEADER_COMMENT)
        w('')

    if transpose:
        w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n)
        w(' with transpositions as primitive edits.')
        className = 'Lev%dTParametricDescription' % n
    else:
        w('/** Parametric description for generating a Levenshtein automaton of degree %s.' % n)
        className = 'Lev%dParametricDescription' % n
    if not transpose and n == 1:
        w('*/')
    else:
        w(' The comment in Lev1ParametricDescription may be helpful for you to understand this class.')
        w(' @see Lev1ParametricDescription')
        w('*/')

    w('class %s extends ParametricDescription {' % className)
    w('')
    w('@Override')
    w('int transition(int absState, int position, int vector) {')
    w(' // null absState should never be passed in')
    w(' assert absState != -1;')
    w('')
    w(' // decode absState -> state, offset')
    w(' int state = absState/(w+1);')
    w(' int offset = absState%(w+1);')
    w(' assert offset >= 0;')
    w('')

    # One entry per table: (toStates, offsetIncrs, states per vector, vectors).
    machines = []

    for i, table in enumerate(tables):
        # The tables become one if / else-if chain on the position.
        if i == 0:
            w('if (position == w) {')
        elif i == len(tables) - 1:
            w('} else {')
        else:
            w('} else if (position == w-%d) {' % i)

        if i != 0 and MODE == 'switch':
            w('switch(vector) {')

        vectors = sorted(table.items())

        numCasesPerVector = None
        numVectors = len(vectors)

        # Always defined so that machines.append() below also works when
        # MODE is not 'array'.
        toStateArray = []
        toOffsetIncrArray = []

        for charVar, states in vectors:
            # somehow it's a string:
            # NOTE: eval() is applied to data produced by the moman library;
            # only run this script against a moman checkout you trust.
            charVar = eval(charVar)

            if i != 0 and MODE == 'switch':
                w('case %s: // <%s>' % (charVarNumber(charVar), ','.join([str(x) for x in charVar])))
                w.indent()

            transitions = sorted(states.items())

            byFromState = {}

            # first pass to assign states
            byAction = {}
            for s, (toS, offset) in transitions:
                state = str(s)
                toState = str(toS)
                if state not in stateMap:
                    stateMap[state] = len(stateMap) - 1
                if toState not in stateMap:
                    stateMap[toState] = len(stateMap) - 1

                # Target ids are stored shifted by one so that the null
                # state (-1) becomes 0; the generated code subtracts it again.
                byFromState[stateMap[state]] = (1 + stateMap[toState], offset)

                fromStateDesc = s[1:len(s) - 1]
                toStateDesc = ', '.join([str(x) for x in toS])

                tup = (stateMap[toState], toStateDesc, offset)
                if tup not in byAction:
                    byAction[tup] = []
                byAction[tup].append((fromStateDesc, stateMap[state]))

            if numCasesPerVector is None:
                numCasesPerVector = len(transitions)
            else:
                # we require this to be uniform... empirically it seems to be!
                assert numCasesPerVector == len(transitions)

            if MODE == 'array':
                for fromState in range(numCasesPerVector):
                    nextState, offsetIncr = byFromState[fromState]
                    toStateArray.append(nextState)
                    toOffsetIncrArray.append(offsetIncr)
            else:
                # render switches
                w('switch(state) { // %s cases' % len(transitions))

                for (toState, toStateDesc, offset), lx in byAction.items():
                    # All source states sharing an action fall through to it.
                    for fromStateDesc, fromState in lx:
                        w('case %s: // %s' % (fromState, fromStateDesc))
                    w.indent()
                    w(' state = %s; // %s' % (toState, toStateDesc))
                    if offset > 0:
                        w(' offset += %s;' % offset)
                    w('break;')
                    w.outdent()

                w('}')
                if i != 0:
                    w('break;')
                    w.outdent()

        if MODE == 'array':
            # strangely state can come in wildly out of bounds....
            w(' if (state < %d) {' % numCasesPerVector)
            w(' final int loc = vector * %d + state;' % numCasesPerVector)
            if PACKED:
                # The NBITS* placeholders are substituted once the bit
                # widths are known, just before the file is written.
                w(' offset += unpack(offsetIncrs%d, loc, NBITSOFFSET%d);' % (i, i))
                w(' state = unpack(toStates%d, loc, NBITSSTATES%d)-1;' % (i, i))
            else:
                w(' offset += offsetIncrs%d[loc];' % i)
                w(' state = toStates%d[loc]-1;' % i)
            w(' }')
        elif i != 0:
            w('}')

        machines.append((toStateArray, toOffsetIncrArray, numCasesPerVector, numVectors))

    # ends switch statement for machine
    w('}')
    w('')
    w(' if (state == -1) {')
    w(' // null state')
    w(' return -1;')
    w(' } else {')
    w(' // translate back to abs')
    w(' return state*(w+1)+offset;')
    w(' }')

    # ends transition method
    w('}')

    # (placeholder, replacement) pairs applied to the rendered text.
    subs = []
    if MODE == 'array':
        w.indent()
        for i, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines):
            w('')
            # The describing comment is written one level out from the arrays.
            w.outdent()
            if transpose or n == 2:
                w('// %d vectors; %d states per vector; array length = %d' % \
                  (numVectors, numCasesPerVector, numVectors * numCasesPerVector))
            else:
                w(STATE_COMMENT[i])
            w.indent()
            if PACKED:
                # pack in python
                words, nbits = pack(toStateArray)
                subs.append(('NBITSSTATES%d' % i, str(nbits)))
                w(' private final static long[] toStates%d = new long[] /*%d bits per value */ %s;' % \
                  (i, nbits, renderList([(hex(int(x)) + "L") for x in words])))
                words, nbits = pack(toOffsetIncrsArray)
                subs.append(('NBITSOFFSET%d' % i, str(nbits)))
                w(' private final static long[] offsetIncrs%d = new long[] /*%d bits per value */ %s;' % \
                  (i, nbits, renderList([(hex(int(x)) + "L") for x in words])))
            else:
                w(' private final static int[] toStates%d = new int[] %s;' % \
                  (i, renderList([str(x) for x in toStateArray])))
                # Render the offset increments here; this line used to
                # repeat the toStates data by mistake.
                w(' private final static int[] offsetIncrs%d = new int[] %s;' % \
                  (i, renderList([str(x) for x in toOffsetIncrsArray])))
        w.outdent()

    # Integer id -> string form of the state.
    stateMap2 = {v: k for k, v in stateMap.items()}
    w('')
    w('// state map')
    minErrors = []
    # Ids run from 0 upward; the extra entry in the map is the null state (-1).
    for i in range(len(stateMap2) - 1):
        w('// %s -> %s' % (i, stateMap2[i]))
        # we replace t-notation as it's not relevant here
        st = stateMap2[i].replace('t', '')
        # NOTE: eval() on moman-produced text, see the note above.
        positions = eval(st)
        minErrors.append(min([-pos + e for pos, e in positions]))
    w('')

    w('')
    w(' public %s(int w) {' % className)
    w(' super(w, %d, new int[] {%s});' % (n, ','.join([str(x) for x in minErrors])), indent=1)
    w(' }')

    # class
    w('}')
    w('')

    fileOut = '%s.java' % className

    s = str(w)
    for sub, repl in subs:
        s = s.replace(sub, repl)

    # Close the file before measuring its size below.
    with open(fileOut, 'w', encoding='utf-8') as out:
        out.write(s)

    print('Wrote %s [%d lines; %.1f KB]' % \
          (fileOut, len(w.l), os.path.getsize(fileOut) / 1024.))
def renderList(l):
    """Render a list of strings as a brace-delimited, comma-separated literal.

    A line break is inserted before every fourth item, starting with the
    first one. The items must already be strings.
    """
    pieces = [' ']
    for position, item in enumerate(l):
        if position:
            pieces.append(',')
        if not position % 4:
            pieces.append('\n ')
        pieces.append(item)
    body = ''.join(pieces)
    return '{\n%s\n }' % body
# MASKS[k] has the k+1 lowest bits set, for k = 0..62; pack() and unpack()
# use it to isolate bit fields. Built with a comprehension so that no loop
# temporaries are left behind in the module namespace.
MASKS = [(1 << nbits) - 1 for nbits in range(1, 64)]
# packs into longs; returns long[], numBits
def pack(l):
    """Pack a list of non-negative ints into WORD-bit words.

    Every value takes the same number of bits, the smallest width (at least
    one) that holds the largest value. Values are laid out from the least
    significant bit upward and may straddle two consecutive words.

    Args:
        l: non-empty list of non-negative ints.

    Returns:
        A ``(packed, bitsPerValue)`` tuple: the list of words and the width
        used for each value.

    Raises:
        ValueError: if ``l`` is empty (raised by ``max``).
    """
    # int.bit_length() is exact; ceil(log(x) / log(2)) can round up by one
    # when x is an exact power of two and waste a bit per value.
    bitsPerValue = max(1, max(l).bit_length())

    bitsLeft = WORD       # free bits remaining in the word being filled
    pendingValue = 0      # the word being filled
    packed = []

    for v in l:
        # Whatever has been accumulated must fit in the bits consumed so far.
        bitsUsed = pendingValue.bit_length()
        assert bitsUsed <= (WORD - bitsLeft), 'bitsLeft=%s (%s-%s=%s) bitsUsed=%s' % (bitsLeft, WORD, bitsLeft, WORD - bitsLeft, bitsUsed)

        if bitsLeft >= bitsPerValue:
            pendingValue += v << (WORD - bitsLeft)
            bitsLeft -= bitsPerValue
            if bitsLeft == 0:
                packed.append(pendingValue)
                bitsLeft = WORD
                pendingValue = 0
        else:
            # split
            # bottom bitsLeft go in current word:
            pendingValue += (v & MASKS[bitsLeft - 1]) << (WORD - bitsLeft)
            packed.append(pendingValue)
            # the remaining high bits start the next word:
            pendingValue = v >> bitsLeft
            bitsLeft = WORD - (bitsPerValue - bitsLeft)

    # Flush a partially filled final word.
    if bitsLeft < WORD:
        packed.append(pendingValue)

    # verify(l, packed, bitsPerValue)
    return packed, bitsPerValue
def verify(data, packedData, bitsPerValue):
    """Assert that every value of ``data`` can be read back from ``packedData``.

    Raises:
        AssertionError: at the first position where unpack() disagrees.
    """
    for position, expected in enumerate(data):
        assert expected == unpack(packedData, position, bitsPerValue)
def unpack(data, index, bitsPerValue):
    """Read the ``index``-th ``bitsPerValue``-wide field from packed words.

    Args:
        data: list of WORD-bit words.
        index: zero-based position of the value to read.
        bitsPerValue: width in bits of every stored value.

    Returns:
        The stored value as an int.
    """
    firstBit = bitsPerValue * index
    wordIndex = int(firstBit >> LOG2_WORD)
    shift = int(firstBit & (WORD - 1))
    lowBits = data[wordIndex] >> shift

    if shift + bitsPerValue <= WORD:
        # not split
        return int(lowBits & MASKS[bitsPerValue - 1])

    # split: the low part sits at the top of this word, the rest at the
    # bottom of the next one
    lowWidth = WORD - shift
    highBits = data[1 + wordIndex] & MASKS[bitsPerValue - lowWidth - 1]
    return int((lowBits & MASKS[lowWidth - 1]) + (highBits << lowWidth))
if __name__ == '__main__':
    # __debug__ is False under -O, which strips assert statements; this
    # script uses asserts as sanity checks, so it refuses to run that way.
    if __debug__:
        main()
    else:
        print()
        print('ERROR: please run without -O')
        print()
        sys.exit(1)