fis-gtm/sr_port/dfa_calc.c

479 lines
17 KiB
C

/****************************************************************
* *
* Copyright 2001, 2009 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "copy.h"
#include "patcode.h"
#include "compiler.h"
#include "gtm_string.h"
/* the following macro checks that a 1 dimensional array reference is valid i.e. array[index] is within defined limits */
#define check_1dim_array_bound(array, index) assert((index) < (SIZEOF(array) / SIZEOF(array[0])))
/* the following macro checks that a 2 dimensional array reference is valid i.e. array[row][col] is within defined limits */
#define check_2dim_array_bound(array, row, col) \
{ \
assert((row) < (SIZEOF(array) / SIZEOF(array[0]))); \
assert((col) < (SIZEOF(array[0]) / SIZEOF(array[0][0]))); \
}
/* Note: in various places, dfa_calc() makes a reference to the array 'typemask'. dfa_calc() is executed at compile-time.
* The content of the array typemask is static, but, at run-time, the pointer that is used to access the typemask array
* (pattern_typemask) may change whenever a program executes the command View "PATCODE":tablename.
* As a result, the pattern masks that the GT.M compiler uses may differ from the ones that are in operation at run-time.
*/
LITREF uint4 typemask[PATENTS];
static uint4 classmask[CHAR_CLASSES] =
{
PATM_N, PATM_P, PATM_L, PATM_U, PATM_C, PATM_B, PATM_D, PATM_F, PATM_G, PATM_H, PATM_I,
PATM_J, PATM_K, PATM_M, PATM_O, PATM_Q, PATM_R, PATM_S, PATM_T, PATM_V, PATM_W, PATM_X,
PATM_UTF8_ALPHABET, PATM_UTF8_NONBASIC
};
/* This procedure is part of the MUMPS compiler. The function of this procedure is to build the data structures that
* will be used to drive the DFA engine that can evaluate certain pattern matches. Note that this routine operates
* at compile-time, and that all data structures built in this procedure are compiled at the end into a terse string
* of values that will be passed to do_pattern (through patstr). do_pattern(), which operates at run-time will
* interpret this string of values and do the actual DFA work (DFA = Discrete Finite Automaton).
*/
int dfa_calc(struct leaf *leaves, int leaf_num, struct e_table *expand, uint4 **fstchar_ptr, uint4 **outchar_ptr)
{
uint4 *locoutchar;
uint4 pattern_mask;
unsigned char *textstring;
int offset[2 * (MAX_SYM + 1)];
int pos_offset[CHAR_CLASSES];
int fst[2][2], lst[2][2];
int4 charcls, maskcls, numexpand, count, clsnum, maxcls, clsposlis;
int4 state_num, node_num, sym_num, expseq, seq;
struct node nodes;
/* EdM: comment for reviewers:
* 'states' is currently defined as a boolean_t.
* In the original version it was a bool (== char).
* Since comparisons on this array are done using
* memcmp, and the only values assigned to elements
* in this array are TRUE and FALSE (1 and 0),
* we might consider declaring states as a 'char'
* array after all...
*/
boolean_t states[2 * MAX_SYM][CHAR_CLASSES];
boolean_t fpos[2 * MAX_SYM][CHAR_CLASSES];
int d_trans[2 * MAX_SYM][CHAR_CLASSES];
int pos_lis[2 * MAX_SYM][CHAR_CLASSES];
struct c_trns_tb c_trans;
/* Note: in various places, this procedure makes a reference to the array 'typemask'.
* This procedure is executed at compile-time. The contents of the array typemask is static, but, at
* run-time, the pointer that is used to access the array pattern_typemask may change whenever a program
* executes the command View "PATCODE":tablename. As a result, the pattern masks that the GT.M compiler
* uses may differ from the ones that are in operation at run-time.
*/
locoutchar = *outchar_ptr;
if (leaf_num > 1)
{
pattern_mask = PATM_DFA;
state_num = 1;
check_1dim_array_bound(leaves->nullable, leaf_num);
leaves->nullable[leaf_num] = FALSE;
leaves->letter[leaf_num][0] = DFABIT;
leaves->letter[leaf_num][1] = -1;
pos_offset[0] = 0;
for (seq = 1; seq < CHAR_CLASSES; seq++)
pos_offset[seq] = pos_offset[seq - 1] + expand->num_e[seq - 1];
memset(&nodes, 0, SIZEOF(nodes));
memset(&fpos[0][0], 0, SIZEOF(fpos));
memset(&states[0][0], 0, SIZEOF(states));
memset(&d_trans[0][0], 128, SIZEOF(d_trans));
memset(&pos_lis[0][0], 128, SIZEOF(pos_lis));
memset(&c_trans.c[0], 0, SIZEOF(c_trans.c));
memset(offset, 0, SIZEOF(offset));
memset(fst, 0, SIZEOF(fst));
memset(lst, 0, SIZEOF(lst));
charcls = 0;
clsnum = 0;
maxcls = 0;
nodes.nullable[0] = leaves->nullable[0] & leaves->nullable[1];
states[state_num][charcls] = TRUE;
for (maskcls = 0; leaves->letter[0][maskcls] >= 0; maskcls++)
{
check_1dim_array_bound(leaves->letter[0], maskcls);
if (!(leaves->letter[0][maskcls] & DFABIT))
{
check_2dim_array_bound(fpos, charcls, charcls + 1);
fpos[charcls][charcls + 1] = TRUE;
lst[FST][FST] = charcls;
lst[FST][LST] = charcls;
assert(leaves->letter[0][maskcls] >= 0 && leaves->letter[0][maskcls] < SIZEOF(typemask));
seq = patmaskseq(typemask[leaves->letter[0][maskcls]]);
if (seq < 0)
seq = 0;
for (numexpand = 1; expand->meta_c[seq][numexpand] != leaves->letter[0][maskcls]; numexpand++)
;
check_1dim_array_bound(pos_lis, (pos_offset[seq] + numexpand));
for (count = 0; pos_lis[pos_offset[seq] + numexpand][count] >= 0; count++)
;
check_2dim_array_bound(pos_lis, (pos_offset[seq] + numexpand), count);
pos_lis[pos_offset[seq] + numexpand][count] = charcls;
charcls++;
} else
{
seq = patmaskseq(leaves->letter[0][maskcls]);
if (seq < 0)
{
seq = 0;
expseq = 0;
} else
expseq = expand->num_e[seq];
for (numexpand = 0; numexpand < expseq; numexpand++)
{
states[state_num][charcls] = TRUE;
fst[FST][LST] = charcls;
lst[FST][LST] = charcls;
for (count = 0; pos_lis[pos_offset[seq] + numexpand][count] >= 0; count++)
;
check_2dim_array_bound(pos_lis, (pos_offset[seq] + numexpand), count);
pos_lis[pos_offset[seq] + numexpand][count] = charcls;
charcls++;
}
}
}
fst[LST][FST] = charcls;
fst[LST][LST] = charcls;
lst[LST][FST] = charcls;
if(!leaves->nullable[1])
{
nodes.last[0][charcls] = TRUE;
maxcls = charcls;
}
for (maskcls = 0; leaves->letter[1][maskcls] >= 0; maskcls++)
{
check_1dim_array_bound(leaves->letter[1], maskcls);
if (!(leaves->letter[1][maskcls] & DFABIT))
{
check_2dim_array_bound(fpos, charcls, charcls + 1);
fpos[charcls][charcls + 1] = TRUE;
lst[LST][FST] = charcls;
lst[LST][LST] = charcls;
assert(leaves->letter[1][maskcls] >= 0 && leaves->letter[1][maskcls] < SIZEOF(typemask));
seq = patmaskseq(typemask[leaves->letter[1][maskcls]]);
if (seq < 0)
seq = 0;
for (numexpand = 1; expand->meta_c[seq][numexpand] != leaves->letter[1][maskcls]; numexpand++)
;
check_1dim_array_bound(pos_lis, (pos_offset[seq] + numexpand));
for (count = 0; pos_lis[pos_offset[seq] + numexpand][count] >= 0; count++)
;
check_2dim_array_bound(pos_lis, (pos_offset[seq] + numexpand), count);
pos_lis[pos_offset[seq] + numexpand][count] = charcls;
charcls++;
} else
{
seq = patmaskseq(leaves->letter[1][maskcls]);
if (seq < 0)
{
seq = 0;
expseq = 0;
} else
expseq = expand->num_e[seq];
for (numexpand = 0; numexpand < expseq; numexpand++)
{
nodes.last[0][charcls] = TRUE;
fst[LST][LST] = charcls;
lst[LST][LST] = charcls;
check_1dim_array_bound(pos_lis, (pos_offset[seq] + numexpand));
for (count = 0; pos_lis[pos_offset[seq] + numexpand][count] >= 0; count++)
;
check_2dim_array_bound(pos_lis, (pos_offset[seq] + numexpand), count);
pos_lis[pos_offset[seq] + numexpand][count] = charcls;
charcls++;
}
}
}
if (leaves->nullable[0])
{
assert((2 * MAX_SYM) > lst[FST][LST]);
assert(CHAR_CLASSES > fst[LST][LST]);
for (numexpand = lst[FST][FST]; numexpand <= lst[FST][LST]; numexpand++)
{
for (count = fst[FST][FST]; count <= fst[FST][LST]; count++)
{
check_2dim_array_bound(fpos, numexpand, count);
fpos[numexpand][count] = TRUE;
}
}
for (numexpand = fst[LST][FST]; numexpand <= fst[LST][LST]; numexpand++)
states[state_num][numexpand] = TRUE;
}
if (leaves->nullable[1])
{
nodes.last[0][charcls - 1] = TRUE;
for (numexpand = lst[LST][FST]; numexpand <= lst[LST][LST]; numexpand++)
{
for (count = fst[LST][FST]; count <= fst[LST][LST]; count++)
{
check_2dim_array_bound(fpos, numexpand, count);
fpos[numexpand][count] = TRUE;
}
}
for (numexpand = lst[FST][FST]; numexpand <= lst[FST][LST]; numexpand++)
nodes.last[0][numexpand] = TRUE;
maxcls = charcls;
}
for (numexpand = lst[FST][FST]; numexpand <= lst[FST][LST]; numexpand++)
{
for (count = fst[LST][FST]; count <= fst[LST][LST]; count++)
{
check_2dim_array_bound(fpos, numexpand, count);
fpos[numexpand][count] = TRUE;
}
}
if (!leaves->nullable[1])
clsnum = lst[LST][FST];
for (node_num = 1; node_num < leaf_num; node_num++)
{
nodes.nullable[node_num] = nodes.nullable[node_num - 1] &
leaves->nullable[node_num + 1];
if (leaves->nullable[node_num + 1])
{
for (maskcls = 0; maskcls < charcls; maskcls++)
{
check_2dim_array_bound(nodes.last, node_num, maskcls);
nodes.last[node_num][maskcls] = nodes.last[node_num - 1][maskcls];
}
} else
{
nodes.last[node_num][charcls] = TRUE;
maxcls = charcls;
}
fst[LST][FST] = charcls;
fst[LST][LST] = charcls;
lst[LST][FST] = charcls;
for (maskcls = 0; leaves->letter[node_num + 1][maskcls] >= 0; maskcls++)
{
check_1dim_array_bound(leaves->letter[node_num + 1], maskcls);
if (!(leaves->letter[node_num + 1][maskcls] & DFABIT))
{
check_2dim_array_bound(fpos, charcls, charcls + 1);
fpos[charcls][charcls + 1] = TRUE;
lst[LST][FST] = charcls;
lst[LST][LST] = charcls;
assert(leaves->letter[node_num + 1][maskcls] >= 0 &&
leaves->letter[node_num + 1][maskcls] < SIZEOF(typemask));
seq = patmaskseq(typemask[leaves->letter[node_num + 1][maskcls]]);
if (seq < 0)
seq = 0;
for (numexpand = 1;
expand->meta_c[seq][numexpand] != leaves->letter[node_num + 1][maskcls];
numexpand++)
;
check_1dim_array_bound(pos_lis, (pos_offset[seq] + numexpand));
for (count = 0; pos_lis[pos_offset[seq] + numexpand][count] >= 0; count++)
;
check_2dim_array_bound(pos_lis, (pos_offset[seq] + numexpand), count);
pos_lis[pos_offset[seq] + numexpand][count] = charcls;
charcls++;
} else
{
seq = patmaskseq(leaves->letter[node_num + 1][maskcls]);
if (seq < 0)
{
seq = 0;
expseq = 0;
} else
expseq = expand->num_e[seq];
for (numexpand = 0; numexpand < expseq; numexpand++)
{
nodes.last[node_num][charcls] = TRUE;
if (nodes.nullable[node_num - 1])
states[state_num][charcls] = TRUE;
fst[LST][LST] = charcls;
lst[LST][LST] = charcls;
for (count = 0; pos_lis[pos_offset[seq] + numexpand][count] >= 0; count++)
;
check_2dim_array_bound(pos_lis, (pos_offset[seq] + numexpand), count);
pos_lis[pos_offset[seq] + numexpand][count] = charcls;
charcls++;
}
}
}
if (nodes.nullable[node_num - 1])
{
for (numexpand = fst[LST][FST]; numexpand <= fst[LST][LST]; numexpand++)
{
check_1dim_array_bound(states[state_num], numexpand);
states[state_num][numexpand] = TRUE;
}
}
if (leaves->nullable[node_num + 1])
{
nodes.last[node_num][charcls - 1] = TRUE;
for (numexpand = lst[LST][FST]; numexpand <= lst[LST][LST]; numexpand++)
{
for (count = fst[LST][FST]; count <= fst[LST][LST]; count++)
{
check_2dim_array_bound(fpos, numexpand, count);
fpos[numexpand][count] = TRUE;
}
}
maxcls = charcls;
}
for (numexpand = clsnum; numexpand < maxcls; numexpand++)
for (count = fst[LST][FST]; count <= fst[LST][LST]; count++)
if (nodes.last[node_num - 1][numexpand])
fpos[numexpand][count] = TRUE;
if (!leaves->nullable[node_num + 1])
clsnum = lst[LST][FST];
}
sym_num = charcls;
state_num++;
check_1dim_array_bound(offset, state_num + 1);
for (seq = 1; seq < state_num; seq++)
{
charcls = 0;
offset[seq + 1]++;
offset[seq + 1] += offset[seq];
for (maskcls = 0; maskcls < CHAR_CLASSES; maskcls++)
{
if (expand->num_e[maskcls] > 0)
{
for (numexpand = 0; numexpand < expand->num_e[maskcls]; numexpand++)
{
for (maxcls = 0; pos_lis[charcls + numexpand][maxcls] >= 0; maxcls++)
{
clsposlis = pos_lis[charcls + numexpand][maxcls];
if (states[seq][clsposlis])
{
for (clsnum = 0; clsnum <= sym_num; clsnum++)
{
check_1dim_array_bound(states[state_num], clsnum);
states[state_num][clsnum] |= fpos[clsposlis][clsnum];
}
}
}
check_1dim_array_bound(states, state_num);
for (count = 0;
memcmp(states[count], states[state_num], (sym_num + 1) * SIZEOF(boolean_t))
&& (count < state_num);
count++)
;
if (count > 0)
{
if (0 == numexpand)
{
d_trans[seq][charcls] = count;
for (clsnum = 0;
(clsnum < c_trans.c[seq])
&& (c_trans.trns[seq][clsnum] != count);
clsnum++)
;
check_1dim_array_bound(c_trans.p_msk[seq], clsnum);
if (clsnum == c_trans.c[seq])
{
c_trans.p_msk[seq][clsnum] = classmask[maskcls];
check_1dim_array_bound(c_trans.trns[seq], clsnum);
c_trans.trns[seq][clsnum] = count;
offset[seq + 1] += 2;
c_trans.c[seq]++;
} else
c_trans.p_msk[seq][clsnum] |= classmask[maskcls];
} else if (d_trans[seq][charcls] != count)
{
d_trans[seq][charcls + numexpand] = count;
offset[seq + 1] += 3;
}
if (count == state_num)
state_num++;
else
memset(states[state_num], 0, (sym_num + 1) * SIZEOF(states[0][0]));
}
}
charcls += expand->num_e[maskcls];
}
}
}
*outchar_ptr += offset[state_num] + 2;
if ((*outchar_ptr - *fstchar_ptr > MAX_DFA_SPACE) ||
((offset[state_num] + 1) > (MAX_PATTERN_LENGTH / 2)))
return -1;
*locoutchar++ = PATM_DFA;
*locoutchar++ = offset[state_num];
for (seq = 1; seq < state_num; seq++)
{
charcls = 0;
for (numexpand = 0; numexpand < CHAR_CLASSES; numexpand++)
{
if (expand->num_e[numexpand] > 1)
{
for (count = 1; count < expand->num_e[numexpand]; count++)
{
check_2dim_array_bound(d_trans, seq, charcls + count);
if (d_trans[seq][charcls + count] >= 0)
{
*locoutchar++ = PATM_STRLIT;
*locoutchar++ = expand->meta_c[numexpand][count];
*locoutchar++ = offset[d_trans[seq][charcls + count]];
}
}
}
charcls += expand->num_e[numexpand];
}
for (numexpand = 0; numexpand < c_trans.c[seq]; numexpand++)
{
check_2dim_array_bound(c_trans.p_msk, seq, numexpand);
*locoutchar++ = c_trans.p_msk[seq][numexpand];
check_2dim_array_bound(c_trans.trns, seq, numexpand);
check_1dim_array_bound(offset, c_trans.trns[seq][numexpand]);
*locoutchar++ = offset[c_trans.trns[seq][numexpand]];
}
*locoutchar++ = (states[seq][sym_num]) ? PATM_ACS : PATM_DFA;
}
assert(MAX_DFA_SPACE >= (locoutchar - *fstchar_ptr));
return 1;
} else
{
pattern_mask = 0;
*outchar_ptr += 1;
maskcls = 1;
if (!(leaves->letter[0][0] & DFABIT))
{
pattern_mask = PATM_STRLIT;
for (maskcls = 0; leaves->letter[0][maskcls] >= 0; maskcls++)
;
check_1dim_array_bound(leaves->letter[0], maskcls);
*outchar_ptr += PAT_STRLIT_PADDING + ((maskcls + SIZEOF(uint4) - 1) / SIZEOF(uint4));
} else
{
for (numexpand = 0; leaves->letter[0][numexpand] >= 0; numexpand++)
pattern_mask |= leaves->letter[0][numexpand];
check_1dim_array_bound(leaves->letter[0], numexpand);
}
if (*outchar_ptr - *fstchar_ptr > MAX_PATTERN_LENGTH)
return -1;
*locoutchar++ = pattern_mask;
if (PATM_STRLIT & pattern_mask)
{
*locoutchar++ = maskcls; /* bytelen */
*locoutchar++ = maskcls; /* charlen */
*locoutchar++ = 0; /* both NONASCII and BADCHAR flags are absent since this is
indeed a valid ASCII string or else we would not have come
to dfa_calc (and hence there are no bad chars) */
assert(3 == PAT_STRLIT_PADDING);
textstring = (unsigned char *)locoutchar; /* change pointer type */
for (numexpand = 0; numexpand < maskcls; numexpand++)
*textstring++ = leaves->letter[0][numexpand];
}
return maskcls;
}
}